1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoadExtract(Instruction &I);
133 bool scalarizeExtExtract(Instruction &I);
134 bool foldConcatOfBoolMasks(Instruction &I);
135 bool foldPermuteOfBinops(Instruction &I);
136 bool foldShuffleOfBinops(Instruction &I);
137 bool foldShuffleOfSelects(Instruction &I);
138 bool foldShuffleOfCastops(Instruction &I);
139 bool foldShuffleOfShuffles(Instruction &I);
140 bool foldShuffleOfIntrinsics(Instruction &I);
141 bool foldShuffleToIdentity(Instruction &I);
142 bool foldShuffleFromReductions(Instruction &I);
143 bool foldShuffleChainsToReduce(Instruction &I);
144 bool foldCastFromReductions(Instruction &I);
145 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
146 bool foldInterleaveIntrinsics(Instruction &I);
147 bool shrinkType(Instruction &I);
148 bool shrinkLoadForShuffles(Instruction &I);
149 bool shrinkPhiOfShuffles(Instruction &I);
150
151 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
152 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
153 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
154 Old.replaceAllUsesWith(&New);
155 if (auto *NewI = dyn_cast<Instruction>(&New)) {
156 New.takeName(&Old);
157 Worklist.pushUsersToWorkList(*NewI);
158 Worklist.pushValue(NewI);
159 }
160 if (Erase && isInstructionTriviallyDead(&Old)) {
161 eraseInstruction(Old);
162 } else {
163 Worklist.push(&Old);
164 }
165 }
166
167 void eraseInstruction(Instruction &I) {
168 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
169 SmallVector<Value *> Ops(I.operands());
170 Worklist.remove(&I);
171 I.eraseFromParent();
172
173 // Push remaining users of the operands and then the operand itself - allows
174 // further folds that were hindered by OneUse limits.
175 SmallPtrSet<Value *, 4> Visited;
176 for (Value *Op : Ops) {
177 if (!Visited.contains(Op)) {
178 if (auto *OpI = dyn_cast<Instruction>(Op)) {
179 if (RecursivelyDeleteTriviallyDeadInstructions(
180 OpI, nullptr, nullptr, [&](Value *V) {
181 if (auto *I = dyn_cast<Instruction>(V)) {
182 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
183 Worklist.remove(I);
184 if (I == NextInst)
185 NextInst = NextInst->getNextNode();
186 Visited.insert(I);
187 }
188 }))
189 continue;
190 Worklist.pushUsersToWorkList(*OpI);
191 Worklist.pushValue(OpI);
192 }
193 }
194 }
195 }
196};
197} // namespace
198
199/// Return the source operand of a potentially bitcasted value. If there is no
200/// bitcast, return the input value itself.
201static Value *peekThroughBitcasts(Value *V) {
202 while (auto *BitCast = dyn_cast<BitCastInst>(V))
203 V = BitCast->getOperand(0);
204 return V;
205}
206
207static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
208 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
209 // The widened load may load data from dirty regions or create data races
210 // non-existent in the source.
211 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
212 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
213 mustSuppressSpeculation(*Load))
214 return false;
215
216 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
217 // sure we have all of our type-based constraints in place for this target.
218 Type *ScalarTy = Load->getType()->getScalarType();
219 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
220 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
221 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
222 ScalarSize % 8 != 0)
223 return false;
224
225 return true;
226}
227
228bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
229 // Match insert into fixed vector of scalar value.
230 // TODO: Handle non-zero insert index.
231 Value *Scalar;
232 if (!match(&I,
233 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
234 return false;
235
236 // Optionally match an extract from another vector.
237 Value *X;
238 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
239 if (!HasExtract)
240 X = Scalar;
241
242 auto *Load = dyn_cast<LoadInst>(X);
243 if (!canWidenLoad(Load, TTI))
244 return false;
245
246 Type *ScalarTy = Scalar->getType();
247 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
248 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
249
250 // Check safety of replacing the scalar load with a larger vector load.
251 // We use minimal alignment (maximum flexibility) because we only care about
252 // the dereferenceable region. When calculating cost and creating a new op,
253 // we may use a larger value based on alignment attributes.
254 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
255 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
256
257 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
258 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
259 unsigned OffsetEltIndex = 0;
260 Align Alignment = Load->getAlign();
261 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
262 &DT)) {
263 // It is not safe to load directly from the pointer, but we can still peek
264 // through gep offsets and check if it safe to load from a base address with
265 // updated alignment. If it is, we can shuffle the element(s) into place
266 // after loading.
267 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
268 APInt Offset(OffsetBitWidth, 0);
269 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
270
271 // We want to shuffle the result down from a high element of a vector, so
272 // the offset must be positive.
273 if (Offset.isNegative())
274 return false;
275
276 // The offset must be a multiple of the scalar element to shuffle cleanly
277 // in the element's size.
278 uint64_t ScalarSizeInBytes = ScalarSize / 8;
279 if (Offset.urem(ScalarSizeInBytes) != 0)
280 return false;
281
282 // If we load MinVecNumElts, will our target element still be loaded?
283 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
284 if (OffsetEltIndex >= MinVecNumElts)
285 return false;
286
287 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
288 &DT))
289 return false;
290
291 // Update alignment with offset value. Note that the offset could be negated
292 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
293 // negation does not change the result of the alignment calculation.
294 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
295 }
296
297 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
298 // Use the greater of the alignment on the load or its source pointer.
299 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
300 Type *LoadTy = Load->getType();
301 unsigned AS = Load->getPointerAddressSpace();
302 InstructionCost OldCost =
303 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
304 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
305 OldCost +=
306 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
307 /* Insert */ true, HasExtract, CostKind);
308
309 // New pattern: load VecPtr
310 InstructionCost NewCost =
311 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
312 // Optionally, we are shuffling the loaded vector element(s) into place.
313 // For the mask set everything but element 0 to undef to prevent poison from
314 // propagating from the extra loaded memory. This will also optionally
315 // shrink/grow the vector from the loaded size to the output size.
316 // We assume this operation has no cost in codegen if there was no offset.
317 // Note that we could use freeze to avoid poison problems, but then we might
318 // still need a shuffle to change the vector size.
319 auto *Ty = cast<FixedVectorType>(I.getType());
320 unsigned OutputNumElts = Ty->getNumElements();
321 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
322 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
323 Mask[0] = OffsetEltIndex;
324 if (OffsetEltIndex)
325 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
326 CostKind);
327
328 // We can aggressively convert to the vector form because the backend can
329 // invert this transform if it does not result in a performance win.
330 if (OldCost < NewCost || !NewCost.isValid())
331 return false;
332
333 // It is safe and potentially profitable to load a vector directly:
334 // inselt undef, load Scalar, 0 --> load VecPtr
335 IRBuilder<> Builder(Load);
336 Value *CastedPtr =
337 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
338 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
339 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
340
341 replaceValue(I, *VecLd);
342 ++NumVecLoad;
343 return true;
344}
345
346/// If we are loading a vector and then inserting it into a larger vector with
347/// undefined elements, try to load the larger vector and eliminate the insert.
348/// This removes a shuffle in IR and may allow combining of other loaded values.
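/// A minimal illustrative sketch (assumed IR, not taken from this source):
///   %small = load <2 x float>, ptr %p
///   %wide  = shufflevector <2 x float> %small, <2 x float> poison,
///                          <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// may become, when the wider region is known dereferenceable and the cost
/// model agrees:
///   %wide = load <4 x float>, ptr %p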
349bool VectorCombine::widenSubvectorLoad(Instruction &I) {
350 // Match subvector insert of fixed vector.
351 auto *Shuf = cast<ShuffleVectorInst>(&I);
352 if (!Shuf->isIdentityWithPadding())
353 return false;
354
355 // Allow a non-canonical shuffle mask that is choosing elements from op1.
356 unsigned NumOpElts =
357 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
358 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
359 return M >= (int)(NumOpElts);
360 });
361
362 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
363 if (!canWidenLoad(Load, TTI))
364 return false;
365
366 // We use minimal alignment (maximum flexibility) because we only care about
367 // the dereferenceable region. When calculating cost and creating a new op,
368 // we may use a larger value based on alignment attributes.
369 auto *Ty = cast<FixedVectorType>(I.getType());
370 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
371 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
372 Align Alignment = Load->getAlign();
373 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
374 return false;
375
376 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
377 Type *LoadTy = Load->getType();
378 unsigned AS = Load->getPointerAddressSpace();
379
380 // Original pattern: insert_subvector (load PtrOp)
381 // This conservatively assumes that the cost of a subvector insert into an
382 // undef value is 0. We could add that cost if the cost model accurately
383 // reflects the real cost of that operation.
384 InstructionCost OldCost =
385 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
386
387 // New pattern: load PtrOp
388 InstructionCost NewCost =
389 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
390
391 // We can aggressively convert to the vector form because the backend can
392 // invert this transform if it does not result in a performance win.
393 if (OldCost < NewCost || !NewCost.isValid())
394 return false;
395
396 IRBuilder<> Builder(Load);
397 Value *CastedPtr =
398 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
399 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
400 replaceValue(I, *VecLd);
401 ++NumVecLoad;
402 return true;
403}
404
405/// Determine which, if any, of the inputs should be replaced by a shuffle
406/// followed by extract from a different index.
407ExtractElementInst *VectorCombine::getShuffleExtract(
408 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
409 unsigned PreferredExtractIndex = InvalidIndex) const {
410 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
411 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
412 assert(Index0C && Index1C && "Expected constant extract indexes");
413
414 unsigned Index0 = Index0C->getZExtValue();
415 unsigned Index1 = Index1C->getZExtValue();
416
417 // If the extract indexes are identical, no shuffle is needed.
418 if (Index0 == Index1)
419 return nullptr;
420
421 Type *VecTy = Ext0->getVectorOperand()->getType();
422 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
423 InstructionCost Cost0 =
424 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
425 InstructionCost Cost1 =
426 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
427
428 // If both costs are invalid no shuffle is needed
429 if (!Cost0.isValid() && !Cost1.isValid())
430 return nullptr;
431
432 // We are extracting from 2 different indexes, so one operand must be shuffled
433 // before performing a vector operation and/or extract. The more expensive
434 // extract will be replaced by a shuffle.
435 if (Cost0 > Cost1)
436 return Ext0;
437 if (Cost1 > Cost0)
438 return Ext1;
439
440 // If the costs are equal and there is a preferred extract index, shuffle the
441 // opposite operand.
442 if (PreferredExtractIndex == Index0)
443 return Ext1;
444 if (PreferredExtractIndex == Index1)
445 return Ext0;
446
447 // Otherwise, replace the extract with the higher index.
448 return Index0 > Index1 ? Ext0 : Ext1;
449}
450
451/// Compare the relative costs of 2 extracts followed by scalar operation vs.
452/// vector operation(s) followed by extract. Return true if the existing
453/// instructions are cheaper than a vector alternative. Otherwise, return false
454/// and if one of the extracts should be transformed to a shufflevector, set
455/// \p ConvertToShuffle to that extract instruction.
456bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
457 ExtractElementInst *Ext1,
458 const Instruction &I,
459 ExtractElementInst *&ConvertToShuffle,
460 unsigned PreferredExtractIndex) {
461 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
462 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
463 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
464
465 unsigned Opcode = I.getOpcode();
466 Value *Ext0Src = Ext0->getVectorOperand();
467 Value *Ext1Src = Ext1->getVectorOperand();
468 Type *ScalarTy = Ext0->getType();
469 auto *VecTy = cast<VectorType>(Ext0Src->getType());
470 InstructionCost ScalarOpCost, VectorOpCost;
471
472 // Get cost estimates for scalar and vector versions of the operation.
473 bool IsBinOp = Instruction::isBinaryOp(Opcode);
474 if (IsBinOp) {
475 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
476 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
477 } else {
478 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
479 "Expected a compare");
480 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
481 ScalarOpCost = TTI.getCmpSelInstrCost(
482 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
483 VectorOpCost = TTI.getCmpSelInstrCost(
484 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
485 }
486
487 // Get cost estimates for the extract elements. These costs will factor into
488 // both sequences.
489 unsigned Ext0Index = Ext0IndexC->getZExtValue();
490 unsigned Ext1Index = Ext1IndexC->getZExtValue();
491
492 InstructionCost Extract0Cost =
493 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
494 InstructionCost Extract1Cost =
495 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
496
497 // A more expensive extract will always be replaced by a splat shuffle.
498 // For example, if Ext0 is more expensive:
499 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
500 // extelt (opcode (splat V0, Ext0), V1), Ext1
501 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
502 // check the cost of creating a broadcast shuffle and shuffling both
503 // operands to element 0.
504 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
505 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
506 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
507
508 // Extra uses of the extracts mean that we include those costs in the
509 // vector total because those instructions will not be eliminated.
510 InstructionCost OldCost, NewCost;
511 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
512 // Handle a special case. If the 2 extracts are identical, adjust the
513 // formulas to account for that. The extra use charge allows for either the
514 // CSE'd pattern or an unoptimized form with identical values:
515 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
516 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
517 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
518 OldCost = CheapExtractCost + ScalarOpCost;
519 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
520 } else {
521 // Handle the general case. Each extract is actually a different value:
522 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
523 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
524 NewCost = VectorOpCost + CheapExtractCost +
525 !Ext0->hasOneUse() * Extract0Cost +
526 !Ext1->hasOneUse() * Extract1Cost;
527 }
528
529 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
530 if (ConvertToShuffle) {
531 if (IsBinOp && DisableBinopExtractShuffle)
532 return true;
533
534 // If we are extracting from 2 different indexes, then one operand must be
535 // shuffled before performing the vector operation. The shuffle mask is
536 // poison except for 1 lane that is being translated to the remaining
537 // extraction lane. Therefore, it is a splat shuffle. Ex:
538 // ShufMask = { poison, poison, 0, poison }
539 // TODO: The cost model has an option for a "broadcast" shuffle
540 // (splat-from-element-0), but no option for a more general splat.
541 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
542 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
543 PoisonMaskElem);
544 ShuffleMask[BestInsIndex] = BestExtIndex;
545 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
546 VecTy, VecTy, ShuffleMask, CostKind, 0,
547 nullptr, {ConvertToShuffle});
548 } else {
549 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
550 VecTy, VecTy, {}, CostKind, 0, nullptr,
551 {ConvertToShuffle});
552 }
553 }
554
555 // Aggressively form a vector op if the cost is equal because the transform
556 // may enable further optimization.
557 // Codegen can reverse this transform (scalarize) if it was not profitable.
558 return OldCost < NewCost;
559}
560
561/// Create a shuffle that translates (shifts) 1 element from the input vector
562/// to a new element location.
563static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
564 unsigned NewIndex, IRBuilderBase &Builder) {
565 // The shuffle mask is poison except for 1 lane that is being translated
566 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
567 // ShufMask = { 2, poison, poison, poison }
568 auto *VecTy = cast<FixedVectorType>(Vec->getType());
569 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
570 ShufMask[NewIndex] = OldIndex;
571 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
572}
573
574/// Given an extract element instruction with constant index operand, shuffle
575/// the source vector (shift the scalar element) to a NewIndex for extraction.
576/// Return null if the input can be constant folded, so that we are not creating
577/// unnecessary instructions.
578static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
579 IRBuilderBase &Builder) {
580 // Shufflevectors can only be created for fixed-width vectors.
581 Value *X = ExtElt->getVectorOperand();
582 if (!isa<FixedVectorType>(X->getType()))
583 return nullptr;
584
585 // If the extract can be constant-folded, this code is unsimplified. Defer
586 // to other passes to handle that.
587 Value *C = ExtElt->getIndexOperand();
588 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
589 if (isa<Constant>(X))
590 return nullptr;
591
592 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
593 NewIndex, Builder);
594 return Shuf;
595}
596
597/// Try to reduce extract element costs by converting scalar compares to vector
598/// compares followed by extract.
599/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
600Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
601 Instruction &I) {
602 assert(isa<CmpInst>(&I) && "Expected a compare");
603
604 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
605 // --> extelt (cmp Pred V0, V1), ExtIndex
606 ++NumVecCmp;
607 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
608 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
609 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
610}
611
612/// Try to reduce extract element costs by converting scalar binops to vector
613/// binops followed by extract.
614/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
615Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
616 Instruction &I) {
617 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
618
619 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
620 // --> extelt (bo V0, V1), ExtIndex
621 ++NumVecBO;
622 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
623 V1, "foldExtExtBinop");
624
625 // All IR flags are safe to back-propagate because any potential poison
626 // created in unused vector elements is discarded by the extract.
627 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
628 VecBOInst->copyIRFlags(&I);
629
630 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
631}
632
633/// Match an instruction with extracted vector operands.
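/// An illustrative sketch (assumed IR, not taken from this source):
///   %e0 = extractelement <4 x i32> %x, i32 1
///   %e1 = extractelement <4 x i32> %y, i32 1
///   %r  = add i32 %e0, %e1
/// may become, when the cost model agrees:
///   %v = add <4 x i32> %x, %y
///   %r = extractelement <4 x i32> %v, i32 1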
634bool VectorCombine::foldExtractExtract(Instruction &I) {
635 // It is not safe to transform things like div, urem, etc. because we may
636 // create undefined behavior when executing those on unknown vector elements.
637 if (!isSafeToSpeculativelyExecute(&I))
638 return false;
639
640 Instruction *I0, *I1;
641 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
642 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
643 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
644 return false;
645
646 Value *V0, *V1;
647 uint64_t C0, C1;
648 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
649 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
650 V0->getType() != V1->getType())
651 return false;
652
653 // If the scalar value 'I' is going to be re-inserted into a vector, then try
654 // to create an extract to that same element. The extract/insert can be
655 // reduced to a "select shuffle".
656 // TODO: If we add a larger pattern match that starts from an insert, this
657 // probably becomes unnecessary.
658 auto *Ext0 = cast<ExtractElementInst>(I0);
659 auto *Ext1 = cast<ExtractElementInst>(I1);
660 uint64_t InsertIndex = InvalidIndex;
661 if (I.hasOneUse())
662 match(I.user_back(),
663 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
664
665 ExtractElementInst *ExtractToChange;
666 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
667 return false;
668
669 Value *ExtOp0 = Ext0->getVectorOperand();
670 Value *ExtOp1 = Ext1->getVectorOperand();
671
672 if (ExtractToChange) {
673 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
674 Value *NewExtOp =
675 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
676 if (!NewExtOp)
677 return false;
678 if (ExtractToChange == Ext0)
679 ExtOp0 = NewExtOp;
680 else
681 ExtOp1 = NewExtOp;
682 }
683
684 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
685 : Ext0->getIndexOperand();
686 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
687 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
688 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
689 Worklist.push(Ext0);
690 Worklist.push(Ext1);
691 replaceValue(I, *NewExt);
692 return true;
693}
694
695/// Try to replace an extract + scalar fneg + insert with a vector fneg +
696/// shuffle.
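/// An illustrative sketch (assumed IR, not taken from this source):
///   %e = extractelement <4 x float> %src, i32 2
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i32 2
/// may become a vector fneg plus a select-style shuffle:
///   %v = fneg <4 x float> %src
///   %r = shufflevector <4 x float> %dst, <4 x float> %v,
///                      <4 x i32> <i32 0, i32 1, i32 6, i32 3>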
697bool VectorCombine::foldInsExtFNeg(Instruction &I) {
698 // Match an insert (op (extract)) pattern.
699 Value *DestVec;
700 uint64_t Index;
701 Instruction *FNeg;
702 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
703 m_ConstantInt(Index))))
704 return false;
705
706 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
707 Value *SrcVec;
708 Instruction *Extract;
709 if (!match(FNeg, m_FNeg(m_CombineAnd(
710 m_Instruction(Extract),
711 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
712 return false;
713
714 auto *VecTy = cast<FixedVectorType>(I.getType());
715 auto *ScalarTy = VecTy->getScalarType();
716 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
717 if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
718 return false;
719
720 // Ignore bogus insert/extract index.
721 unsigned NumElts = VecTy->getNumElements();
722 if (Index >= NumElts)
723 return false;
724
725 // We are inserting the negated element into the same lane that we extracted
726 // from. This is equivalent to a select-shuffle that chooses all but the
727 // negated element from the destination vector.
728 SmallVector<int> Mask(NumElts);
729 std::iota(Mask.begin(), Mask.end(), 0);
730 Mask[Index] = Index + NumElts;
731 InstructionCost OldCost =
732 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
733 TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
734
735 // If the extract has one use, it will be eliminated, so count it in the
736 // original cost. If it has more than one use, ignore the cost because it will
737 // be the same before/after.
738 if (Extract->hasOneUse())
739 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
740
741 InstructionCost NewCost =
742 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
743 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
744 Mask, CostKind);
745
746 bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
747 // If the lengths of the two vectors are not equal, we need an extra
748 // length-changing shuffle; account for its cost.
749 SmallVector<int> SrcMask;
750 if (NeedLenChg) {
751 SrcMask.assign(NumElts, PoisonMaskElem);
752 SrcMask[Index] = Index;
753 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
754 VecTy, SrcVecTy, SrcMask, CostKind);
755 }
756
757 if (NewCost > OldCost)
758 return false;
759
760 Value *NewShuf;
761 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
762 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
763 if (NeedLenChg) {
764 // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
765 Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
766 NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
767 } else {
768 // shuffle DestVec, (fneg SrcVec), Mask
769 NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
770 }
771
772 replaceValue(I, *NewShuf);
773 return true;
774}
775
776/// Try to fold insert(binop(x,y),binop(a,b),idx)
777/// --> binop(insert(x,a,idx),insert(y,b,idx))
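/// An illustrative sketch (assumed IR, not taken from this source):
///   %vadd = add <4 x i32> %x, %y
///   %sadd = add i32 %a, %b
///   %r    = insertelement <4 x i32> %vadd, i32 %sadd, i32 0
/// may become, when the cost model agrees:
///   %i0 = insertelement <4 x i32> %x, i32 %a, i32 0
///   %i1 = insertelement <4 x i32> %y, i32 %b, i32 0
///   %r  = add <4 x i32> %i0, %i1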
778bool VectorCombine::foldInsExtBinop(Instruction &I) {
779 BinaryOperator *VecBinOp, *SclBinOp;
780 uint64_t Index;
781 if (!match(&I,
782 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
783 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
784 return false;
785
786 // TODO: Add support for addlike etc.
787 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
788 if (BinOpcode != SclBinOp->getOpcode())
789 return false;
790
791 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
792 if (!ResultTy)
793 return false;
794
795 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
796 // shuffle?
797
798 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
799 TTI.getInstructionCost(VecBinOp, CostKind) +
800 TTI.getInstructionCost(SclBinOp, CostKind);
801 InstructionCost NewCost =
802 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
803 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
804 Index, VecBinOp->getOperand(0),
805 SclBinOp->getOperand(0)) +
806 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
807 Index, VecBinOp->getOperand(1),
808 SclBinOp->getOperand(1));
809
810 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
811 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
812 << "\n");
813 if (NewCost > OldCost)
814 return false;
815
816 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
817 SclBinOp->getOperand(0), Index);
818 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1), Index);
820 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
821
822 // Intersect flags from the old binops.
823 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
824 NewInst->copyIRFlags(VecBinOp);
825 NewInst->andIRFlags(SclBinOp);
826 }
827
828 Worklist.pushValue(NewIns0);
829 Worklist.pushValue(NewIns1);
830 replaceValue(I, *NewBO);
831 return true;
832}
833
834/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
835/// Supports: bitcast, trunc, sext, zext
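/// An illustrative sketch (assumed IR, not taken from this source):
///   %zx = zext <4 x i16> %x to <4 x i32>
///   %zy = zext <4 x i16> %y to <4 x i32>
///   %r  = and <4 x i32> %zx, %zy
/// may become, when the narrower bitwise op is no more expensive:
///   %a = and <4 x i16> %x, %y
///   %r = zext <4 x i16> %a to <4 x i32>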
836bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
837 // Check if this is a bitwise logic operation
838 auto *BinOp = dyn_cast<BinaryOperator>(&I);
839 if (!BinOp || !BinOp->isBitwiseLogicOp())
840 return false;
841
842 // Get the cast instructions
843 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
844 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
845 if (!LHSCast || !RHSCast) {
846 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
847 return false;
848 }
849
850 // Both casts must be the same type
851 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
852 if (CastOpcode != RHSCast->getOpcode())
853 return false;
854
855 // Only handle supported cast operations
856 switch (CastOpcode) {
857 case Instruction::BitCast:
858 case Instruction::Trunc:
859 case Instruction::SExt:
860 case Instruction::ZExt:
861 break;
862 default:
863 return false;
864 }
865
866 Value *LHSSrc = LHSCast->getOperand(0);
867 Value *RHSSrc = RHSCast->getOperand(0);
868
869 // Source types must match
870 if (LHSSrc->getType() != RHSSrc->getType())
871 return false;
872
873 auto *SrcTy = LHSSrc->getType();
874 auto *DstTy = I.getType();
875 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
876 // Other casts only handle vector types with integer elements.
877 if (CastOpcode != Instruction::BitCast &&
878 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
879 return false;
880
881 // Only integer scalar/vector values are legal for bitwise logic operations.
882 if (!SrcTy->getScalarType()->isIntegerTy() ||
883 !DstTy->getScalarType()->isIntegerTy())
884 return false;
885
886 // Cost Check :
887 // OldCost = bitlogic + 2*casts
888 // NewCost = bitlogic + cast
889
890 // Calculate specific costs for each cast with instruction context
891 InstructionCost LHSCastCost = TTI.getCastInstrCost(
892 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
893 InstructionCost RHSCastCost = TTI.getCastInstrCost(
894 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
895
896 InstructionCost OldCost =
897 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
898 LHSCastCost + RHSCastCost;
899
900 // For new cost, we can't provide an instruction (it doesn't exist yet)
901 InstructionCost GenericCastCost = TTI.getCastInstrCost(
902 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
903
904 InstructionCost NewCost =
905 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
906 GenericCastCost;
907
908 // Account for multi-use casts using specific costs
909 if (!LHSCast->hasOneUse())
910 NewCost += LHSCastCost;
911 if (!RHSCast->hasOneUse())
912 NewCost += RHSCastCost;
913
914 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
915 << " NewCost=" << NewCost << "\n");
916
917 if (NewCost > OldCost)
918 return false;
919
920 // Create the operation on the source type
921 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
922 BinOp->getName() + ".inner");
923 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
924 NewBinOp->copyIRFlags(BinOp);
925
926 Worklist.pushValue(NewOp);
927
928 // Create the cast operation directly to ensure we get a new instruction
929 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
930
931 // Preserve cast instruction flags
932 NewCast->copyIRFlags(LHSCast);
933 NewCast->andIRFlags(RHSCast);
934
935 // Insert the new instruction
936 Value *Result = Builder.Insert(NewCast);
937
938 replaceValue(I, *Result);
939 return true;
940}
941
942/// Match:
943// bitop(castop(x), C) ->
944// bitop(castop(x), castop(InvC)) ->
945// castop(bitop(x, InvC))
946// Supports: bitcast
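// An illustrative sketch (assumed IR, not taken from this source):
//   %bc = bitcast <2 x i32> %x to <4 x i16>
//   %r  = and <4 x i16> %bc, <constant C>
// may become, when a constant InvC with bitcast(InvC) == C exists:
//   %a = and <2 x i32> %x, <constant InvC>
//   %r = bitcast <2 x i32> %a to <4 x i16>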
947bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
948 Value *LHS;
949 Constant *C;
950
951 // Check if this is a bitwise logic operation
952 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
953 return false;
954
955 // Get the cast instructions
956 auto *LHSCast = dyn_cast<CastInst>(LHS);
957 if (!LHSCast)
958 return false;
959
960 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
961
962 // Only handle supported cast operations
963 switch (CastOpcode) {
964 case Instruction::BitCast:
965 break;
966 default:
967 return false;
968 }
969
970 Value *LHSSrc = LHSCast->getOperand(0);
971
972 auto *SrcTy = LHSSrc->getType();
973 auto *DstTy = I.getType();
974 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
975 // Other casts only handle vector types with integer elements.
976 if (CastOpcode != Instruction::BitCast &&
977 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
978 return false;
979
980 // Only integer scalar/vector values are legal for bitwise logic operations.
981 if (!SrcTy->getScalarType()->isIntegerTy() ||
982 !DstTy->getScalarType()->isIntegerTy())
983 return false;
984
985 // Find the constant InvC, such that castop(InvC) equals to C.
986 PreservedCastFlags RHSFlags;
987 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
988 if (!InvC)
989 return false;
990
991 // Cost Check :
992 // OldCost = bitlogic + cast
993 // NewCost = bitlogic + cast
994
995 // Calculate specific costs for each cast with instruction context
996 InstructionCost LHSCastCost = TTI.getCastInstrCost(
997 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
998
999 InstructionCost OldCost =
1000 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1001
1002 // For new cost, we can't provide an instruction (it doesn't exist yet)
1003 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1004 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1005
1006 InstructionCost NewCost =
1007 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1008 GenericCastCost;
1009
1010 // Account for multi-use casts using specific costs
1011 if (!LHSCast->hasOneUse())
1012 NewCost += LHSCastCost;
1013
1014 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1015 << " NewCost=" << NewCost << "\n");
1016
1017 if (NewCost > OldCost)
1018 return false;
1019
1020 // Create the operation on the source type
1021 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1022 LHSSrc, InvC, I.getName() + ".inner");
1023 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1024 NewBinOp->copyIRFlags(&I);
1025
1026 Worklist.pushValue(NewOp);
1027
1028 // Create the cast operation directly to ensure we get a new instruction
1029 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1030
1031 // Insert the new instruction
1032 Value *Result = Builder.Insert(NewCast);
1033
1034 replaceValue(I, *Result);
1035 return true;
1036}
1037
1038/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1039/// destination type followed by shuffle. This can enable further transforms by
1040/// moving bitcasts or shuffles together.
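/// An illustrative sketch (assumed IR, not taken from this source):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = bitcast <4 x i32> %s to <8 x i16>
/// may become, with the mask rescaled to the new element size:
///   %b = bitcast <4 x i32> %v to <8 x i16>
///   %r = shufflevector <8 x i16> %b, <8 x i16> poison,
///        <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>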
1041bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1042 Value *V0, *V1;
1043 ArrayRef<int> Mask;
1044 if (!match(&I, m_BitCast(m_OneUse(
1045 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1046 return false;
1047
1048 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1049 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1050 // mask for scalable type is a splat or not.
1051 // 2) Disallow non-vector casts.
1052 // TODO: We could allow any shuffle.
1053 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1054 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1055 if (!DestTy || !SrcTy)
1056 return false;
1057
1058 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1059 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1060 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1061 return false;
1062
1063 bool IsUnary = isa<UndefValue>(V1);
1064
1065 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1066 // if it won't increase the number of bitcasts.
1067 if (!IsUnary) {
1068 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1069 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1070 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1071 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1072 return false;
1073 }
1074
1075 SmallVector<int, 16> NewMask;
1076 if (DestEltSize <= SrcEltSize) {
1077 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1078 // always be expanded to the equivalent form choosing narrower elements.
1079 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1080 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1081 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1082 } else {
1083 // The bitcast is from narrow elements to wide elements. The shuffle mask
1084 // must choose consecutive elements to allow casting first.
1085 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1086 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1087 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1088 return false;
1089 }
1090
1091 // Bitcast the shuffle src - keep its original width but using the destination
1092 // scalar type.
1093 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1094 auto *NewShuffleTy =
1095 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1096 auto *OldShuffleTy =
1097 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1098 unsigned NumOps = IsUnary ? 1 : 2;
1099
1100 // The new shuffle must not cost more than the old shuffle.
1101 TargetTransformInfo::ShuffleKind SK =
1102 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1103 : TargetTransformInfo::SK_PermuteTwoSrc;
1104
1105 InstructionCost NewCost =
1106 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1107 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1108 TargetTransformInfo::CastContextHint::None,
1109 CostKind));
1110 InstructionCost OldCost =
1111 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1112 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1113 TargetTransformInfo::CastContextHint::None,
1114 CostKind);
1115
1116 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1117 << OldCost << " vs NewCost: " << NewCost << "\n");
1118
1119 if (NewCost > OldCost || !NewCost.isValid())
1120 return false;
1121
1122 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1123 ++NumShufOfBitcast;
1124 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1125 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1126 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1127 replaceValue(I, *Shuf);
1128 return true;
1129}
1130
1131/// VP Intrinsics whose vector operands are both splat values may be simplified
1132/// into the scalar version of the operation and the result splatted. This
1133/// can lead to scalarization down the line.
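/// An illustrative sketch (assumed IR, not taken from this source), where
/// %xs and %ys are splats of %x and %y and the mask is all-true:
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xs, <4 x i32> %ys,
///                                          <4 x i1> %alltrue, i32 %evl)
/// may become a scalar add that is then splatted:
///   %s   = add i32 %x, %y
///   %ins = insertelement <4 x i32> poison, i32 %s, i64 0
///   %r   = shufflevector <4 x i32> %ins, <4 x i32> poison,
///                        <4 x i32> zeroinitializer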
1134bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1135 if (!isa<VPIntrinsic>(I))
1136 return false;
1137 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1138 Value *Op0 = VPI.getArgOperand(0);
1139 Value *Op1 = VPI.getArgOperand(1);
1140
1141 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1142 return false;
1143
1144 // Check getSplatValue early in this function, to avoid doing unnecessary
1145 // work.
1146 Value *ScalarOp0 = getSplatValue(Op0);
1147 Value *ScalarOp1 = getSplatValue(Op1);
1148 if (!ScalarOp0 || !ScalarOp1)
1149 return false;
1150
1151 // For the binary VP intrinsics supported here, the result on disabled lanes
1152 // is a poison value. For now, only do this simplification if all lanes
1153 // are active.
1154 // TODO: Relax the condition that all lanes are active by using insertelement
1155 // on inactive lanes.
1156 auto IsAllTrueMask = [](Value *MaskVal) {
1157 if (Value *SplattedVal = getSplatValue(MaskVal))
1158 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1159 return ConstValue->isAllOnesValue();
1160 return false;
1161 };
1162 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1163 return false;
1164
1165 // Check to make sure we support scalarization of the intrinsic
1166 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1167 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1168 return false;
1169
1170 // Calculate cost of splatting both operands into vectors and the vector
1171 // intrinsic
1172 VectorType *VecTy = cast<VectorType>(VPI.getType());
1173 SmallVector<int> Mask;
1174 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1175 Mask.resize(FVTy->getNumElements(), 0);
1176 InstructionCost SplatCost =
1177 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1178 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1179 CostKind);
1180
1181 // Calculate the cost of the VP Intrinsic
1182 SmallVector<Type *, 4> Args;
1183 for (Value *V : VPI.args())
1184 Args.push_back(V->getType());
1185 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1186 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1187 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1188
1189 // Determine scalar opcode
1190 std::optional<unsigned> FunctionalOpcode =
1191 VPI.getFunctionalOpcode();
1192 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1193 if (!FunctionalOpcode) {
1194 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1195 if (!ScalarIntrID)
1196 return false;
1197 }
1198
1199 // Calculate cost of scalarizing
1200 InstructionCost ScalarOpCost = 0;
1201 if (ScalarIntrID) {
1202 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1203 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1204 } else {
1205 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1206 VecTy->getScalarType(), CostKind);
1207 }
1208
1209 // The existing splats may be kept around if other instructions use them.
1210 InstructionCost CostToKeepSplats =
1211 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1212 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1213
1214 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1215 << "\n");
1216 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1217 << ", Cost of scalarizing:" << NewCost << "\n");
1218
1219 // We want to scalarize unless the vector variant actually has lower cost.
1220 if (OldCost < NewCost || !NewCost.isValid())
1221 return false;
1222
1223 // Scalarize the intrinsic
1224 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1225 Value *EVL = VPI.getArgOperand(3);
1226
1227 // If the VP op might introduce UB or poison, we can scalarize it provided
1228 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1229 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1230 // scalarizing it.
1231 bool SafeToSpeculate;
1232 if (ScalarIntrID)
1233 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1234 .hasAttribute(Attribute::AttrKind::Speculatable);
1235 else
1236 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1237 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1238 if (!SafeToSpeculate &&
1239 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1240 return false;
1241
1242 Value *ScalarVal =
1243 ScalarIntrID
1244 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1245 {ScalarOp0, ScalarOp1})
1246 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1247 ScalarOp0, ScalarOp1);
1248
1249 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1250 return true;
1251}
1252
1253/// Match a vector op/compare/intrinsic with at least one
1254/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1255/// by insertelement.
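/// An illustrative sketch (assumed IR, not taken from this source):
///   %ins = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %x, i64 0
///   %r   = add <4 x i32> %ins, <i32 7, i32 7, i32 7, i32 7>
/// may become, with the constant lanes folded up front:
///   %s = add i32 %x, 7
///   %r = insertelement <4 x i32> <i32 7, i32 8, i32 9, i32 10>, i32 %s, i64 0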
1256bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1257 auto *UO = dyn_cast<UnaryOperator>(&I);
1258 auto *BO = dyn_cast<BinaryOperator>(&I);
1259 auto *CI = dyn_cast<CmpInst>(&I);
1260 auto *II = dyn_cast<IntrinsicInst>(&I);
1261 if (!UO && !BO && !CI && !II)
1262 return false;
1263
1264 // TODO: Allow intrinsics with different argument types
1265 if (II) {
1266 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1267 return false;
1268 for (auto [Idx, Arg] : enumerate(II->args()))
1269 if (Arg->getType() != II->getType() &&
1270 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1271 return false;
1272 }
1273
1274 // Do not convert the vector condition of a vector select into a scalar
1275 // condition. That may cause problems for codegen because of differences in
1276 // boolean formats and register-file transfers.
1277 // TODO: Can we account for that in the cost model?
1278 if (CI)
1279 for (User *U : I.users())
1280 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1281 return false;
1282
1283 // Match constant vectors or scalars being inserted into constant vectors:
1284 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1285 SmallVector<Value *> VecCs, ScalarOps;
1286 std::optional<uint64_t> Index;
1287
1288 auto Ops = II ? II->args() : I.operands();
1289 for (auto [OpNum, Op] : enumerate(Ops)) {
1290 Constant *VecC;
1291 Value *V;
1292 uint64_t InsIdx = 0;
1293 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1294 m_ConstantInt(InsIdx)))) {
1295 // Bail if any inserts are out of bounds.
1296 VectorType *OpTy = cast<VectorType>(Op->getType());
1297 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1298 return false;
1299 // All inserts must have the same index.
1300 // TODO: Deal with mismatched index constants and variable indexes?
1301 if (!Index)
1302 Index = InsIdx;
1303 else if (InsIdx != *Index)
1304 return false;
1305 VecCs.push_back(VecC);
1306 ScalarOps.push_back(V);
1307 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1308 OpNum, &TTI)) {
1309 VecCs.push_back(Op.get());
1310 ScalarOps.push_back(Op.get());
1311 } else if (match(Op.get(), m_Constant(VecC))) {
1312 VecCs.push_back(VecC);
1313 ScalarOps.push_back(nullptr);
1314 } else {
1315 return false;
1316 }
1317 }
1318
1319 // Bail if all operands are constant.
1320 if (!Index.has_value())
1321 return false;
1322
1323 VectorType *VecTy = cast<VectorType>(I.getType());
1324 Type *ScalarTy = VecTy->getScalarType();
1325 assert(VecTy->isVectorTy() &&
1326 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1327 ScalarTy->isPointerTy()) &&
1328 "Unexpected types for insert element into binop or cmp");
1329
1330 unsigned Opcode = I.getOpcode();
1331 InstructionCost ScalarOpCost, VectorOpCost;
1332 if (CI) {
1333 CmpInst::Predicate Pred = CI->getPredicate();
1334 ScalarOpCost = TTI.getCmpSelInstrCost(
1335 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1336 VectorOpCost = TTI.getCmpSelInstrCost(
1337 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1338 } else if (UO || BO) {
1339 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1340 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1341 } else {
1342 IntrinsicCostAttributes ScalarICA(
1343 II->getIntrinsicID(), ScalarTy,
1344 SmallVector<Type *>(II->arg_size(), ScalarTy));
1345 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1346 IntrinsicCostAttributes VectorICA(
1347 II->getIntrinsicID(), VecTy,
1348 SmallVector<Type *>(II->arg_size(), VecTy));
1349 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1350 }
1351
1352 // Fold the vector constants in the original vectors into a new base vector to
1353 // get more accurate cost modelling.
1354 Value *NewVecC = nullptr;
1355 if (CI)
1356 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1357 else if (UO)
1358 NewVecC =
1359 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1360 else if (BO)
1361 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1362 else if (II)
1363 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1364
1365 if (!NewVecC)
1366 return false;
1367
1368 // Get cost estimate for the insert element. This cost will factor into
1369 // both sequences.
1370 InstructionCost OldCost = VectorOpCost;
1371 InstructionCost NewCost =
1372 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1373 CostKind, *Index, NewVecC);
1374
1375 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1376 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1377 II->getIntrinsicID(), Idx, &TTI)))
1378 continue;
1379 InstructionCost InsertCost = TTI.getVectorInstrCost(
1380 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1381 OldCost += InsertCost;
1382 NewCost += !Op->hasOneUse() * InsertCost;
1383 }
1384
1385 // We want to scalarize unless the vector variant actually has lower cost.
1386 if (OldCost < NewCost || !NewCost.isValid())
1387 return false;
1388
1389 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1390 // inselt NewVecC, (scalar_op V0, V1), Index
1391 if (CI)
1392 ++NumScalarCmp;
1393 else if (UO || BO)
1394 ++NumScalarOps;
1395 else
1396 ++NumScalarIntrinsic;
1397
1398 // For constant cases, extract the scalar element, this should constant fold.
1399 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1400 if (!Scalar)
1401 ScalarOps[OpIdx] = Builder.CreateExtractElement(
1402 cast<Constant>(VecC), Builder.getInt64(*Index));
1403
1404 Value *Scalar;
1405 if (CI)
1406 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1407 else if (UO || BO)
1408 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1409 else
1410 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1411
1412 Scalar->setName(I.getName() + ".scalar");
1413
1414 // All IR flags are safe to back-propagate. There is no potential for extra
1415 // poison to be created by the scalar instruction.
1416 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1417 ScalarInst->copyIRFlags(&I);
1418
1419 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1420 replaceValue(I, *Insert);
1421 return true;
1422}
1423
1424/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1425/// a vector into vector operations followed by extract. Note: The SLP pass
1426/// may miss this pattern because of implementation problems.
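/// An illustrative sketch (assumed IR, not taken from this source):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %x, i32 1
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 99
///   %r  = and i1 %c0, %c1
/// may become a single vector compare against a constant built from 42 and 99,
/// a shuffle that lines up the two lanes, a vector 'and', and one extract of
/// the combined lane.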
1427bool VectorCombine::foldExtractedCmps(Instruction &I) {
1428 auto *BI = dyn_cast<BinaryOperator>(&I);
1429
1430 // We are looking for a scalar binop of booleans.
1431 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1432 if (!BI || !I.getType()->isIntegerTy(1))
1433 return false;
1434
1435 // The compare predicates should match, and each compare should have a
1436 // constant operand.
1437 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1438 Instruction *I0, *I1;
1439 Constant *C0, *C1;
1440 CmpPredicate P0, P1;
1441 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1442 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1443 return false;
1444
1445 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1446 if (!MatchingPred)
1447 return false;
1448
1449 // The compare operands must be extracts of the same vector with constant
1450 // extract indexes.
1451 Value *X;
1452 uint64_t Index0, Index1;
1453 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1454 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1455 return false;
1456
1457 auto *Ext0 = cast<ExtractElementInst>(I0);
1458 auto *Ext1 = cast<ExtractElementInst>(I1);
1459 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1460 if (!ConvertToShuf)
1461 return false;
1462 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1463 "Unknown ExtractElementInst");
1464
1465 // The original scalar pattern is:
1466 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1467 CmpInst::Predicate Pred = *MatchingPred;
1468 unsigned CmpOpcode =
1469 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1470 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1471 if (!VecTy)
1472 return false;
1473
1474 InstructionCost Ext0Cost =
1475 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1476 InstructionCost Ext1Cost =
1477 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1478 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1479 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1480 CostKind);
1481
1482 InstructionCost OldCost =
1483 Ext0Cost + Ext1Cost + CmpCost * 2 +
1484 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1485
1486 // The proposed vector pattern is:
1487 // vcmp = cmp Pred X, VecC
1488 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1489 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1490 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1493 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1494 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1495 ShufMask[CheapIndex] = ExpensiveIndex;
1497 CmpTy, ShufMask, CostKind);
1498 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1499 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1500 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1501 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1502
1503 // Aggressively form vector ops if the cost is equal because the transform
1504 // may enable further optimization.
1505 // Codegen can reverse this transform (scalarize) if it was not profitable.
1506 if (OldCost < NewCost || !NewCost.isValid())
1507 return false;
1508
1509 // Create a vector constant from the 2 scalar constants.
1510 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1511 PoisonValue::get(VecTy->getElementType()));
1512 CmpC[Index0] = C0;
1513 CmpC[Index1] = C1;
1514 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1515 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1516 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1517 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1518 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1519 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1520 replaceValue(I, *NewExt);
1521 ++NumVecCmpBO;
1522 return true;
1523}
1524
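// Compute costs for the vector reduction intrinsic II and for the code feeding
// it: CostBeforeReduction receives the cost of the reduction operand when it
// matches an extend or extended-multiply pattern that the target may fold into
// the reduction, and CostAfterReduction receives the cost of the reduction
// itself (using the extended-reduction or multiply-accumulate reduction costs
// where those patterns match, and the plain arithmetic reduction cost
// otherwise).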
1527 const TargetTransformInfo &TTI,
1528 InstructionCost &CostBeforeReduction,
1529 InstructionCost &CostAfterReduction) {
1530 Instruction *Op0, *Op1;
1531 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1532 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1533 unsigned ReductionOpc =
1534 getArithmeticReductionInstruction(II.getIntrinsicID());
1535 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1536 bool IsUnsigned = isa<ZExtInst>(RedOp);
1537 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1538
1539 CostBeforeReduction =
1540 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1542 CostAfterReduction =
1543 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1544 ExtType, FastMathFlags(), CostKind);
1545 return;
1546 }
1547 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1548 match(RedOp,
1550 match(Op0, m_ZExtOrSExt(m_Value())) &&
1551 Op0->getOpcode() == Op1->getOpcode() &&
1552 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1553 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1554 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1555 bool IsUnsigned = isa<ZExtInst>(Op0);
1556 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1557 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1558
1559 InstructionCost ExtCost =
1560 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1562 InstructionCost MulCost =
1563 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1564 InstructionCost Ext2Cost =
1565 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1567
1568 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1569 CostAfterReduction = TTI.getMulAccReductionCost(
1570 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1571 return;
1572 }
1573 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1574 std::nullopt, CostKind);
1575}
1576
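// Fold a binary op of two one-use reductions into a single reduction of a
// vector binary op, subject to the cost check below. Illustrative example
// (hand-written IR, hypothetical names):
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %s  = add i32 %r0, %r1
// -->
//   %v  = add <4 x i32> %a, %b
//   %s  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
// A scalar sub of two reduce.add calls is handled the same way, with a vector
// sub feeding the reduce.add.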
1577bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1578 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1579 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1580 if (BinOpOpc == Instruction::Sub)
1581 ReductionIID = Intrinsic::vector_reduce_add;
1582 if (ReductionIID == Intrinsic::not_intrinsic)
1583 return false;
1584
1585 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1586 Intrinsic::ID IID) -> Value * {
1587 auto *II = dyn_cast<IntrinsicInst>(V);
1588 if (!II)
1589 return nullptr;
1590 if (II->getIntrinsicID() == IID && II->hasOneUse())
1591 return II->getArgOperand(0);
1592 return nullptr;
1593 };
1594
1595 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1596 if (!V0)
1597 return false;
1598 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1599 if (!V1)
1600 return false;
1601
1602 auto *VTy = cast<VectorType>(V0->getType());
1603 if (V1->getType() != VTy)
1604 return false;
1605 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1606 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1607 unsigned ReductionOpc =
1608 getArithmeticReductionInstruction(II0.getIntrinsicID());
1609
1610 InstructionCost OldCost = 0;
1611 InstructionCost NewCost = 0;
1612 InstructionCost CostOfRedOperand0 = 0;
1613 InstructionCost CostOfRed0 = 0;
1614 InstructionCost CostOfRedOperand1 = 0;
1615 InstructionCost CostOfRed1 = 0;
1616 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1617 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1618 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1619 NewCost =
1620 CostOfRedOperand0 + CostOfRedOperand1 +
1621 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1622 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1623 if (NewCost >= OldCost || !NewCost.isValid())
1624 return false;
1625
1626 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1627 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1628 << "\n");
1629 Value *VectorBO;
1630 if (BinOpOpc == Instruction::Or)
1631 VectorBO = Builder.CreateOr(V0, V1, "",
1632 cast<PossiblyDisjointInst>(I).isDisjoint());
1633 else
1634 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1635
1636 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1637 replaceValue(I, *Rdx);
1638 return true;
1639}
1640
1641 // Check if a memory location is modified between two instructions in the same BB.
1644 const MemoryLocation &Loc, AAResults &AA) {
1645 unsigned NumScanned = 0;
1646 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1647 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1648 ++NumScanned > MaxInstrsToScan;
1649 });
1650}
1651
1652namespace {
1653/// Helper class to indicate whether a vector index can be safely scalarized and
1654/// if a freeze needs to be inserted.
1655class ScalarizationResult {
1656 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1657
1658 StatusTy Status;
1659 Value *ToFreeze;
1660
1661 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1662 : Status(Status), ToFreeze(ToFreeze) {}
1663
1664public:
1665 ScalarizationResult(const ScalarizationResult &Other) = default;
1666 ~ScalarizationResult() {
1667 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1668 }
1669
1670 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1671 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1672 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1673 return {StatusTy::SafeWithFreeze, ToFreeze};
1674 }
1675
1676 /// Returns true if the index can be scalarized without requiring a freeze.
1677 bool isSafe() const { return Status == StatusTy::Safe; }
1678 /// Returns true if the index cannot be scalarized.
1679 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1680 /// Returns true if the index can be scalarized, but requires inserting a
1681 /// freeze.
1682 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1683
1684 /// Reset the state to Unsafe and clear ToFreeze if set.
1685 void discard() {
1686 ToFreeze = nullptr;
1687 Status = StatusTy::Unsafe;
1688 }
1689
1690 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1691 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1692 assert(isSafeWithFreeze() &&
1693 "should only be used when freezing is required");
1694 assert(is_contained(ToFreeze->users(), &UserI) &&
1695 "UserI must be a user of ToFreeze");
1696 IRBuilder<>::InsertPointGuard Guard(Builder);
1697 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1698 Value *Frozen =
1699 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1700 for (Use &U : make_early_inc_range((UserI.operands())))
1701 if (U.get() == ToFreeze)
1702 U.set(Frozen);
1703
1704 ToFreeze = nullptr;
1705 }
1706};
1707} // namespace
1708
1709/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1710/// Idx. \p Idx must access a valid vector element.
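// For instance (illustrative): with a <4 x i32> vector, a constant index below
// 4 is Safe; an index computed as "and i64 %i, 3" is SafeWithFreeze (freezing
// %i makes the masked value provably in [0, 3]); an index whose range may
// reach 4 or more is Unsafe.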
1711static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1712 Instruction *CtxI,
1713 AssumptionCache &AC,
1714 const DominatorTree &DT) {
1715 // We do checks for both fixed vector types and scalable vector types.
1716 // This is the number of elements of fixed vector types,
1717 // or the minimum number of elements of scalable vector types.
1718 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1719 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1720
1721 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1722 if (C->getValue().ult(NumElements))
1723 return ScalarizationResult::safe();
1724 return ScalarizationResult::unsafe();
1725 }
1726
1727 // Always unsafe if the index type can't handle all in-bounds values.
1728 if (!llvm::isUIntN(IntWidth, NumElements))
1729 return ScalarizationResult::unsafe();
1730
1731 APInt Zero(IntWidth, 0);
1732 APInt MaxElts(IntWidth, NumElements);
1733 ConstantRange ValidIndices(Zero, MaxElts);
1734 ConstantRange IdxRange(IntWidth, true);
1735
1736 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1737 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1738 true, &AC, CtxI, &DT)))
1739 return ScalarizationResult::safe();
1740 return ScalarizationResult::unsafe();
1741 }
1742
1743 // If the index may be poison, check if we can insert a freeze before the
1744 // range of the index is restricted.
1745 Value *IdxBase;
1746 ConstantInt *CI;
1747 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1748 IdxRange = IdxRange.binaryAnd(CI->getValue());
1749 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1750 IdxRange = IdxRange.urem(CI->getValue());
1751 }
1752
1753 if (ValidIndices.contains(IdxRange))
1754 return ScalarizationResult::safeWithFreeze(IdxBase);
1755 return ScalarizationResult::unsafe();
1756}
1757
1758/// The memory operation on a vector of \p ScalarType had alignment of
1759/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1760/// alignment that will be valid for the memory operation on a single scalar
1761/// element of the same type with index \p Idx.
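// For instance (illustrative): a <4 x i32> access with alignment 16 and
// constant index 2 yields commonAlignment(16, 2 * 4) = 8, while an unknown
// index only guarantees commonAlignment(16, 4) = 4.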
1763 Type *ScalarType, Value *Idx,
1764 const DataLayout &DL) {
1765 if (auto *C = dyn_cast<ConstantInt>(Idx))
1766 return commonAlignment(VectorAlignment,
1767 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1768 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1769}
1770
1771// Combine patterns like:
1772// %0 = load <4 x i32>, <4 x i32>* %a
1773// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1774// store <4 x i32> %1, <4 x i32>* %a
1775// to:
1776// %0 = bitcast <4 x i32>* %a to i32*
1777// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1778// store i32 %b, i32* %1
1779bool VectorCombine::foldSingleElementStore(Instruction &I) {
1781 return false;
1782 auto *SI = cast<StoreInst>(&I);
1783 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1784 return false;
1785
1786 // TODO: Combine more complicated patterns (multiple inserts) by referencing
1787 // TargetTransformInfo.
1789 Value *NewElement;
1790 Value *Idx;
1791 if (!match(SI->getValueOperand(),
1792 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1793 m_Value(Idx))))
1794 return false;
1795
1796 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1797 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1798 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1799 // Don't optimize for atomic/volatile loads or stores. Ensure memory is not
1800 // modified in between, the element type's size matches its store size, and the index is in bounds.
1801 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1802 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1803 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1804 return false;
1805
1806 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1807 if (ScalarizableIdx.isUnsafe() ||
1808 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1809 MemoryLocation::get(SI), AA))
1810 return false;
1811
1812 // Ensure we add the load back to the worklist BEFORE its users so they can
1813 // be erased in the correct order.
1814 Worklist.push(Load);
1815
1816 if (ScalarizableIdx.isSafeWithFreeze())
1817 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1818 Value *GEP = Builder.CreateInBoundsGEP(
1819 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1820 {ConstantInt::get(Idx->getType(), 0), Idx});
1821 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1822 NSI->copyMetadata(*SI);
1823 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1824 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1825 *DL);
1826 NSI->setAlignment(ScalarOpAlignment);
1827 replaceValue(I, *NSI);
1829 return true;
1830 }
1831
1832 return false;
1833}
1834
1835/// Try to scalarize vector loads feeding extractelement instructions.
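// Illustrative example (hand-written IR, hypothetical names), assuming the
// scalar loads are cheaper than the vector load on the target:
//   %v  = load <4 x i32>, ptr %p
//   %e0 = extractelement <4 x i32> %v, i64 0
//   %e1 = extractelement <4 x i32> %v, i64 2
// -->
//   %g0 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 0
//   %e0 = load i32, ptr %g0
//   %g1 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
//   %e1 = load i32, ptr %g1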
1836bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1838 return false;
1839
1840 Value *Ptr;
1841 if (!match(&I, m_Load(m_Value(Ptr))))
1842 return false;
1843
1844 auto *LI = cast<LoadInst>(&I);
1845 auto *VecTy = cast<VectorType>(LI->getType());
1846 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1847 return false;
1848
1849 InstructionCost OriginalCost =
1850 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1851 LI->getPointerAddressSpace(), CostKind);
1852 InstructionCost ScalarizedCost = 0;
1853
1854 Instruction *LastCheckedInst = LI;
1855 unsigned NumInstChecked = 0;
1856 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1857 auto FailureGuard = make_scope_exit([&]() {
1858 // If the transform is aborted, discard the ScalarizationResults.
1859 for (auto &Pair : NeedFreeze)
1860 Pair.second.discard();
1861 });
1862
1863 // Check if all users of the load are extracts with no memory modifications
1864 // between the load and the extract. Compute the cost of both the original
1865 // code and the scalarized version.
1866 for (User *U : LI->users()) {
1867 auto *UI = dyn_cast<ExtractElementInst>(U);
1868 if (!UI || UI->getParent() != LI->getParent())
1869 return false;
1870
1871 // If any extract is waiting to be erased, then bail out as this will
1872 // distort the cost calculation and possibly lead to infinite loops.
1873 if (UI->use_empty())
1874 return false;
1875
1876 // Check if any instruction between the load and the extract may modify
1877 // memory.
1878 if (LastCheckedInst->comesBefore(UI)) {
1879 for (Instruction &I :
1880 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1881 // Bail out if we reached the check limit or the instruction may write
1882 // to memory.
1883 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1884 return false;
1885 NumInstChecked++;
1886 }
1887 LastCheckedInst = UI;
1888 }
1889
1890 auto ScalarIdx =
1891 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1892 if (ScalarIdx.isUnsafe())
1893 return false;
1894 if (ScalarIdx.isSafeWithFreeze()) {
1895 NeedFreeze.try_emplace(UI, ScalarIdx);
1896 ScalarIdx.discard();
1897 }
1898
1899 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1900 OriginalCost +=
1901 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1902 Index ? Index->getZExtValue() : -1);
1903 ScalarizedCost +=
1904 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1905 Align(1), LI->getPointerAddressSpace(), CostKind);
1906 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1907 nullptr, nullptr, CostKind);
1908 }
1909
1910 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
1911 << "\n LoadExtractCost: " << OriginalCost
1912 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1913
1914 if (ScalarizedCost >= OriginalCost)
1915 return false;
1916
1917 // Ensure we add the load back to the worklist BEFORE its users so they can
1918 // be erased in the correct order.
1919 Worklist.push(LI);
1920
1921 Type *ElemType = VecTy->getElementType();
1922
1923 // Replace extracts with narrow scalar loads.
1924 for (User *U : LI->users()) {
1925 auto *EI = cast<ExtractElementInst>(U);
1926 Value *Idx = EI->getIndexOperand();
1927
1928 // Insert 'freeze' for poison indexes.
1929 auto It = NeedFreeze.find(EI);
1930 if (It != NeedFreeze.end())
1931 It->second.freeze(Builder, *cast<Instruction>(Idx));
1932
1933 Builder.SetInsertPoint(EI);
1934 Value *GEP =
1935 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1936 auto *NewLoad = cast<LoadInst>(
1937 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1938
1939 Align ScalarOpAlignment =
1940 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1941 NewLoad->setAlignment(ScalarOpAlignment);
1942
1943 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1944 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1945 AAMDNodes OldAAMD = LI->getAAMetadata();
1946 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1947 }
1948
1949 replaceValue(*EI, *NewLoad, false);
1950 }
1951
1952 FailureGuard.release();
1953 return true;
1954}
1955
1956bool VectorCombine::scalarizeExtExtract(Instruction &I) {
1958 return false;
1959 auto *Ext = dyn_cast<ZExtInst>(&I);
1960 if (!Ext)
1961 return false;
1962
1963 // Try to convert a vector zext feeding only extractelement users into a set
1964 // of scalar (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
1965 // operations, if profitable.
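// Illustrative example (hand-written IR, hypothetical names):
//   %z  = zext <4 x i8> %src to <4 x i32>
//   %e1 = extractelement <4 x i32> %z, i64 1
// -->
//   %s  = bitcast <4 x i8> %src to i32   ; %src is frozen first if it may be poison
//   %sh = lshr i32 %s, 8
//   %e1 = and i32 %sh, 255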
1966 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
1967 if (!SrcTy)
1968 return false;
1969 auto *DstTy = cast<FixedVectorType>(Ext->getType());
1970
1971 Type *ScalarDstTy = DstTy->getElementType();
1972 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
1973 return false;
1974
1975 InstructionCost VectorCost =
1976 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
1978 unsigned ExtCnt = 0;
1979 bool ExtLane0 = false;
1980 for (User *U : Ext->users()) {
1981 uint64_t Idx;
1982 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
1983 return false;
1984 if (cast<Instruction>(U)->use_empty())
1985 continue;
1986 ExtCnt += 1;
1987 ExtLane0 |= !Idx;
1988 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
1989 CostKind, Idx, U);
1990 }
1991
1992 InstructionCost ScalarCost =
1993 ExtCnt * TTI.getArithmeticInstrCost(
1994 Instruction::And, ScalarDstTy, CostKind,
1997 (ExtCnt - ExtLane0) *
1999 Instruction::LShr, ScalarDstTy, CostKind,
2002 if (ScalarCost > VectorCost)
2003 return false;
2004
2005 Value *ScalarV = Ext->getOperand(0);
2006 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2007 &DT))
2008 ScalarV = Builder.CreateFreeze(ScalarV);
2009 ScalarV = Builder.CreateBitCast(
2010 ScalarV,
2011 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2012 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2013 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2014 for (User *U : Ext->users()) {
2015 auto *Extract = cast<ExtractElementInst>(U);
2016 uint64_t Idx =
2017 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2018 Value *LShr = Builder.CreateLShr(ScalarV, Idx * SrcEltSizeInBits);
2019 Value *And = Builder.CreateAnd(LShr, EltBitMask);
2020 U->replaceAllUsesWith(And);
2021 }
2022 return true;
2023}
2024
2025/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2026/// to "(bitcast (concat X, Y))"
2027/// where X/Y are bitcasted from i1 mask vectors.
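// Illustrative example (hand-written IR, hypothetical names):
//   %x  = bitcast <8 x i1> %m0 to i8
//   %y  = bitcast <8 x i1> %m1 to i8
//   %zx = zext i8 %x to i16
//   %zy = zext i8 %y to i16
//   %sh = shl i16 %zy, 8
//   %r  = or disjoint i16 %zx, %sh
// -->
//   %c = shufflevector <8 x i1> %m0, <8 x i1> %m1,
//                      <16 x i32> <i32 0, i32 1, ..., i32 15>
//   %r = bitcast <16 x i1> %c to i16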
2028bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2029 Type *Ty = I.getType();
2030 if (!Ty->isIntegerTy())
2031 return false;
2032
2033 // TODO: Add big endian test coverage
2034 if (DL->isBigEndian())
2035 return false;
2036
2037 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2038 Instruction *X, *Y;
2040 return false;
2041
2042 // Allow both sources to contain shl, to handle more generic pattern:
2043 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2044 Value *SrcX;
2045 uint64_t ShAmtX = 0;
2046 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2047 !match(X, m_OneUse(
2049 m_ConstantInt(ShAmtX)))))
2050 return false;
2051
2052 Value *SrcY;
2053 uint64_t ShAmtY = 0;
2054 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2055 !match(Y, m_OneUse(
2057 m_ConstantInt(ShAmtY)))))
2058 return false;
2059
2060 // Canonicalize larger shift to the RHS.
2061 if (ShAmtX > ShAmtY) {
2062 std::swap(X, Y);
2063 std::swap(SrcX, SrcY);
2064 std::swap(ShAmtX, ShAmtY);
2065 }
2066
2067 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2068 // difference is the mask width so they can be easily concatenated together.
2069 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2070 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2071 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2072 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2073 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2074 !MaskTy->getElementType()->isIntegerTy(1) ||
2075 MaskTy->getNumElements() != ShAmtDiff ||
2076 MaskTy->getNumElements() > (BitWidth / 2))
2077 return false;
2078
2079 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2080 auto *ConcatIntTy =
2081 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2082 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2083
2084 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2085 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2086
2087 // TODO: Is it worth supporting multi-use cases?
2088 InstructionCost OldCost = 0;
2089 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2090 OldCost +=
2091 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2092 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2094 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2096
2097 InstructionCost NewCost = 0;
2099 MaskTy, ConcatMask, CostKind);
2100 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2102 if (Ty != ConcatIntTy)
2103 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2105 if (ShAmtX > 0)
2106 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2107
2108 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2109 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2110 << "\n");
2111
2112 if (NewCost > OldCost)
2113 return false;
2114
2115 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2116 // any residual zero-extension or shifting.
2117 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2118 Worklist.pushValue(Concat);
2119
2120 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2121
2122 if (Ty != ConcatIntTy) {
2123 Worklist.pushValue(Result);
2124 Result = Builder.CreateZExt(Result, Ty);
2125 }
2126
2127 if (ShAmtX > 0) {
2128 Worklist.pushValue(Result);
2129 Result = Builder.CreateShl(Result, ShAmtX);
2130 }
2131
2132 replaceValue(I, *Result);
2133 return true;
2134}
2135
2136/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2137/// --> "binop (shuffle), (shuffle)".
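// Illustrative example (hand-written IR, hypothetical names):
//   %s  = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %t  = shufflevector <4 x i32> %c, <4 x i32> %d, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %bo = add <4 x i32> %s, %t
//   %r  = shufflevector <4 x i32> %bo, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// -->
//   %ns = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 4, i32 1, i32 0>
//   %nt = shufflevector <4 x i32> %c, <4 x i32> %d, <4 x i32> <i32 5, i32 4, i32 1, i32 0>
//   %r  = add <4 x i32> %ns, %nt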
2138bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2139 BinaryOperator *BinOp;
2140 ArrayRef<int> OuterMask;
2141 if (!match(&I,
2142 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2143 return false;
2144
2145 // Don't introduce poison into div/rem.
2146 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2147 return false;
2148
2149 Value *Op00, *Op01, *Op10, *Op11;
2150 ArrayRef<int> Mask0, Mask1;
2151 bool Match0 =
2152 match(BinOp->getOperand(0),
2153 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2154 bool Match1 =
2155 match(BinOp->getOperand(1),
2156 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2157 if (!Match0 && !Match1)
2158 return false;
2159
2160 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2161 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2162 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2163 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2164
2165 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2166 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2167 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2168 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2169 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2170 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2171 return false;
2172
2173 unsigned NumSrcElts = BinOpTy->getNumElements();
2174
2175 // Don't accept shuffles that reference the second operand in
2176 // div/rem or if it's an undef arg.
2177 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2178 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2179 return false;
2180
2181 // Merge outer / inner (or identity if no match) shuffles.
2182 SmallVector<int> NewMask0, NewMask1;
2183 for (int M : OuterMask) {
2184 if (M < 0 || M >= (int)NumSrcElts) {
2185 NewMask0.push_back(PoisonMaskElem);
2186 NewMask1.push_back(PoisonMaskElem);
2187 } else {
2188 NewMask0.push_back(Match0 ? Mask0[M] : M);
2189 NewMask1.push_back(Match1 ? Mask1[M] : M);
2190 }
2191 }
2192
2193 unsigned NumOpElts = Op0Ty->getNumElements();
2194 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2195 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2196 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2197 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2198 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2199 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2200
2201 // Try to merge shuffles across the binop if the new shuffles are not costly.
2202 InstructionCost OldCost =
2203 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2205 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2206 if (Match0)
2207 OldCost += TTI.getShuffleCost(
2208 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2209 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2210 if (Match1)
2211 OldCost += TTI.getShuffleCost(
2212 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2213 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2214
2215 InstructionCost NewCost =
2216 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2217
2218 if (!IsIdentity0)
2219 NewCost +=
2221 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2222 if (!IsIdentity1)
2223 NewCost +=
2225 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2226
2227 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2228 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2229 << "\n");
2230
2231 // If costs are equal, still fold as we reduce instruction count.
2232 if (NewCost > OldCost)
2233 return false;
2234
2235 Value *LHS =
2236 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2237 Value *RHS =
2238 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2239 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2240
2241 // Intersect flags from the old binops.
2242 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2243 NewInst->copyIRFlags(BinOp);
2244
2245 Worklist.pushValue(LHS);
2246 Worklist.pushValue(RHS);
2247 replaceValue(I, *NewBO);
2248 return true;
2249}
2250
2251/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2252/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
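// Illustrative example (hand-written IR, hypothetical names):
//   %b0 = add <4 x i32> %x, %y
//   %b1 = add <4 x i32> %z, %w
//   %r  = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// -->
//   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
//   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
//   %r  = add <4 x i32> %s0, %s1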
2253bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2254 ArrayRef<int> OldMask;
2255 Instruction *LHS, *RHS;
2257 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2258 return false;
2259
2260 // TODO: Add support for addlike etc.
2261 if (LHS->getOpcode() != RHS->getOpcode())
2262 return false;
2263
2264 Value *X, *Y, *Z, *W;
2265 bool IsCommutative = false;
2266 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2267 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2268 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2269 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2270 auto *BO = cast<BinaryOperator>(LHS);
2271 // Don't introduce poison into div/rem.
2272 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2273 return false;
2274 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2275 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2276 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2277 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2278 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2279 } else
2280 return false;
2281
2282 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2283 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2284 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2285 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2286 return false;
2287
2288 unsigned NumSrcElts = BinOpTy->getNumElements();
2289
2290 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2291 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2292 std::swap(X, Y);
2293
2294 auto ConvertToUnary = [NumSrcElts](int &M) {
2295 if (M >= (int)NumSrcElts)
2296 M -= NumSrcElts;
2297 };
2298
2299 SmallVector<int> NewMask0(OldMask);
2301 if (X == Z) {
2302 llvm::for_each(NewMask0, ConvertToUnary);
2304 Z = PoisonValue::get(BinOpTy);
2305 }
2306
2307 SmallVector<int> NewMask1(OldMask);
2309 if (Y == W) {
2310 llvm::for_each(NewMask1, ConvertToUnary);
2312 W = PoisonValue::get(BinOpTy);
2313 }
2314
2315 // Try to replace a binop with a shuffle if the shuffle is not costly.
2316 InstructionCost OldCost =
2320 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2321 &I);
2322
2323 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2324 // where one-use shuffles have gotten split across the binop/cmp. These
2325 // often allow a major reduction in total cost that wouldn't happen as
2326 // individual folds.
2327 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2328 TTI::TargetCostKind CostKind) -> bool {
2329 Value *InnerOp;
2330 ArrayRef<int> InnerMask;
2331 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2332 m_Mask(InnerMask)))) &&
2333 InnerOp->getType() == Op->getType() &&
2334 all_of(InnerMask,
2335 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2336 for (int &M : Mask)
2337 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2338 M = InnerMask[M - Offset];
2339 M = 0 <= M ? M + Offset : M;
2340 }
2342 Op = InnerOp;
2343 return true;
2344 }
2345 return false;
2346 };
2347 bool ReducedInstCount = false;
2348 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2349 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2350 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2351 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2352
2353 auto *ShuffleCmpTy =
2354 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2355 InstructionCost NewCost =
2356 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2357 nullptr, {X, Z}) +
2358 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2359 nullptr, {Y, W});
2360
2361 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2362 NewCost +=
2363 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2364 } else {
2365 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2366 ShuffleDstTy, PredLHS, CostKind);
2367 }
2368
2369 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2370 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2371 << "\n");
2372
2373 // If either shuffle will constant fold away, then fold for the same cost as
2374 // we will reduce the instruction count.
2375 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2376 (isa<Constant>(Y) && isa<Constant>(W));
2377 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2378 return false;
2379
2380 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2381 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2382 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2383 ? Builder.CreateBinOp(
2384 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2385 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2386
2387 // Intersect flags from the old binops.
2388 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2389 NewInst->copyIRFlags(LHS);
2390 NewInst->andIRFlags(RHS);
2391 }
2392
2393 Worklist.pushValue(Shuf0);
2394 Worklist.pushValue(Shuf1);
2395 replaceValue(I, *NewBO);
2396 return true;
2397}
2398
2399/// Try to convert,
2400/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2401/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2402bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2403 ArrayRef<int> Mask;
2404 Value *C1, *T1, *F1, *C2, *T2, *F2;
2405 if (!match(&I, m_Shuffle(
2407 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2408 m_Mask(Mask))))
2409 return false;
2410
2411 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2412 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2413 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2414 return false;
2415
2416 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2417 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2418 // SelectInsts must have the same FMF.
2419 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2420 ((SI0FOp != nullptr) &&
2421 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2422 return false;
2423
2424 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2425 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2427 auto SelOp = Instruction::Select;
2429 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2430 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2432 OldCost +=
2433 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2434 {I.getOperand(0), I.getOperand(1)}, &I);
2435
2437 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2438 Mask, CostKind, 0, nullptr, {C1, C2});
2439 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2440 nullptr, {T1, T2});
2441 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2442 nullptr, {F1, F2});
2443 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2444 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2445 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2447
2448 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2449 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2450 << "\n");
2451 if (NewCost > OldCost)
2452 return false;
2453
2454 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2455 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2456 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2457 Value *NewSel;
2458 // We presuppose that the SelectInsts have the same FMF.
2459 if (SI0FOp)
2460 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2461 SI0FOp->getFastMathFlags());
2462 else
2463 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2464
2465 Worklist.pushValue(ShuffleCmp);
2466 Worklist.pushValue(ShuffleTrue);
2467 Worklist.pushValue(ShuffleFalse);
2468 replaceValue(I, *NewSel);
2469 return true;
2470}
2471
2472 /// Try to convert "shuffle (castop), (castop)", where both casts share the same
2473 /// source type, into "castop (shuffle)".
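// Illustrative example (hand-written IR, hypothetical names):
//   %c0 = sext <4 x i16> %a to <4 x i32>
//   %c1 = sext <4 x i16> %b to <4 x i32>
//   %r  = shufflevector <4 x i32> %c0, <4 x i32> %c1,
//                       <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
// -->
//   %s = shufflevector <4 x i16> %a, <4 x i16> %b,
//                      <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
//   %r = sext <8 x i16> %s to <8 x i32>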
2474bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2475 Value *V0, *V1;
2476 ArrayRef<int> OldMask;
2477 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2478 return false;
2479
2480 auto *C0 = dyn_cast<CastInst>(V0);
2481 auto *C1 = dyn_cast<CastInst>(V1);
2482 if (!C0 || !C1)
2483 return false;
2484
2485 Instruction::CastOps Opcode = C0->getOpcode();
2486 if (C0->getSrcTy() != C1->getSrcTy())
2487 return false;
2488
2489 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2490 if (Opcode != C1->getOpcode()) {
2491 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2492 Opcode = Instruction::SExt;
2493 else
2494 return false;
2495 }
2496
2497 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2498 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2499 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2500 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2501 return false;
2502
2503 unsigned NumSrcElts = CastSrcTy->getNumElements();
2504 unsigned NumDstElts = CastDstTy->getNumElements();
2505 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2506 "Only bitcasts expected to alter src/dst element counts");
2507
2508 // Bail on bitcasts where neither element count evenly divides the other,
2509 // e.g. <32 x i40> -> <40 x i32>
2510 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2511 (NumDstElts % NumSrcElts) != 0)
2512 return false;
2513
2514 SmallVector<int, 16> NewMask;
2515 if (NumSrcElts >= NumDstElts) {
2516 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2517 // always be expanded to the equivalent form choosing narrower elements.
2518 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2519 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2520 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2521 } else {
2522 // The bitcast is from narrow elements to wide elements. The shuffle mask
2523 // must choose consecutive elements to allow casting first.
2524 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2525 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2526 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2527 return false;
2528 }
2529
2530 auto *NewShuffleDstTy =
2531 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2532
2533 // Try to replace a castop with a shuffle if the shuffle is not costly.
2534 InstructionCost CostC0 =
2535 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2537 InstructionCost CostC1 =
2538 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2540 InstructionCost OldCost = CostC0 + CostC1;
2541 OldCost +=
2543 CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
2544
2545 InstructionCost NewCost =
2547 CastSrcTy, NewMask, CostKind);
2548 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2550 if (!C0->hasOneUse())
2551 NewCost += CostC0;
2552 if (!C1->hasOneUse())
2553 NewCost += CostC1;
2554
2555 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2556 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2557 << "\n");
2558 if (NewCost > OldCost)
2559 return false;
2560
2561 Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
2562 C1->getOperand(0), NewMask);
2563 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2564
2565 // Intersect flags from the old casts.
2566 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2567 NewInst->copyIRFlags(C0);
2568 NewInst->andIRFlags(C1);
2569 }
2570
2571 Worklist.pushValue(Shuf);
2572 replaceValue(I, *Cast);
2573 return true;
2574}
2575
2576/// Try to convert any of:
2577/// "shuffle (shuffle x, y), (shuffle y, x)"
2578/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2579/// "shuffle (shuffle x, undef), y"
2580/// "shuffle x, (shuffle y, undef)"
2581/// into "shuffle x, y".
2582bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2583 ArrayRef<int> OuterMask;
2584 Value *OuterV0, *OuterV1;
2585 if (!match(&I,
2586 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2587 return false;
2588
2589 ArrayRef<int> InnerMask0, InnerMask1;
2590 Value *X0, *X1, *Y0, *Y1;
2591 bool Match0 =
2592 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2593 bool Match1 =
2594 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2595 if (!Match0 && !Match1)
2596 return false;
2597
2598 // If the outer shuffle is a permute, then create a fake inner all-poison
2599 // shuffle. This is easier than accounting for length-changing shuffles below.
2600 SmallVector<int, 16> PoisonMask1;
2601 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2602 X1 = X0;
2603 Y1 = Y0;
2604 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2605 InnerMask1 = PoisonMask1;
2606 Match1 = true; // fake match
2607 }
2608
2609 X0 = Match0 ? X0 : OuterV0;
2610 Y0 = Match0 ? Y0 : OuterV0;
2611 X1 = Match1 ? X1 : OuterV1;
2612 Y1 = Match1 ? Y1 : OuterV1;
2613 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2614 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2615 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2616 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2617 X0->getType() != X1->getType())
2618 return false;
2619
2620 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2621 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2622
2623 // Attempt to merge shuffles, matching up to 2 source operands.
2624 // Replace an index into a poison arg with PoisonMaskElem.
2625 // Bail if either inner mask references an undef arg.
2626 SmallVector<int, 16> NewMask(OuterMask);
2627 Value *NewX = nullptr, *NewY = nullptr;
2628 for (int &M : NewMask) {
2629 Value *Src = nullptr;
2630 if (0 <= M && M < (int)NumImmElts) {
2631 Src = OuterV0;
2632 if (Match0) {
2633 M = InnerMask0[M];
2634 Src = M >= (int)NumSrcElts ? Y0 : X0;
2635 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2636 }
2637 } else if (M >= (int)NumImmElts) {
2638 Src = OuterV1;
2639 M -= NumImmElts;
2640 if (Match1) {
2641 M = InnerMask1[M];
2642 Src = M >= (int)NumSrcElts ? Y1 : X1;
2643 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2644 }
2645 }
2646 if (Src && M != PoisonMaskElem) {
2647 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2648 if (isa<UndefValue>(Src)) {
2649 // We've referenced an undef element - if it's poison, update the shuffle
2650 // mask, else bail.
2651 if (!isa<PoisonValue>(Src))
2652 return false;
2653 M = PoisonMaskElem;
2654 continue;
2655 }
2656 if (!NewX || NewX == Src) {
2657 NewX = Src;
2658 continue;
2659 }
2660 if (!NewY || NewY == Src) {
2661 M += NumSrcElts;
2662 NewY = Src;
2663 continue;
2664 }
2665 return false;
2666 }
2667 }
2668
2669 if (!NewX) {
2670 // All result lanes are poison: fold the whole shuffle to a poison vector.
     replaceValue(I, *PoisonValue::get(ShuffleDstTy));
     return true;
   }
2671 if (!NewY)
2672 NewY = PoisonValue::get(ShuffleSrcTy);
2673
2674 // Have we folded to an Identity shuffle?
2675 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2676 replaceValue(I, *NewX);
2677 return true;
2678 }
2679
2680 // Try to merge the shuffles if the new shuffle is not costly.
2681 InstructionCost InnerCost0 = 0;
2682 if (Match0)
2683 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2684
2685 InstructionCost InnerCost1 = 0;
2686 if (Match1)
2687 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2688
2690
2691 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2692
2693 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2697 InstructionCost NewCost =
2698 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2699 nullptr, {NewX, NewY});
2700 if (!OuterV0->hasOneUse())
2701 NewCost += InnerCost0;
2702 if (!OuterV1->hasOneUse())
2703 NewCost += InnerCost1;
2704
2705 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2706 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2707 << "\n");
2708 if (NewCost > OldCost)
2709 return false;
2710
2711 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2712 replaceValue(I, *Shuf);
2713 return true;
2714}
2715
2716/// Try to convert
2717/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
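// Illustrative example (hand-written IR, hypothetical names):
//   %i0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
//   %i1 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %c, <4 x i32> %d)
//   %r  = shufflevector <4 x i32> %i0, <4 x i32> %i1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
// -->
//   %s0 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %s1 = shufflevector <4 x i32> %b, <4 x i32> %d, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %r  = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %s0, <4 x i32> %s1)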
2718bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2719 Value *V0, *V1;
2720 ArrayRef<int> OldMask;
2721 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2722 m_Mask(OldMask))))
2723 return false;
2724
2725 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2726 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2727 if (!II0 || !II1)
2728 return false;
2729
2730 Intrinsic::ID IID = II0->getIntrinsicID();
2731 if (IID != II1->getIntrinsicID())
2732 return false;
2733
2734 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2735 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2736 if (!ShuffleDstTy || !II0Ty)
2737 return false;
2738
2739 if (!isTriviallyVectorizable(IID))
2740 return false;
2741
2742 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2744 II0->getArgOperand(I) != II1->getArgOperand(I))
2745 return false;
2746
2747 InstructionCost OldCost =
2748 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2749 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2751 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2752
2753 SmallVector<Type *> NewArgsTy;
2754 InstructionCost NewCost = 0;
2755 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2757 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2758 } else {
2759 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2760 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2761 ShuffleDstTy->getNumElements());
2762 NewArgsTy.push_back(ArgTy);
2764 ArgTy, VecTy, OldMask, CostKind);
2765 }
2766 }
2767 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2768 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2769
2770 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2771 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2772 << "\n");
2773
2774 if (NewCost > OldCost)
2775 return false;
2776
2777 SmallVector<Value *> NewArgs;
2778 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2780 NewArgs.push_back(II0->getArgOperand(I));
2781 } else {
2782 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2783 II1->getArgOperand(I), OldMask);
2784 NewArgs.push_back(Shuf);
2785 Worklist.pushValue(Shuf);
2786 }
2787 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2788
2789 // Intersect flags from the old intrinsics.
2790 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2791 NewInst->copyIRFlags(II0);
2792 NewInst->andIRFlags(II1);
2793 }
2794
2795 replaceValue(I, *NewIntrinsic);
2796 return true;
2797}
2798
2799using InstLane = std::pair<Use *, int>;
2800
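// Walk up through any chain of shufflevector operands starting at use U,
// returning the use and lane that ultimately supply the requested lane, or
// {nullptr, PoisonMaskElem} if the lane traces to a poison mask element.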
2801static InstLane lookThroughShuffles(Use *U, int Lane) {
2802 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2803 unsigned NumElts =
2804 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2805 int M = SV->getMaskValue(Lane);
2806 if (M < 0)
2807 return {nullptr, PoisonMaskElem};
2808 if (static_cast<unsigned>(M) < NumElts) {
2809 U = &SV->getOperandUse(0);
2810 Lane = M;
2811 } else {
2812 U = &SV->getOperandUse(1);
2813 Lane = M - NumElts;
2814 }
2815 }
2816 return InstLane{U, Lane};
2817}
2818
2822 for (InstLane IL : Item) {
2823 auto [U, Lane] = IL;
2824 InstLane OpLane =
2825 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2826 Lane)
2827 : InstLane{nullptr, PoisonMaskElem};
2828 NItem.emplace_back(OpLane);
2829 }
2830 return NItem;
2831}
2832
2833/// Detect concat of multiple values into a vector
2835 const TargetTransformInfo &TTI) {
2836 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
2837 unsigned NumElts = Ty->getNumElements();
2838 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
2839 return false;
2840
2841 // Check that the concat is free, usually meaning that the type will be split
2842 // during legalization.
2843 SmallVector<int, 16> ConcatMask(NumElts * 2);
2844 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2845 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
2846 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
2847 Ty, ConcatMask, CostKind) != 0)
2848 return false;
2849
2850 unsigned NumSlices = Item.size() / NumElts;
2851 // Currently we generate a tree of shuffles for the concats, which limits us
2852 // to a power of 2.
2853 if (!isPowerOf2_32(NumSlices))
2854 return false;
2855 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
2856 Use *SliceV = Item[Slice * NumElts].first;
2857 if (!SliceV || SliceV->get()->getType() != Ty)
2858 return false;
2859 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
2860 auto [V, Lane] = Item[Slice * NumElts + Elt];
2861 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
2862 return false;
2863 }
2864 }
2865 return true;
2866}
2867
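// Rebuild the instruction tree discovered by foldShuffleToIdentity directly on
// the leaf values: identity leaves are used as-is, splat leaves become a single
// splat shuffle, concat leaves are rebuilt with a tree of concatenating
// shuffles, and interior nodes are re-created as full-width vector ops on the
// recursively generated operands.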
2869 const SmallPtrSet<Use *, 4> &IdentityLeafs,
2870 const SmallPtrSet<Use *, 4> &SplatLeafs,
2871 const SmallPtrSet<Use *, 4> &ConcatLeafs,
2872 IRBuilderBase &Builder,
2873 const TargetTransformInfo *TTI) {
2874 auto [FrontU, FrontLane] = Item.front();
2875
2876 if (IdentityLeafs.contains(FrontU)) {
2877 return FrontU->get();
2878 }
2879 if (SplatLeafs.contains(FrontU)) {
2880 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
2881 return Builder.CreateShuffleVector(FrontU->get(), Mask);
2882 }
2883 if (ConcatLeafs.contains(FrontU)) {
2884 unsigned NumElts =
2885 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
2886 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
2887 for (unsigned S = 0; S < Values.size(); ++S)
2888 Values[S] = Item[S * NumElts].first->get();
2889
2890 while (Values.size() > 1) {
2891 NumElts *= 2;
2892 SmallVector<int, 16> Mask(NumElts, 0);
2893 std::iota(Mask.begin(), Mask.end(), 0);
2894 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
2895 for (unsigned S = 0; S < NewValues.size(); ++S)
2896 NewValues[S] =
2897 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
2898 Values = NewValues;
2899 }
2900 return Values[0];
2901 }
2902
2903 auto *I = cast<Instruction>(FrontU->get());
2904 auto *II = dyn_cast<IntrinsicInst>(I);
2905 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
2907 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
2908 if (II &&
2909 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
2910 Ops[Idx] = II->getOperand(Idx);
2911 continue;
2912 }
2914 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
2915 Builder, TTI);
2916 }
2917
2918 SmallVector<Value *, 8> ValueList;
2919 for (const auto &Lane : Item)
2920 if (Lane.first)
2921 ValueList.push_back(Lane.first->get());
2922
2923 Type *DstTy =
2924 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
2925 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
2926 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
2927 Ops[0], Ops[1]);
2928 propagateIRFlags(Value, ValueList);
2929 return Value;
2930 }
2931 if (auto *CI = dyn_cast<CmpInst>(I)) {
2932 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
2933 propagateIRFlags(Value, ValueList);
2934 return Value;
2935 }
2936 if (auto *SI = dyn_cast<SelectInst>(I)) {
2937 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
2938 propagateIRFlags(Value, ValueList);
2939 return Value;
2940 }
2941 if (auto *CI = dyn_cast<CastInst>(I)) {
2942 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
2943 propagateIRFlags(Value, ValueList);
2944 return Value;
2945 }
2946 if (II) {
2947 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
2948 propagateIRFlags(Value, ValueList);
2949 return Value;
2950 }
2951 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
2952 auto *Value =
2953 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
2954 propagateIRFlags(Value, ValueList);
2955 return Value;
2956}
2957
2958// Starting from a shuffle, look up through operands tracking the shuffled index
2959// of each lane. If we can simplify away the shuffles to identities then
2960// do so.
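// Illustrative example (hand-written IR, hypothetical names):
//   %r0 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %r1 = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %s  = add <4 x i32> %r0, %r1
//   %r  = shufflevector <4 x i32> %s, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// -->
//   %r  = add <4 x i32> %a, %b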
2961bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
2962 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
2963 if (!Ty || I.use_empty())
2964 return false;
2965
2966 SmallVector<InstLane> Start(Ty->getNumElements());
2967 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
2968 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
2969
2971 Worklist.push_back(Start);
2972 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
2973 unsigned NumVisited = 0;
2974
2975 while (!Worklist.empty()) {
2976 if (++NumVisited > MaxInstrsToScan)
2977 return false;
2978
2979 SmallVector<InstLane> Item = Worklist.pop_back_val();
2980 auto [FrontU, FrontLane] = Item.front();
2981
2982 // If we found an undef first lane then bail out to keep things simple.
2983 if (!FrontU)
2984 return false;
2985
2986 // Helper to peek through bitcasts to the same value.
2987 auto IsEquiv = [&](Value *X, Value *Y) {
2988 return X->getType() == Y->getType() &&
2990 };
2991
2992 // Look for an identity value.
2993 if (FrontLane == 0 &&
2994 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
2995 Ty->getNumElements() &&
2996 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
2997 Value *FrontV = Item.front().first->get();
2998 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
2999 E.value().second == (int)E.index());
3000 })) {
3001 IdentityLeafs.insert(FrontU);
3002 continue;
3003 }
3004 // Look for constants, for the moment only supporting constant splats.
3005 if (auto *C = dyn_cast<Constant>(FrontU);
3006 C && C->getSplatValue() &&
3007 all_of(drop_begin(Item), [Item](InstLane &IL) {
3008 Value *FrontV = Item.front().first->get();
3009 Use *U = IL.first;
3010 return !U || (isa<Constant>(U->get()) &&
3011 cast<Constant>(U->get())->getSplatValue() ==
3012 cast<Constant>(FrontV)->getSplatValue());
3013 })) {
3014 SplatLeafs.insert(FrontU);
3015 continue;
3016 }
3017 // Look for a splat value.
3018 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3019 auto [FrontU, FrontLane] = Item.front();
3020 auto [U, Lane] = IL;
3021 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3022 })) {
3023 SplatLeafs.insert(FrontU);
3024 continue;
3025 }
3026
3027 // We need each element to be the same type of value, and check that each
3028 // element has a single use.
3029 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3030 Value *FrontV = Item.front().first->get();
3031 if (!IL.first)
3032 return true;
3033 Value *V = IL.first->get();
3034 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3035 return false;
3036 if (V->getValueID() != FrontV->getValueID())
3037 return false;
3038 if (auto *CI = dyn_cast<CmpInst>(V))
3039 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3040 return false;
3041 if (auto *CI = dyn_cast<CastInst>(V))
3042 if (CI->getSrcTy()->getScalarType() !=
3043 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3044 return false;
3045 if (auto *SI = dyn_cast<SelectInst>(V))
3046 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3047 SI->getOperand(0)->getType() !=
3048 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3049 return false;
3050 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3051 return false;
3052 auto *II = dyn_cast<IntrinsicInst>(V);
3053 return !II || (isa<IntrinsicInst>(FrontV) &&
3054 II->getIntrinsicID() ==
3055 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3056 !II->hasOperandBundles());
3057 };
3058 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3059 // Check the operator is one that we support.
3060 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3061 // We exclude div/rem in case they hit UB from poison lanes.
3062 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3063 BO && BO->isIntDivRem())
3064 return false;
3065        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3066        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3067        continue;
3068 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3069 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3070        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3071        continue;
3072 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3073 // TODO: Handle vector widening/narrowing bitcasts.
3074 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3075 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3076 if (DstTy && SrcTy &&
3077 SrcTy->getNumElements() == DstTy->getNumElements()) {
3078          Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3079          continue;
3080 }
3081 } else if (isa<SelectInst>(FrontU)) {
3082        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3083        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3084        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3085        continue;
3086 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3087 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3088 !II->hasOperandBundles()) {
3089 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3090 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3091 &TTI)) {
3092 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3093 Value *FrontV = Item.front().first->get();
3094 Use *U = IL.first;
3095 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3096 cast<Instruction>(FrontV)->getOperand(Op));
3097 }))
3098 return false;
3099 continue;
3100 }
3101          Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3102        }
3103 continue;
3104 }
3105 }
3106
3107 if (isFreeConcat(Item, CostKind, TTI)) {
3108 ConcatLeafs.insert(FrontU);
3109 continue;
3110 }
3111
3112 return false;
3113 }
3114
3115 if (NumVisited <= 1)
3116 return false;
3117
3118 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3119
3120 // If we got this far, we know the shuffles are superfluous and can be
3121 // removed. Scan through again and generate the new tree of instructions.
3122 Builder.SetInsertPoint(&I);
3123 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3124 ConcatLeafs, Builder, &TTI);
3125 replaceValue(I, *V);
3126 return true;
3127}
3128
3129/// Given a commutative reduction, the order of the input lanes does not alter
3130/// the results. We can use this to remove certain shuffles feeding the
3131/// reduction, removing the need to shuffle at all.
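/// As an illustrative sketch (not from the original source): because
/// @llvm.vector.reduce.add sums all lanes regardless of their order,
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// computes the same value as reducing %v directly, so the shuffle can be
/// simplified (or removed) when the cost model agrees.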
3132bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3133 auto *II = dyn_cast<IntrinsicInst>(&I);
3134 if (!II)
3135 return false;
3136 switch (II->getIntrinsicID()) {
3137 case Intrinsic::vector_reduce_add:
3138 case Intrinsic::vector_reduce_mul:
3139 case Intrinsic::vector_reduce_and:
3140 case Intrinsic::vector_reduce_or:
3141 case Intrinsic::vector_reduce_xor:
3142 case Intrinsic::vector_reduce_smin:
3143 case Intrinsic::vector_reduce_smax:
3144 case Intrinsic::vector_reduce_umin:
3145 case Intrinsic::vector_reduce_umax:
3146 break;
3147 default:
3148 return false;
3149 }
3150
3151 // Find all the inputs when looking through operations that do not alter the
3152 // lane order (binops, for example). Currently we look for a single shuffle,
3153 // and can ignore splat values.
3154 std::queue<Value *> Worklist;
3155 SmallPtrSet<Value *, 4> Visited;
3156 ShuffleVectorInst *Shuffle = nullptr;
3157 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3158 Worklist.push(Op);
3159
3160 while (!Worklist.empty()) {
3161 Value *CV = Worklist.front();
3162 Worklist.pop();
3163 if (Visited.contains(CV))
3164 continue;
3165
3166 // Splats don't change the order, so can be safely ignored.
3167 if (isSplatValue(CV))
3168 continue;
3169
3170 Visited.insert(CV);
3171
3172 if (auto *CI = dyn_cast<Instruction>(CV)) {
3173 if (CI->isBinaryOp()) {
3174 for (auto *Op : CI->operand_values())
3175 Worklist.push(Op);
3176 continue;
3177 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3178 if (Shuffle && Shuffle != SV)
3179 return false;
3180 Shuffle = SV;
3181 continue;
3182 }
3183 }
3184
3185 // Anything else is currently an unknown node.
3186 return false;
3187 }
3188
3189 if (!Shuffle)
3190 return false;
3191
3192 // Check all uses of the binary ops and shuffles are also included in the
3193 // lane-invariant operations (Visited should be the list of lanewise
3194 // instructions, including the shuffle that we found).
3195 for (auto *V : Visited)
3196 for (auto *U : V->users())
3197 if (!Visited.contains(U) && U != &I)
3198 return false;
3199
3200 FixedVectorType *VecType =
3201 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3202 if (!VecType)
3203 return false;
3204 FixedVectorType *ShuffleInputType =
3205      dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3206  if (!ShuffleInputType)
3207 return false;
3208 unsigned NumInputElts = ShuffleInputType->getNumElements();
3209
3210 // Find the mask from sorting the lanes into order. This is most likely to
3211  // become an identity or concat mask. Undef elements are pushed to the end.
3212 SmallVector<int> ConcatMask;
3213 Shuffle->getShuffleMask(ConcatMask);
3214 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3215 bool UsesSecondVec =
3216 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3217
3218  InstructionCost OldCost = TTI.getShuffleCost(
3219      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3220 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3221  InstructionCost NewCost = TTI.getShuffleCost(
3222      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3223 ShuffleInputType, ConcatMask, CostKind);
3224
3225 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3226 << "\n");
3227 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3228 << "\n");
3229 bool MadeChanges = false;
3230 if (NewCost < OldCost) {
3231 Builder.SetInsertPoint(Shuffle);
3232 Value *NewShuffle = Builder.CreateShuffleVector(
3233 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3234 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3235 replaceValue(*Shuffle, *NewShuffle);
3236 return true;
3237 }
3238
3239 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3240 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3241 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3242 return MadeChanges;
3243}
3244
3245/// For a given chain of patterns of the following form:
3246///
3247/// ```
3248/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3249///
3250/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3251/// ty1> %1)
3252/// OR
3253/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3254///
3255/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3256/// ...
3257/// ...
3258/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3259/// 3), <n x ty1> %(i - 2)
3260/// OR
3261/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3262///
3263/// %(i) = extractelement <n x ty1> %(i - 1), 0
3264/// ```
3265///
3266/// Where:
3267/// `mask` follows a partition pattern:
3268///
3269/// Ex:
3270/// [n = 8, p = poison]
3271///
3272/// 4 5 6 7 | p p p p
3273/// 2 3 | p p p p p p
3274/// 1 | p p p p p p p
3275///
3276/// For powers of 2, there's a consistent pattern, but for other cases
3277/// the parity of the current half value at each step decides the
3278/// next partition half (see `ExpectedParityMask` for more logical details
3279/// in generalising this).
3280///
3281/// Ex:
3282/// [n = 6]
3283///
3284/// 3 4 5 | p p p
3285/// 1 2 | p p p p
3286/// 1 | p p p p p
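///
/// As an illustrative sketch (not from the original source), for n = 4 with
/// umax the chain
///   %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
///   %m1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %v, <4 x i32> %s1)
///   %s2 = shufflevector <4 x i32> %m1, <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %m2 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %m1, <4 x i32> %s2)
///   %r  = extractelement <4 x i32> %m2, i64 0
/// is equivalent to @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) and is
/// rewritten to it when the reduction is cheaper.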
3287bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3288 // Going bottom-up for the pattern.
3289 std::queue<Value *> InstWorklist;
3290 InstructionCost OrigCost = 0;
3291
3292 // Common instruction operation after each shuffle op.
3293 std::optional<unsigned int> CommonCallOp = std::nullopt;
3294 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3295
3296 bool IsFirstCallOrBinInst = true;
3297 bool ShouldBeCallOrBinInst = true;
3298
3299 // This stores the last used instructions for shuffle/common op.
3300 //
3301 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3302 // instructions from either shuffle/common op.
3303 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3304
3305 Value *VecOpEE;
3306 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3307 return false;
3308
3309 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3310 if (!FVT)
3311 return false;
3312
3313 int64_t VecSize = FVT->getNumElements();
3314 if (VecSize < 2)
3315 return false;
3316
3317 // Number of levels would be ~log2(n), considering we always partition
3318 // by half for this fold pattern.
3319 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3320 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3321
3322 // This is how we generalise for all element sizes.
3323 // At each step, if vector size is odd, we need non-poison
3324 // values to cover the dominant half so we don't miss out on any element.
3325 //
3326 // This mask will help us retrieve this as we go from bottom to top:
3327 //
3328 // Mask Set -> N = N * 2 - 1
3329 // Mask Unset -> N = N * 2
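  //
  // For example (illustrative): for VecSize = 6 the sizes visited are
  // 6 -> 3 -> 2 -> 1, and only the level where the size is odd (3) sets its
  // bit, so ExpectedParityMask ends up with exactly one bit set.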
3330 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3331 Cur = (Cur + 1) / 2, --Mask) {
3332 if (Cur & 1)
3333 ExpectedParityMask |= (1ll << Mask);
3334 }
3335
3336 InstWorklist.push(VecOpEE);
3337
3338 while (!InstWorklist.empty()) {
3339 Value *CI = InstWorklist.front();
3340 InstWorklist.pop();
3341
3342 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3343 if (!ShouldBeCallOrBinInst)
3344 return false;
3345
3346 if (!IsFirstCallOrBinInst &&
3347 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3348 return false;
3349
3350 // For the first found call/bin op, the vector has to come from the
3351 // extract element op.
3352 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3353 return false;
3354 IsFirstCallOrBinInst = false;
3355
3356 if (!CommonCallOp)
3357 CommonCallOp = II->getIntrinsicID();
3358 if (II->getIntrinsicID() != *CommonCallOp)
3359 return false;
3360
3361 switch (II->getIntrinsicID()) {
3362 case Intrinsic::umin:
3363 case Intrinsic::umax:
3364 case Intrinsic::smin:
3365 case Intrinsic::smax: {
3366 auto *Op0 = II->getOperand(0);
3367 auto *Op1 = II->getOperand(1);
3368 PrevVecV[0] = Op0;
3369 PrevVecV[1] = Op1;
3370 break;
3371 }
3372 default:
3373 return false;
3374 }
3375 ShouldBeCallOrBinInst ^= 1;
3376
3377 IntrinsicCostAttributes ICA(
3378 *CommonCallOp, II->getType(),
3379 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3380 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3381
3382 // We may need a swap here since it can be (a, b) or (b, a)
3383 // and accordingly change as we go up.
3384 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3385 std::swap(PrevVecV[0], PrevVecV[1]);
3386 InstWorklist.push(PrevVecV[1]);
3387 InstWorklist.push(PrevVecV[0]);
3388 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3389 // Similar logic for bin ops.
3390
3391 if (!ShouldBeCallOrBinInst)
3392 return false;
3393
3394 if (!IsFirstCallOrBinInst &&
3395 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3396 return false;
3397
3398 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3399 return false;
3400 IsFirstCallOrBinInst = false;
3401
3402 if (!CommonBinOp)
3403 CommonBinOp = BinOp->getOpcode();
3404
3405 if (BinOp->getOpcode() != *CommonBinOp)
3406 return false;
3407
3408 switch (*CommonBinOp) {
3409 case BinaryOperator::Add:
3410 case BinaryOperator::Mul:
3411 case BinaryOperator::Or:
3412 case BinaryOperator::And:
3413 case BinaryOperator::Xor: {
3414 auto *Op0 = BinOp->getOperand(0);
3415 auto *Op1 = BinOp->getOperand(1);
3416 PrevVecV[0] = Op0;
3417 PrevVecV[1] = Op1;
3418 break;
3419 }
3420 default:
3421 return false;
3422 }
3423 ShouldBeCallOrBinInst ^= 1;
3424
3425 OrigCost +=
3426 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3427
3428 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3429 std::swap(PrevVecV[0], PrevVecV[1]);
3430 InstWorklist.push(PrevVecV[1]);
3431 InstWorklist.push(PrevVecV[0]);
3432 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3433      // We shouldn't have any null values in the previous vectors;
3434      // if so, there was a mismatch in the pattern.
3435 if (ShouldBeCallOrBinInst ||
3436 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3437 return false;
3438
3439 if (SVInst != PrevVecV[1])
3440 return false;
3441
3442 ArrayRef<int> CurMask;
3443 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3444 m_Mask(CurMask))))
3445 return false;
3446
3447 // Subtract the parity mask when checking the condition.
3448 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3449 if (Mask < ShuffleMaskHalf &&
3450 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3451 return false;
3452 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3453 return false;
3454 }
3455
3456 // Update mask values.
3457 ShuffleMaskHalf *= 2;
3458 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3459 ExpectedParityMask >>= 1;
3460
3461      OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3462                                     SVInst->getType(), SVInst->getType(),
3463 CurMask, CostKind);
3464
3465 VisitedCnt += 1;
3466 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3467 break;
3468
3469 ShouldBeCallOrBinInst ^= 1;
3470 } else {
3471 return false;
3472 }
3473 }
3474
3475 // Pattern should end with a shuffle op.
3476 if (ShouldBeCallOrBinInst)
3477 return false;
3478
3479 assert(VecSize != -1 && "Expected Match for Vector Size");
3480
3481 Value *FinalVecV = PrevVecV[0];
3482 if (!FinalVecV)
3483 return false;
3484
3485 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3486
3487 Intrinsic::ID ReducedOp =
3488 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3489 : getReductionForBinop(*CommonBinOp));
3490 if (!ReducedOp)
3491 return false;
3492
3493 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3494  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3495
3496 if (NewCost >= OrigCost)
3497 return false;
3498
3499 auto *ReducedResult =
3500 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3501 replaceValue(I, *ReducedResult);
3502
3503 return true;
3504}
3505
3506/// Determine if it's more efficient to fold:
3507/// reduce(trunc(x)) -> trunc(reduce(x)).
3508/// reduce(sext(x)) -> sext(reduce(x)).
3509/// reduce(zext(x)) -> zext(reduce(x)).
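///
/// An illustrative example (not from the original source) of the trunc case:
///   %t = trunc <8 x i32> %x to <8 x i8>
///   %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t)
/// computes the same value as
///   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %w to i8
/// because truncation commutes with addition, so whichever form the target
/// reports as cheaper can be used.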
3510bool VectorCombine::foldCastFromReductions(Instruction &I) {
3511 auto *II = dyn_cast<IntrinsicInst>(&I);
3512 if (!II)
3513 return false;
3514
3515 bool TruncOnly = false;
3516 Intrinsic::ID IID = II->getIntrinsicID();
3517 switch (IID) {
3518 case Intrinsic::vector_reduce_add:
3519 case Intrinsic::vector_reduce_mul:
3520 TruncOnly = true;
3521 break;
3522 case Intrinsic::vector_reduce_and:
3523 case Intrinsic::vector_reduce_or:
3524 case Intrinsic::vector_reduce_xor:
3525 break;
3526 default:
3527 return false;
3528 }
3529
3530 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3531 Value *ReductionSrc = I.getOperand(0);
3532
3533 Value *Src;
3534 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3535 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3536 return false;
3537
3538 auto CastOpc =
3539 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3540
3541 auto *SrcTy = cast<VectorType>(Src->getType());
3542 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3543 Type *ResultTy = I.getType();
3544
3545  InstructionCost OldCost = TTI.getArithmeticReductionCost(
3546      ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3547 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3548                                  TTI::CastContextHint::None, CostKind,
3549                                  cast<CastInst>(ReductionSrc));
3550 InstructionCost NewCost =
3551 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3552 CostKind) +
3553 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3554                           TTI::CastContextHint::None, CostKind);
3555
3556 if (OldCost <= NewCost || !NewCost.isValid())
3557 return false;
3558
3559 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3560 II->getIntrinsicID(), {Src});
3561 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3562 replaceValue(I, *NewCast);
3563 return true;
3564}
3565
3566/// Returns true if this ShuffleVectorInst eventually feeds into a
3567/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3568/// chains of shuffles and binary operators (in any combination/order).
3569/// The search is bounded by a fixed limit on the number of visited users.
3570static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3571  constexpr unsigned MaxVisited = 32;
3572  SmallPtrSet<Instruction *, 8> Visited;
3573  SmallVector<Instruction *> WorkList;
3574 bool FoundReduction = false;
3575
3576 WorkList.push_back(SVI);
3577 while (!WorkList.empty()) {
3578 Instruction *I = WorkList.pop_back_val();
3579 for (User *U : I->users()) {
3580 auto *UI = cast<Instruction>(U);
3581 if (!UI || !Visited.insert(UI).second)
3582 continue;
3583 if (Visited.size() > MaxVisited)
3584 return false;
3585 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3586 // More than one reduction reached
3587 if (FoundReduction)
3588 return false;
3589 switch (II->getIntrinsicID()) {
3590 case Intrinsic::vector_reduce_add:
3591 case Intrinsic::vector_reduce_mul:
3592 case Intrinsic::vector_reduce_and:
3593 case Intrinsic::vector_reduce_or:
3594 case Intrinsic::vector_reduce_xor:
3595 case Intrinsic::vector_reduce_smin:
3596 case Intrinsic::vector_reduce_smax:
3597 case Intrinsic::vector_reduce_umin:
3598 case Intrinsic::vector_reduce_umax:
3599 FoundReduction = true;
3600 continue;
3601 default:
3602 return false;
3603 }
3604 }
3605
3606      if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
3607        return false;
3608
3609 WorkList.emplace_back(UI);
3610 }
3611 }
3612 return FoundReduction;
3613}
3614
3615/// This method looks for groups of shuffles acting on binops, of the form:
3616/// %x = shuffle ...
3617/// %y = shuffle ...
3618/// %a = binop %x, %y
3619/// %b = binop %x, %y
3620/// shuffle %a, %b, selectmask
3621/// We may, especially if the shuffle is wider than legal, be able to convert
3622/// the shuffle to a form where only parts of a and b need to be computed. On
3623/// architectures with no obvious "select" shuffle, this can reduce the total
3624/// number of operations if the target reports them as cheaper.
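///
/// For instance (illustrative, not from the original source), with <4 x i32>
/// operands a select mask such as <0, 5, 2, 7> takes lanes 0 and 2 from %a and
/// lanes 1 and 3 from %b, so only half of each binop result is actually needed
/// once the inputs have been repacked.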
3625bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3626 auto *SVI = cast<ShuffleVectorInst>(&I);
3627 auto *VT = cast<FixedVectorType>(I.getType());
3628 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3629 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3630 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3631 VT != Op0->getType())
3632 return false;
3633
3634 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3635 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3636 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3637 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3638 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3639 auto checkSVNonOpUses = [&](Instruction *I) {
3640 if (!I || I->getOperand(0)->getType() != VT)
3641 return true;
3642 return any_of(I->users(), [&](User *U) {
3643 return U != Op0 && U != Op1 &&
3644 !(isa<ShuffleVectorInst>(U) &&
3645 (InputShuffles.contains(cast<Instruction>(U)) ||
3646 isInstructionTriviallyDead(cast<Instruction>(U))));
3647 });
3648 };
3649 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3650 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3651 return false;
3652
3653 // Collect all the uses that are shuffles that we can transform together. We
3654 // may not have a single shuffle, but a group that can all be transformed
3655 // together profitably.
3656  SmallVector<ShuffleVectorInst *> Shuffles;
3657  auto collectShuffles = [&](Instruction *I) {
3658 for (auto *U : I->users()) {
3659 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3660 if (!SV || SV->getType() != VT)
3661 return false;
3662 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3663 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3664 return false;
3665 if (!llvm::is_contained(Shuffles, SV))
3666 Shuffles.push_back(SV);
3667 }
3668 return true;
3669 };
3670 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3671 return false;
3672 // From a reduction, we need to be processing a single shuffle, otherwise the
3673 // other uses will not be lane-invariant.
3674 if (FromReduction && Shuffles.size() > 1)
3675 return false;
3676
3677 // Add any shuffle uses for the shuffles we have found, to include them in our
3678 // cost calculations.
3679 if (!FromReduction) {
3680 for (ShuffleVectorInst *SV : Shuffles) {
3681 for (auto *U : SV->users()) {
3682 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3683 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3684 Shuffles.push_back(SSV);
3685 }
3686 }
3687 }
3688
3689 // For each of the output shuffles, we try to sort all the first vector
3690 // elements to the beginning, followed by the second array elements at the
3691 // end. If the binops are legalized to smaller vectors, this may reduce total
3692 // number of binops. We compute the ReconstructMask mask needed to convert
3693 // back to the original lane order.
3694  SmallVector<std::pair<int, int>> V1, V2;
3695  SmallVector<SmallVector<int>> OrigReconstructMasks;
3696 int MaxV1Elt = 0, MaxV2Elt = 0;
3697 unsigned NumElts = VT->getNumElements();
3698 for (ShuffleVectorInst *SVN : Shuffles) {
3699 SmallVector<int> Mask;
3700 SVN->getShuffleMask(Mask);
3701
3702 // Check the operands are the same as the original, or reversed (in which
3703 // case we need to commute the mask).
3704 Value *SVOp0 = SVN->getOperand(0);
3705 Value *SVOp1 = SVN->getOperand(1);
3706 if (isa<UndefValue>(SVOp1)) {
3707 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3708 SVOp0 = SSV->getOperand(0);
3709 SVOp1 = SSV->getOperand(1);
3710 for (int &Elem : Mask) {
3711 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3712 return false;
3713 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3714 }
3715 }
3716 if (SVOp0 == Op1 && SVOp1 == Op0) {
3717 std::swap(SVOp0, SVOp1);
3718      ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3719    }
3720 if (SVOp0 != Op0 || SVOp1 != Op1)
3721 return false;
3722
3723 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3724 // take the packed values from Op0/Op1 and reconstructing to the original
3725 // order.
3726 SmallVector<int> ReconstructMask;
3727 for (unsigned I = 0; I < Mask.size(); I++) {
3728 if (Mask[I] < 0) {
3729 ReconstructMask.push_back(-1);
3730 } else if (Mask[I] < static_cast<int>(NumElts)) {
3731 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3732 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3733 return Mask[I] == A.first;
3734 });
3735 if (It != V1.end())
3736 ReconstructMask.push_back(It - V1.begin());
3737 else {
3738 ReconstructMask.push_back(V1.size());
3739 V1.emplace_back(Mask[I], V1.size());
3740 }
3741 } else {
3742 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3743 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3744 return Mask[I] - static_cast<int>(NumElts) == A.first;
3745 });
3746 if (It != V2.end())
3747 ReconstructMask.push_back(NumElts + It - V2.begin());
3748 else {
3749 ReconstructMask.push_back(NumElts + V2.size());
3750 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3751 }
3752 }
3753 }
3754
3755 // For reductions, we know that the lane ordering out doesn't alter the
3756 // result. In-order can help simplify the shuffle away.
3757 if (FromReduction)
3758 sort(ReconstructMask);
3759 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3760 }
3761
3762  // If the maximum elements used from V1 and V2 are not larger than the new
3763  // vectors, the vectors are already packed and performing the optimization
3764 // again will likely not help any further. This also prevents us from getting
3765 // stuck in a cycle in case the costs do not also rule it out.
3766 if (V1.empty() || V2.empty() ||
3767 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3768 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3769 return false;
3770
3771 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3772  // shuffle of another shuffle, or not a shuffle (which is treated like an
3773  // identity shuffle).
3774 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3775 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3776 if (!SV)
3777 return M;
3778 if (isa<UndefValue>(SV->getOperand(1)))
3779 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3780 if (InputShuffles.contains(SSV))
3781 return SSV->getMaskValue(SV->getMaskValue(M));
3782 return SV->getMaskValue(M);
3783 };
3784
3785  // Attempt to sort the inputs by ascending mask values to make simpler input
3786 // shuffles and push complex shuffles down to the uses. We sort on the first
3787 // of the two input shuffle orders, to try and get at least one input into a
3788 // nice order.
3789 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3790 std::pair<int, int> Y) {
3791 int MXA = GetBaseMaskValue(A, X.first);
3792 int MYA = GetBaseMaskValue(A, Y.first);
3793 return MXA < MYA;
3794 };
3795 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3796 return SortBase(SVI0A, A, B);
3797 });
3798 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3799 return SortBase(SVI1A, A, B);
3800 });
3801 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3802 // modified order of the input shuffles.
3803 SmallVector<SmallVector<int>> ReconstructMasks;
3804 for (const auto &Mask : OrigReconstructMasks) {
3805 SmallVector<int> ReconstructMask;
3806 for (int M : Mask) {
3807 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3808 auto It = find_if(V, [M](auto A) { return A.second == M; });
3809 assert(It != V.end() && "Expected all entries in Mask");
3810 return std::distance(V.begin(), It);
3811 };
3812 if (M < 0)
3813 ReconstructMask.push_back(-1);
3814 else if (M < static_cast<int>(NumElts)) {
3815 ReconstructMask.push_back(FindIndex(V1, M));
3816 } else {
3817 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3818 }
3819 }
3820 ReconstructMasks.push_back(std::move(ReconstructMask));
3821 }
3822
3823 // Calculate the masks needed for the new input shuffles, which get padded
3824 // with undef
3825 SmallVector<int> V1A, V1B, V2A, V2B;
3826 for (unsigned I = 0; I < V1.size(); I++) {
3827 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3828 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3829 }
3830 for (unsigned I = 0; I < V2.size(); I++) {
3831 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3832 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3833 }
3834 while (V1A.size() < NumElts) {
3835    V1A.push_back(PoisonMaskElem);
3836    V1B.push_back(PoisonMaskElem);
3837  }
3838 while (V2A.size() < NumElts) {
3839    V2A.push_back(PoisonMaskElem);
3840    V2B.push_back(PoisonMaskElem);
3841  }
3842
3843 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
3844 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3845 if (!SV)
3846 return C;
3847 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
3848                                      ? TTI::SK_PermuteSingleSrc
3849                                      : TTI::SK_PermuteTwoSrc,
3850                                  VT, VT, SV->getShuffleMask(), CostKind);
3851 };
3852 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3853 return C +
3854           TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3855  };
3856
3857 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
3858 unsigned MaxVectorSize =
3859      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3860  unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
3861 if (MaxElementsInVector == 0)
3862 return false;
3863 // When there are multiple shufflevector operations on the same input,
3864 // especially when the vector length is larger than the register size,
3865 // identical shuffle patterns may occur across different groups of elements.
3866 // To avoid overestimating the cost by counting these repeated shuffles more
3867 // than once, we only account for unique shuffle patterns. This adjustment
3868 // prevents inflated costs in the cost model for wide vectors split into
3869 // several register-sized groups.
3870 std::set<SmallVector<int, 4>> UniqueShuffles;
3871 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3872 // Compute the cost for performing the shuffle over the full vector.
3873 auto ShuffleCost =
3874        TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3875    unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
3876 if (NumFullVectors < 2)
3877 return C + ShuffleCost;
3878 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
3879 unsigned NumUniqueGroups = 0;
3880 unsigned NumGroups = Mask.size() / MaxElementsInVector;
3881 // For each group of MaxElementsInVector contiguous elements,
3882 // collect their shuffle pattern and insert into the set of unique patterns.
3883 for (unsigned I = 0; I < NumFullVectors; ++I) {
3884 for (unsigned J = 0; J < MaxElementsInVector; ++J)
3885 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
3886 if (UniqueShuffles.insert(SubShuffle).second)
3887 NumUniqueGroups += 1;
3888 }
3889 return C + ShuffleCost * NumUniqueGroups / NumGroups;
3890 };
3891 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
3892 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3893 if (!SV)
3894 return C;
3895 SmallVector<int, 16> Mask;
3896 SV->getShuffleMask(Mask);
3897 return AddShuffleMaskAdjustedCost(C, Mask);
3898 };
3899 // Check that input consists of ShuffleVectors applied to the same input
3900 auto AllShufflesHaveSameOperands =
3901 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
3902 if (InputShuffles.size() < 2)
3903 return false;
3904 ShuffleVectorInst *FirstSV =
3905 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
3906 if (!FirstSV)
3907 return false;
3908
3909 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
3910 return std::all_of(
3911 std::next(InputShuffles.begin()), InputShuffles.end(),
3912 [&](Instruction *I) {
3913 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
3914 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
3915 });
3916 };
3917
3918 // Get the costs of the shuffles + binops before and after with the new
3919 // shuffle masks.
3920 InstructionCost CostBefore =
3921 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
3922 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
3923 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
3924 InstructionCost(0), AddShuffleCost);
3925 if (AllShufflesHaveSameOperands(InputShuffles)) {
3926 UniqueShuffles.clear();
3927 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3928 InstructionCost(0), AddShuffleAdjustedCost);
3929 } else {
3930 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3931 InstructionCost(0), AddShuffleCost);
3932 }
3933
3934 // The new binops will be unused for lanes past the used shuffle lengths.
3935 // These types attempt to get the correct cost for that from the target.
3936 FixedVectorType *Op0SmallVT =
3937 FixedVectorType::get(VT->getScalarType(), V1.size());
3938 FixedVectorType *Op1SmallVT =
3939 FixedVectorType::get(VT->getScalarType(), V2.size());
3940 InstructionCost CostAfter =
3941 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
3942 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
3943 UniqueShuffles.clear();
3944 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
3945 InstructionCost(0), AddShuffleMaskAdjustedCost);
3946 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
3947 CostAfter +=
3948 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
3949 InstructionCost(0), AddShuffleMaskCost);
3950
3951 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
3952 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
3953 << " vs CostAfter: " << CostAfter << "\n");
3954 if (CostBefore < CostAfter ||
3955 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
3956 return false;
3957
3958 // The cost model has passed, create the new instructions.
3959 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
3960 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3961 if (!SV)
3962 return I;
3963 if (isa<UndefValue>(SV->getOperand(1)))
3964 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3965 if (InputShuffles.contains(SSV))
3966 return SSV->getOperand(Op);
3967 return SV->getOperand(Op);
3968 };
3969 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
3970 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
3971 GetShuffleOperand(SVI0A, 1), V1A);
3972 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
3973 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
3974 GetShuffleOperand(SVI0B, 1), V1B);
3975 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
3976 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
3977 GetShuffleOperand(SVI1A, 1), V2A);
3978 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
3979 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
3980 GetShuffleOperand(SVI1B, 1), V2B);
3981 Builder.SetInsertPoint(Op0);
3982 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
3983 NSV0A, NSV0B);
3984 if (auto *I = dyn_cast<Instruction>(NOp0))
3985 I->copyIRFlags(Op0, true);
3986 Builder.SetInsertPoint(Op1);
3987 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
3988 NSV1A, NSV1B);
3989 if (auto *I = dyn_cast<Instruction>(NOp1))
3990 I->copyIRFlags(Op1, true);
3991
3992 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
3993 Builder.SetInsertPoint(Shuffles[S]);
3994 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
3995 replaceValue(*Shuffles[S], *NSV, false);
3996 }
3997
3998 Worklist.pushValue(NSV0A);
3999 Worklist.pushValue(NSV0B);
4000 Worklist.pushValue(NSV1A);
4001 Worklist.pushValue(NSV1B);
4002 return true;
4003}
4004
4005/// Check if instruction depends on ZExt and this ZExt can be moved after the
4006/// instruction. Move ZExt if it is profitable. For example:
4007/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4008/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4009/// Cost model calculations take into account whether zext(x) has other users
4010/// and whether it can be propagated through them too.
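///
/// Illustrative sketch (not from the original source), with %x : <4 x i8> and
/// %y : <4 x i32>:
///   %zx = zext <4 x i8> %x to <4 x i32>
///   %r  = and <4 x i32> %zx, %y
/// can become
///   %ty = trunc <4 x i32> %y to <4 x i8>
///   %a  = and <4 x i8> %x, %ty
///   %r  = zext <4 x i8> %a to <4 x i32>
/// because the high bits of %zx are known to be zero.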
4011bool VectorCombine::shrinkType(Instruction &I) {
4012 Value *ZExted, *OtherOperand;
4013 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4014 m_Value(OtherOperand))) &&
4015 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4016 return false;
4017
4018 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4019
4020 auto *BigTy = cast<FixedVectorType>(I.getType());
4021 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4022 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4023
4024 if (I.getOpcode() == Instruction::LShr) {
4025 // Check that the shift amount is less than the number of bits in the
4026 // smaller type. Otherwise, the smaller lshr will return a poison value.
4027 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4028 if (ShAmtKB.getMaxValue().uge(BW))
4029 return false;
4030 } else {
4031 // Check that the expression overall uses at most the same number of bits as
4032 // ZExted
4033 KnownBits KB = computeKnownBits(&I, *DL);
4034 if (KB.countMaxActiveBits() > BW)
4035 return false;
4036 }
4037
4038 // Calculate costs of leaving current IR as it is and moving ZExt operation
4039 // later, along with adding truncates if needed
4040  InstructionCost ZExtCost = TTI.getCastInstrCost(
4041      Instruction::ZExt, BigTy, SmallTy,
4042 TargetTransformInfo::CastContextHint::None, CostKind);
4043 InstructionCost CurrentCost = ZExtCost;
4044 InstructionCost ShrinkCost = 0;
4045
4046 // Calculate total cost and check that we can propagate through all ZExt users
4047 for (User *U : ZExtOperand->users()) {
4048 auto *UI = cast<Instruction>(U);
4049 if (UI == &I) {
4050 CurrentCost +=
4051 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4052 ShrinkCost +=
4053 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4054 ShrinkCost += ZExtCost;
4055 continue;
4056 }
4057
4058 if (!Instruction::isBinaryOp(UI->getOpcode()))
4059 return false;
4060
4061 // Check if we can propagate ZExt through its other users
4062 KnownBits KB = computeKnownBits(UI, *DL);
4063 if (KB.countMaxActiveBits() > BW)
4064 return false;
4065
4066 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4067 ShrinkCost +=
4068 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4069 ShrinkCost += ZExtCost;
4070 }
4071
4072 // If the other instruction operand is not a constant, we'll need to
4073 // generate a truncate instruction. So we have to adjust cost
4074 if (!isa<Constant>(OtherOperand))
4075 ShrinkCost += TTI.getCastInstrCost(
4076 Instruction::Trunc, SmallTy, BigTy,
4077 TargetTransformInfo::CastContextHint::None, CostKind);
4078
4079 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4080 // towards modifying the IR because shrinking opens opportunities for other
4081 // shrinking optimisations.
4082 if (ShrinkCost > CurrentCost)
4083 return false;
4084
4085 Builder.SetInsertPoint(&I);
4086 Value *Op0 = ZExted;
4087 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4088 // Keep the order of operands the same
4089 if (I.getOperand(0) == OtherOperand)
4090 std::swap(Op0, Op1);
4091 Value *NewBinOp =
4092 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4093 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4094 cast<Instruction>(NewBinOp)->copyMetadata(I);
4095 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4096 replaceValue(I, *NewZExtr);
4097 return true;
4098}
4099
4100/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4101/// shuffle (DstVec, SrcVec, Mask)
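///
/// e.g. (illustrative, not from the original source):
///   %e = extractelement <4 x i32> %src, i64 2
///   %i = insertelement <4 x i32> %dst, i32 %e, i64 0
/// becomes
///   %i = shufflevector <4 x i32> %dst, <4 x i32> %src, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
/// where mask element 6 selects element 2 of the second operand (%src).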
4102bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4103 Value *DstVec, *SrcVec;
4104 uint64_t ExtIdx, InsIdx;
4105 if (!match(&I,
4106 m_InsertElt(m_Value(DstVec),
4107 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4108 m_ConstantInt(InsIdx))))
4109 return false;
4110
4111 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4112 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4113  // We can combine vectors with different element counts, as long as the element types match.
4114 if (!DstVecTy || !SrcVecTy ||
4115 SrcVecTy->getElementType() != DstVecTy->getElementType())
4116 return false;
4117
4118 unsigned NumDstElts = DstVecTy->getNumElements();
4119 unsigned NumSrcElts = SrcVecTy->getNumElements();
4120 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4121 return false;
4122
4123 // Insertion into poison is a cheaper single operand shuffle.
4124  TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4125  SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4126
4127 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4128 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4129 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4130 if (NeedDstSrcSwap) {
4131    SK = TargetTransformInfo::SK_PermuteSingleSrc;
4132    if (!IsExtIdxInBounds && NeedExpOrNarrow)
4133 Mask[InsIdx] = 0;
4134 else
4135 Mask[InsIdx] = ExtIdx;
4136 std::swap(DstVec, SrcVec);
4137 } else {
4139 std::iota(Mask.begin(), Mask.end(), 0);
4140 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4141 Mask[InsIdx] = NumDstElts;
4142 else
4143 Mask[InsIdx] = ExtIdx + NumDstElts;
4144 }
4145
4146 // Cost
4147 auto *Ins = cast<InsertElementInst>(&I);
4148 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4149 InstructionCost InsCost =
4150 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4151 InstructionCost ExtCost =
4152 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4153 InstructionCost OldCost = ExtCost + InsCost;
4154
4155 InstructionCost NewCost = 0;
4156 SmallVector<int> ExtToVecMask;
4157 if (!NeedExpOrNarrow) {
4158 // Ignore 'free' identity insertion shuffle.
4159 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4160 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4161 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4162 nullptr, {DstVec, SrcVec});
4163 } else {
4164    // When creating a length-changing vector, build it with a mask that puts
4165    // the extracted element at a known position: at ExtIdx when that index
4166    // fits in the destination, and otherwise at element 0.
4167 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4168 if (IsExtIdxInBounds)
4169 ExtToVecMask[ExtIdx] = ExtIdx;
4170 else
4171 ExtToVecMask[0] = ExtIdx;
4172 // Add cost for expanding or narrowing
4173    NewCost = TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
4174                                 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4175 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4176 }
4177
4178 if (!Ext->hasOneUse())
4179 NewCost += ExtCost;
4180
4181 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4182 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4183 << "\n");
4184
4185 if (OldCost < NewCost)
4186 return false;
4187
4188 if (NeedExpOrNarrow) {
4189 if (!NeedDstSrcSwap)
4190 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4191 else
4192 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4193 }
4194
4195 // Canonicalize undef param to RHS to help further folds.
4196 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4197 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4198 std::swap(DstVec, SrcVec);
4199 }
4200
4201 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4202 replaceValue(I, *Shuf);
4203
4204 return true;
4205}
4206
4207/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4208/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4209/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4210/// before casting it back into `<vscale x 16 x i32>`.
4211bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4212 const APInt *SplatVal0, *SplatVal1;
4213  if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4214                     m_APInt(SplatVal0), m_APInt(SplatVal1))))
4215 return false;
4216
4217 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4218 << "\n");
4219
4220 auto *VTy =
4221 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4222 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4223 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4224
4225  // We use <= rather than < here so that we bail out when the costs of the
4226  // interleave2 intrinsic and the bitcast are both invalid. Even if both
4227  // costs are valid and equal, it's probably not a good idea to emit a
4228  // high-cost constant splat.
4229  if (TTI.getInstructionCost(&I, CostKind) <=
4230 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4231                           TTI::CastContextHint::None, CostKind)) {
4232    LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4233 << *I.getType() << " is too high.\n");
4234 return false;
4235 }
4236
4237 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4238 NewSplatVal <<= Width;
4239 NewSplatVal |= SplatVal0->zext(Width * 2);
4240 auto *NewSplat = ConstantVector::getSplat(
4241 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4242
4243 IRBuilder<> Builder(&I);
4244 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4245 return true;
4246}
4247
4248// Attempt to shrink loads that are only used by shufflevector instructions.
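// For example (illustrative, not from the original source): if a <8 x i32>
// load is only ever used as
//   %s = shufflevector <8 x i32> %ld, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// then only the first four lanes are needed and a <4 x i32> load suffices,
// provided the cost model agrees.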
4249bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4250 auto *OldLoad = dyn_cast<LoadInst>(&I);
4251 if (!OldLoad || !OldLoad->isSimple())
4252 return false;
4253
4254 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4255 if (!OldLoadTy)
4256 return false;
4257
4258 unsigned const OldNumElements = OldLoadTy->getNumElements();
4259
4260 // Search all uses of load. If all uses are shufflevector instructions, and
4261 // the second operands are all poison values, find the minimum and maximum
4262 // indices of the vector elements referenced by all shuffle masks.
4263 // Otherwise return `std::nullopt`.
4264 using IndexRange = std::pair<int, int>;
4265 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4266 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4267 for (llvm::Use &Use : I.uses()) {
4268 // Ensure all uses match the required pattern.
4269 User *Shuffle = Use.getUser();
4270 ArrayRef<int> Mask;
4271
4272 if (!match(Shuffle,
4273 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4274 return std::nullopt;
4275
4276 // Ignore shufflevector instructions that have no uses.
4277 if (Shuffle->use_empty())
4278 continue;
4279
4280 // Find the min and max indices used by the shufflevector instruction.
4281 for (int Index : Mask) {
4282 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4283 OutputRange.first = std::min(Index, OutputRange.first);
4284 OutputRange.second = std::max(Index, OutputRange.second);
4285 }
4286 }
4287 }
4288
4289 if (OutputRange.second < OutputRange.first)
4290 return std::nullopt;
4291
4292 return OutputRange;
4293 };
4294
4295 // Get the range of vector elements used by shufflevector instructions.
4296 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4297 unsigned const NewNumElements = Indices->second + 1u;
4298
4299 // If the range of vector elements is smaller than the full load, attempt
4300 // to create a smaller load.
4301 if (NewNumElements < OldNumElements) {
4302 IRBuilder Builder(&I);
4303 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4304
4305 // Calculate costs of old and new ops.
4306 Type *ElemTy = OldLoadTy->getElementType();
4307 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4308 Value *PtrOp = OldLoad->getPointerOperand();
4309
4310      InstructionCost OldCost = TTI.getMemoryOpCost(
4311          Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4312 OldLoad->getPointerAddressSpace(), CostKind);
4313 InstructionCost NewCost =
4314 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4315 OldLoad->getPointerAddressSpace(), CostKind);
4316
4317 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4318      SmallVector<UseEntry, 4u> NewUses;
4319      unsigned const MaxIndex = NewNumElements * 2u;
4320
4321 for (llvm::Use &Use : I.uses()) {
4322 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4323 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4324
4325 // Create entry for new use.
4326 NewUses.push_back({Shuffle, OldMask});
4327
4328 // Validate mask indices.
4329 for (int Index : OldMask) {
4330 if (Index >= static_cast<int>(MaxIndex))
4331 return false;
4332 }
4333
4334 // Update costs.
4335 OldCost +=
4336            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4337                               OldLoadTy, OldMask, CostKind);
4338 NewCost +=
4339            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4340                               NewLoadTy, OldMask, CostKind);
4341 }
4342
4343 LLVM_DEBUG(
4344 dbgs() << "Found a load used only by shufflevector instructions: "
4345 << I << "\n OldCost: " << OldCost
4346 << " vs NewCost: " << NewCost << "\n");
4347
4348 if (OldCost < NewCost || !NewCost.isValid())
4349 return false;
4350
4351 // Create new load of smaller vector.
4352 auto *NewLoad = cast<LoadInst>(
4353 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4354 NewLoad->copyMetadata(I);
4355
4356 // Replace all uses.
4357 for (UseEntry &Use : NewUses) {
4358 ShuffleVectorInst *Shuffle = Use.first;
4359 std::vector<int> &NewMask = Use.second;
4360
4361 Builder.SetInsertPoint(Shuffle);
4362 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4363 Value *NewShuffle = Builder.CreateShuffleVector(
4364 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4365
4366 replaceValue(*Shuffle, *NewShuffle, false);
4367 }
4368
4369 return true;
4370 }
4371 }
4372 return false;
4373}
4374
4375// Attempt to narrow a phi of shufflevector instructions where the two incoming
4376// values have the same operands but different masks. If the two shuffle masks
4377// are offsets of one another we can use one branch to rotate the incoming
4378// vector and perform one larger shuffle after the phi.
4379bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4380 auto *Phi = dyn_cast<PHINode>(&I);
4381 if (!Phi || Phi->getNumIncomingValues() != 2u)
4382 return false;
4383
4384 Value *Op = nullptr;
4385 ArrayRef<int> Mask0;
4386 ArrayRef<int> Mask1;
4387
4388 if (!match(Phi->getOperand(0u),
4389 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4390 !match(Phi->getOperand(1u),
4391 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4392 return false;
4393
4394 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4395
4396 // Ensure result vectors are wider than the argument vector.
4397 auto *InputVT = cast<FixedVectorType>(Op->getType());
4398 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4399 auto const InputNumElements = InputVT->getNumElements();
4400
4401 if (InputNumElements >= ResultVT->getNumElements())
4402 return false;
4403
4404 // Take the difference of the two shuffle masks at each index. Ignore poison
4405 // values at the same index in both masks.
4406 SmallVector<int, 16> NewMask;
4407 NewMask.reserve(Mask0.size());
4408
4409 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4410 if (M0 >= 0 && M1 >= 0)
4411 NewMask.push_back(M0 - M1);
4412 else if (M0 == -1 && M1 == -1)
4413 continue;
4414 else
4415 return false;
4416 }
4417
4418 // Ensure all elements of the new mask are equal. If the difference between
4419 // the incoming mask elements is the same, the two must be constant offsets
4420 // of one another.
4421 if (NewMask.empty() || !all_equal(NewMask))
4422 return false;
4423
4424 // Create new mask using difference of the two incoming masks.
4425 int MaskOffset = NewMask[0u];
4426 unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
4427 NewMask.clear();
4428
4429 for (unsigned I = 0u; I < InputNumElements; ++I) {
4430 NewMask.push_back(Index);
4431 Index = (Index + 1u) % InputNumElements;
4432 }
4433
4434 // Calculate costs for worst cases and compare.
4435 auto const Kind = TTI::SK_PermuteSingleSrc;
4436 auto OldCost =
4437 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4438 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4439 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4440 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4441
4442 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4443 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4444 << "\n");
4445
4446 if (NewCost > OldCost)
4447 return false;
4448
4449 // Create new shuffles and narrowed phi.
4450 auto Builder = IRBuilder(Shuf);
4451 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4452 auto *PoisonVal = PoisonValue::get(InputVT);
4453 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4454 Worklist.push(cast<Instruction>(NewShuf0));
4455
4456 Builder.SetInsertPoint(Phi);
4457 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4458 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4459 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4460 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4461
4462 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4463 PoisonVal = PoisonValue::get(NewPhi->getType());
4464 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4465
4466 replaceValue(*Phi, *NewShuf1);
4467 return true;
4468}
4469
4470/// This is the entry point for all transforms. Pass manager differences are
4471/// handled in the callers of this function.
4472bool VectorCombine::run() {
4473  if (DisableVectorCombine)
4474    return false;
4475
4476 // Don't attempt vectorization if the target does not support vectors.
4477 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4478 return false;
4479
4480 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4481
4482 auto FoldInst = [this](Instruction &I) {
4483 Builder.SetInsertPoint(&I);
4484 bool IsVectorType = isa<VectorType>(I.getType());
4485 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4486 auto Opcode = I.getOpcode();
4487
4488 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4489
4490 // These folds should be beneficial regardless of when this pass is run
4491 // in the optimization pipeline.
4492 // The type checking is for run-time efficiency. We can avoid wasting time
4493 // dispatching to folding functions if there's no chance of matching.
4494 if (IsFixedVectorType) {
4495 switch (Opcode) {
4496 case Instruction::InsertElement:
4497 if (vectorizeLoadInsert(I))
4498 return true;
4499 break;
4500 case Instruction::ShuffleVector:
4501 if (widenSubvectorLoad(I))
4502 return true;
4503 break;
4504 default:
4505 break;
4506 }
4507 }
4508
4509 // This transform works with scalable and fixed vectors
4510 // TODO: Identify and allow other scalable transforms
4511 if (IsVectorType) {
4512 if (scalarizeOpOrCmp(I))
4513 return true;
4514 if (scalarizeLoadExtract(I))
4515 return true;
4516 if (scalarizeExtExtract(I))
4517 return true;
4518 if (scalarizeVPIntrinsic(I))
4519 return true;
4520 if (foldInterleaveIntrinsics(I))
4521 return true;
4522 }
4523
4524 if (Opcode == Instruction::Store)
4525 if (foldSingleElementStore(I))
4526 return true;
4527
4528 // If this is an early pipeline invocation of this pass, we are done.
4529 if (TryEarlyFoldsOnly)
4530 return false;
4531
4532 // Otherwise, try folds that improve codegen but may interfere with
4533 // early IR canonicalizations.
4534 // The type checking is for run-time efficiency. We can avoid wasting time
4535 // dispatching to folding functions if there's no chance of matching.
4536 if (IsFixedVectorType) {
4537 switch (Opcode) {
4538 case Instruction::InsertElement:
4539 if (foldInsExtFNeg(I))
4540 return true;
4541 if (foldInsExtBinop(I))
4542 return true;
4543 if (foldInsExtVectorToShuffle(I))
4544 return true;
4545 break;
4546 case Instruction::ShuffleVector:
4547 if (foldPermuteOfBinops(I))
4548 return true;
4549 if (foldShuffleOfBinops(I))
4550 return true;
4551 if (foldShuffleOfSelects(I))
4552 return true;
4553 if (foldShuffleOfCastops(I))
4554 return true;
4555 if (foldShuffleOfShuffles(I))
4556 return true;
4557 if (foldShuffleOfIntrinsics(I))
4558 return true;
4559 if (foldSelectShuffle(I))
4560 return true;
4561 if (foldShuffleToIdentity(I))
4562 return true;
4563 break;
4564 case Instruction::Load:
4565 if (shrinkLoadForShuffles(I))
4566 return true;
4567 break;
4568 case Instruction::BitCast:
4569 if (foldBitcastShuffle(I))
4570 return true;
4571 break;
4572 case Instruction::And:
4573 case Instruction::Or:
4574 case Instruction::Xor:
4575 if (foldBitOpOfCastops(I))
4576 return true;
4577 if (foldBitOpOfCastConstant(I))
4578 return true;
4579 break;
4580 case Instruction::PHI:
4581 if (shrinkPhiOfShuffles(I))
4582 return true;
4583 break;
4584 default:
4585 if (shrinkType(I))
4586 return true;
4587 break;
4588 }
4589 } else {
4590 switch (Opcode) {
4591 case Instruction::Call:
4592 if (foldShuffleFromReductions(I))
4593 return true;
4594 if (foldCastFromReductions(I))
4595 return true;
4596 break;
4597 case Instruction::ExtractElement:
4598 if (foldShuffleChainsToReduce(I))
4599 return true;
4600 break;
4601 case Instruction::ICmp:
4602 case Instruction::FCmp:
4603 if (foldExtractExtract(I))
4604 return true;
4605 break;
4606 case Instruction::Or:
4607 if (foldConcatOfBoolMasks(I))
4608 return true;
4609 [[fallthrough]];
4610 default:
4611 if (Instruction::isBinaryOp(Opcode)) {
4612 if (foldExtractExtract(I))
4613 return true;
4614 if (foldExtractedCmps(I))
4615 return true;
4616 if (foldBinopOfReductions(I))
4617 return true;
4618 }
4619 break;
4620 }
4621 }
4622 return false;
4623 };
4624
4625 bool MadeChange = false;
4626 for (BasicBlock &BB : F) {
4627 // Ignore unreachable basic blocks.
4628 if (!DT.isReachableFromEntry(&BB))
4629 continue;
4630    // We cannot simply use make_early_inc_range here to erase instructions
4631    // inside the loop, as the next iterator may
4632 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4633 // We manually maintain the next instruction and update it when it is about
4634 // to be deleted.
4635 Instruction *I = &BB.front();
4636 while (I) {
4637 NextInst = I->getNextNode();
4638 if (!I->isDebugOrPseudoInst())
4639 MadeChange |= FoldInst(*I);
4640 I = NextInst;
4641 }
4642 }
4643
4644 NextInst = nullptr;
4645
4646 while (!Worklist.isEmpty()) {
4647 Instruction *I = Worklist.removeOne();
4648 if (!I)
4649 continue;
4650
4651    if (isInstructionTriviallyDead(I)) {
4652      eraseInstruction(*I);
4653      continue;
4654 }
4655
4656 MadeChange |= FoldInst(*I);
4657 }
4658
4659 return MadeChange;
4660}
4661
4662PreservedAnalyses VectorCombinePass::run(Function &F,
4663                                         FunctionAnalysisManager &FAM) {
4664  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4665  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4666  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4667 AAResults &AA = FAM.getResult<AAManager>(F);
4668 const DataLayout *DL = &F.getDataLayout();
4669 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4670 TryEarlyFoldsOnly);
4671 if (!Combiner.run())
4672 return PreservedAnalyses::all();
4673  PreservedAnalyses PA;
4674  PA.preserveSet<CFGAnalyses>();
4675  return PA;
4676}