//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//
14
#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <optional>
#include <queue>
#include <set>
42
43#define DEBUG_TYPE "vector-combine"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
89 const DominatorTree &DT;
90 AAResults &AA;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitcastShuffle(Instruction &I);
126 bool scalarizeOpOrCmp(Instruction &I);
127 bool scalarizeVPIntrinsic(Instruction &I);
128 bool foldExtractedCmps(Instruction &I);
129 bool foldBinopOfReductions(Instruction &I);
130 bool foldSingleElementStore(Instruction &I);
131 bool scalarizeLoadExtract(Instruction &I);
132 bool scalarizeExtExtract(Instruction &I);
133 bool foldConcatOfBoolMasks(Instruction &I);
134 bool foldPermuteOfBinops(Instruction &I);
135 bool foldShuffleOfBinops(Instruction &I);
136 bool foldShuffleOfSelects(Instruction &I);
137 bool foldShuffleOfCastops(Instruction &I);
138 bool foldShuffleOfShuffles(Instruction &I);
139 bool foldShuffleOfIntrinsics(Instruction &I);
140 bool foldShuffleToIdentity(Instruction &I);
141 bool foldShuffleFromReductions(Instruction &I);
142 bool foldShuffleChainsToReduce(Instruction &I);
143 bool foldCastFromReductions(Instruction &I);
144 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
145 bool foldInterleaveIntrinsics(Instruction &I);
146 bool shrinkType(Instruction &I);
147 bool shrinkLoadForShuffles(Instruction &I);
148 bool shrinkPhiOfShuffles(Instruction &I);
149
150 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
151 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
152 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
153 Old.replaceAllUsesWith(&New);
154 if (auto *NewI = dyn_cast<Instruction>(&New)) {
155 New.takeName(&Old);
156 Worklist.pushUsersToWorkList(*NewI);
157 Worklist.pushValue(NewI);
158 }
159 if (Erase && isInstructionTriviallyDead(&Old)) {
160 eraseInstruction(Old);
161 } else {
162 Worklist.push(&Old);
163 }
164 }
165
167 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
168 SmallVector<Value *> Ops(I.operands());
169 Worklist.remove(&I);
170 I.eraseFromParent();
171
172 // Push remaining users of the operands and then the operand itself - allows
173 // further folds that were hindered by OneUse limits.
175 for (Value *Op : Ops) {
176 if (!Visited.contains(Op)) {
177 if (auto *OpI = dyn_cast<Instruction>(Op)) {
179 OpI, nullptr, nullptr, [&](Value *V) {
180 if (auto *I = dyn_cast<Instruction>(V)) {
181 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
182 Worklist.remove(I);
183 if (I == NextInst)
184 NextInst = NextInst->getNextNode();
185 Visited.insert(I);
186 }
187 }))
188 continue;
189 Worklist.pushUsersToWorkList(*OpI);
190 Worklist.pushValue(OpI);
191 }
192 }
193 }
194 }
195};
196} // namespace
197
198/// Return the source operand of a potentially bitcasted value. If there is no
199/// bitcast, return the input value itself.
201 while (auto *BitCast = dyn_cast<BitCastInst>(V))
202 V = BitCast->getOperand(0);
203 return V;
204}
205
206static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
207 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
208 // The widened load may load data from dirty regions or create data races
209 // non-existent in the source.
210 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
211 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
213 return false;
214
215 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
216 // sure we have all of our type-based constraints in place for this target.
217 Type *ScalarTy = Load->getType()->getScalarType();
218 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
219 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
220 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
221 ScalarSize % 8 != 0)
222 return false;
223
224 return true;
225}
226
227bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
228 // Match insert into fixed vector of scalar value.
229 // TODO: Handle non-zero insert index.
230 Value *Scalar;
231 if (!match(&I,
233 return false;
234
235 // Optionally match an extract from another vector.
236 Value *X;
237 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
238 if (!HasExtract)
239 X = Scalar;
240
241 auto *Load = dyn_cast<LoadInst>(X);
242 if (!canWidenLoad(Load, TTI))
243 return false;
244
245 Type *ScalarTy = Scalar->getType();
246 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
247 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
248
249 // Check safety of replacing the scalar load with a larger vector load.
250 // We use minimal alignment (maximum flexibility) because we only care about
251 // the dereferenceable region. When calculating cost and creating a new op,
252 // we may use a larger value based on alignment attributes.
253 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
254 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
255
256 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
257 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
258 unsigned OffsetEltIndex = 0;
259 Align Alignment = Load->getAlign();
260 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
261 &DT)) {
262 // It is not safe to load directly from the pointer, but we can still peek
263 // through gep offsets and check if it safe to load from a base address with
264 // updated alignment. If it is, we can shuffle the element(s) into place
265 // after loading.
266 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
267 APInt Offset(OffsetBitWidth, 0);
269
270 // We want to shuffle the result down from a high element of a vector, so
271 // the offset must be positive.
272 if (Offset.isNegative())
273 return false;
274
275 // The offset must be a multiple of the scalar element to shuffle cleanly
276 // in the element's size.
277 uint64_t ScalarSizeInBytes = ScalarSize / 8;
278 if (Offset.urem(ScalarSizeInBytes) != 0)
279 return false;
280
281 // If we load MinVecNumElts, will our target element still be loaded?
282 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
283 if (OffsetEltIndex >= MinVecNumElts)
284 return false;
285
286 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
287 &DT))
288 return false;
289
290 // Update alignment with offset value. Note that the offset could be negated
291 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
292 // negation does not change the result of the alignment calculation.
293 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
294 }
295
296 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
297 // Use the greater of the alignment on the load or its source pointer.
298 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
299 Type *LoadTy = Load->getType();
300 unsigned AS = Load->getPointerAddressSpace();
301 InstructionCost OldCost =
302 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
303 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
304 OldCost +=
305 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
306 /* Insert */ true, HasExtract, CostKind);
307
308 // New pattern: load VecPtr
309 InstructionCost NewCost =
310 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
311 // Optionally, we are shuffling the loaded vector element(s) into place.
312 // For the mask set everything but element 0 to undef to prevent poison from
313 // propagating from the extra loaded memory. This will also optionally
314 // shrink/grow the vector from the loaded size to the output size.
315 // We assume this operation has no cost in codegen if there was no offset.
316 // Note that we could use freeze to avoid poison problems, but then we might
317 // still need a shuffle to change the vector size.
318 auto *Ty = cast<FixedVectorType>(I.getType());
319 unsigned OutputNumElts = Ty->getNumElements();
321 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
322 Mask[0] = OffsetEltIndex;
323 if (OffsetEltIndex)
324 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
325 CostKind);
326
327 // We can aggressively convert to the vector form because the backend can
328 // invert this transform if it does not result in a performance win.
329 if (OldCost < NewCost || !NewCost.isValid())
330 return false;
331
332 // It is safe and potentially profitable to load a vector directly:
333 // inselt undef, load Scalar, 0 --> load VecPtr
334 IRBuilder<> Builder(Load);
335 Value *CastedPtr =
336 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
337 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
338 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
339
340 replaceValue(I, *VecLd);
341 ++NumVecLoad;
342 return true;
343}
344
345/// If we are loading a vector and then inserting it into a larger vector with
346/// undefined elements, try to load the larger vector and eliminate the insert.
347/// This removes a shuffle in IR and may allow combining of other loaded values.
348bool VectorCombine::widenSubvectorLoad(Instruction &I) {
349 // Match subvector insert of fixed vector.
350 auto *Shuf = cast<ShuffleVectorInst>(&I);
351 if (!Shuf->isIdentityWithPadding())
352 return false;
353
354 // Allow a non-canonical shuffle mask that is choosing elements from op1.
355 unsigned NumOpElts =
356 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
357 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
358 return M >= (int)(NumOpElts);
359 });
360
361 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
362 if (!canWidenLoad(Load, TTI))
363 return false;
364
365 // We use minimal alignment (maximum flexibility) because we only care about
366 // the dereferenceable region. When calculating cost and creating a new op,
367 // we may use a larger value based on alignment attributes.
368 auto *Ty = cast<FixedVectorType>(I.getType());
369 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
370 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
371 Align Alignment = Load->getAlign();
372 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
373 return false;
374
375 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
376 Type *LoadTy = Load->getType();
377 unsigned AS = Load->getPointerAddressSpace();
378
379 // Original pattern: insert_subvector (load PtrOp)
380 // This conservatively assumes that the cost of a subvector insert into an
381 // undef value is 0. We could add that cost if the cost model accurately
382 // reflects the real cost of that operation.
383 InstructionCost OldCost =
384 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
385
386 // New pattern: load PtrOp
387 InstructionCost NewCost =
388 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
389
390 // We can aggressively convert to the vector form because the backend can
391 // invert this transform if it does not result in a performance win.
392 if (OldCost < NewCost || !NewCost.isValid())
393 return false;
394
395 IRBuilder<> Builder(Load);
396 Value *CastedPtr =
397 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
398 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
399 replaceValue(I, *VecLd);
400 ++NumVecLoad;
401 return true;
402}
403
404/// Determine which, if any, of the inputs should be replaced by a shuffle
405/// followed by extract from a different index.
406ExtractElementInst *VectorCombine::getShuffleExtract(
408 unsigned PreferredExtractIndex = InvalidIndex) const {
409 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
410 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
411 assert(Index0C && Index1C && "Expected constant extract indexes");
412
413 unsigned Index0 = Index0C->getZExtValue();
414 unsigned Index1 = Index1C->getZExtValue();
415
416 // If the extract indexes are identical, no shuffle is needed.
417 if (Index0 == Index1)
418 return nullptr;
419
420 Type *VecTy = Ext0->getVectorOperand()->getType();
421 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
422 InstructionCost Cost0 =
423 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
424 InstructionCost Cost1 =
425 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
426
427 // If both costs are invalid no shuffle is needed
428 if (!Cost0.isValid() && !Cost1.isValid())
429 return nullptr;
430
431 // We are extracting from 2 different indexes, so one operand must be shuffled
432 // before performing a vector operation and/or extract. The more expensive
433 // extract will be replaced by a shuffle.
434 if (Cost0 > Cost1)
435 return Ext0;
436 if (Cost1 > Cost0)
437 return Ext1;
438
439 // If the costs are equal and there is a preferred extract index, shuffle the
440 // opposite operand.
441 if (PreferredExtractIndex == Index0)
442 return Ext1;
443 if (PreferredExtractIndex == Index1)
444 return Ext0;
445
446 // Otherwise, replace the extract with the higher index.
447 return Index0 > Index1 ? Ext0 : Ext1;
448}
449
450/// Compare the relative costs of 2 extracts followed by scalar operation vs.
451/// vector operation(s) followed by extract. Return true if the existing
452/// instructions are cheaper than a vector alternative. Otherwise, return false
453/// and if one of the extracts should be transformed to a shufflevector, set
454/// \p ConvertToShuffle to that extract instruction.
455bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
456 ExtractElementInst *Ext1,
457 const Instruction &I,
458 ExtractElementInst *&ConvertToShuffle,
459 unsigned PreferredExtractIndex) {
460 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
461 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
462 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
463
464 unsigned Opcode = I.getOpcode();
465 Value *Ext0Src = Ext0->getVectorOperand();
466 Value *Ext1Src = Ext1->getVectorOperand();
467 Type *ScalarTy = Ext0->getType();
468 auto *VecTy = cast<VectorType>(Ext0Src->getType());
469 InstructionCost ScalarOpCost, VectorOpCost;
470
471 // Get cost estimates for scalar and vector versions of the operation.
472 bool IsBinOp = Instruction::isBinaryOp(Opcode);
473 if (IsBinOp) {
474 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
475 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
476 } else {
477 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
478 "Expected a compare");
479 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
480 ScalarOpCost = TTI.getCmpSelInstrCost(
481 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
482 VectorOpCost = TTI.getCmpSelInstrCost(
483 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
484 }
485
486 // Get cost estimates for the extract elements. These costs will factor into
487 // both sequences.
488 unsigned Ext0Index = Ext0IndexC->getZExtValue();
489 unsigned Ext1Index = Ext1IndexC->getZExtValue();
490
491 InstructionCost Extract0Cost =
492 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
493 InstructionCost Extract1Cost =
494 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
495
496 // A more expensive extract will always be replaced by a splat shuffle.
497 // For example, if Ext0 is more expensive:
498 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
499 // extelt (opcode (splat V0, Ext0), V1), Ext1
500 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
501 // check the cost of creating a broadcast shuffle and shuffling both
502 // operands to element 0.
503 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
504 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
505 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
506
507 // Extra uses of the extracts mean that we include those costs in the
508 // vector total because those instructions will not be eliminated.
509 InstructionCost OldCost, NewCost;
510 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
511 // Handle a special case. If the 2 extracts are identical, adjust the
512 // formulas to account for that. The extra use charge allows for either the
513 // CSE'd pattern or an unoptimized form with identical values:
514 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
515 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
516 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
517 OldCost = CheapExtractCost + ScalarOpCost;
518 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
519 } else {
520 // Handle the general case. Each extract is actually a different value:
521 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
522 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
523 NewCost = VectorOpCost + CheapExtractCost +
524 !Ext0->hasOneUse() * Extract0Cost +
525 !Ext1->hasOneUse() * Extract1Cost;
526 }
527
528 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
529 if (ConvertToShuffle) {
530 if (IsBinOp && DisableBinopExtractShuffle)
531 return true;
532
533 // If we are extracting from 2 different indexes, then one operand must be
534 // shuffled before performing the vector operation. The shuffle mask is
535 // poison except for 1 lane that is being translated to the remaining
536 // extraction lane. Therefore, it is a splat shuffle. Ex:
537 // ShufMask = { poison, poison, 0, poison }
538 // TODO: The cost model has an option for a "broadcast" shuffle
539 // (splat-from-element-0), but no option for a more general splat.
540 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
541 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
543 ShuffleMask[BestInsIndex] = BestExtIndex;
545 VecTy, VecTy, ShuffleMask, CostKind, 0,
546 nullptr, {ConvertToShuffle});
547 } else {
549 VecTy, VecTy, {}, CostKind, 0, nullptr,
550 {ConvertToShuffle});
551 }
552 }
553
554 // Aggressively form a vector op if the cost is equal because the transform
555 // may enable further optimization.
556 // Codegen can reverse this transform (scalarize) if it was not profitable.
557 return OldCost < NewCost;
558}
559
560/// Create a shuffle that translates (shifts) 1 element from the input vector
561/// to a new element location.
562static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
563 unsigned NewIndex, IRBuilderBase &Builder) {
564 // The shuffle mask is poison except for 1 lane that is being translated
565 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
566 // ShufMask = { 2, poison, poison, poison }
567 auto *VecTy = cast<FixedVectorType>(Vec->getType());
568 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
569 ShufMask[NewIndex] = OldIndex;
570 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
571}
572
573/// Given an extract element instruction with constant index operand, shuffle
574/// the source vector (shift the scalar element) to a NewIndex for extraction.
575/// Return null if the input can be constant folded, so that we are not creating
576/// unnecessary instructions.
577static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
578 IRBuilderBase &Builder) {
579 // Shufflevectors can only be created for fixed-width vectors.
580 Value *X = ExtElt->getVectorOperand();
581 if (!isa<FixedVectorType>(X->getType()))
582 return nullptr;
583
584 // If the extract can be constant-folded, this code is unsimplified. Defer
585 // to other passes to handle that.
586 Value *C = ExtElt->getIndexOperand();
587 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
588 if (isa<Constant>(X))
589 return nullptr;
590
591 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
592 NewIndex, Builder);
593 return Shuf;
594}
595
596/// Try to reduce extract element costs by converting scalar compares to vector
597/// compares followed by extract.
598/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
599Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
600 Instruction &I) {
601 assert(isa<CmpInst>(&I) && "Expected a compare");
602
603 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
604 // --> extelt (cmp Pred V0, V1), ExtIndex
605 ++NumVecCmp;
606 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
607 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
608 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
609}
610
611/// Try to reduce extract element costs by converting scalar binops to vector
612/// binops followed by extract.
613/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
614Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
615 Instruction &I) {
616 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
617
618 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
619 // --> extelt (bo V0, V1), ExtIndex
620 ++NumVecBO;
621 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
622 V1, "foldExtExtBinop");
623
624 // All IR flags are safe to back-propagate because any potential poison
625 // created in unused vector elements is discarded by the extract.
626 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
627 VecBOInst->copyIRFlags(&I);
628
629 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
630}
631
632/// Match an instruction with extracted vector operands.
633bool VectorCombine::foldExtractExtract(Instruction &I) {
634 // It is not safe to transform things like div, urem, etc. because we may
635 // create undefined behavior when executing those on unknown vector elements.
637 return false;
638
639 Instruction *I0, *I1;
641 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
643 return false;
644
645 Value *V0, *V1;
646 uint64_t C0, C1;
647 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
648 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
649 V0->getType() != V1->getType())
650 return false;
651
652 // If the scalar value 'I' is going to be re-inserted into a vector, then try
653 // to create an extract to that same element. The extract/insert can be
654 // reduced to a "select shuffle".
655 // TODO: If we add a larger pattern match that starts from an insert, this
656 // probably becomes unnecessary.
657 auto *Ext0 = cast<ExtractElementInst>(I0);
658 auto *Ext1 = cast<ExtractElementInst>(I1);
659 uint64_t InsertIndex = InvalidIndex;
660 if (I.hasOneUse())
661 match(I.user_back(),
662 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
663
664 ExtractElementInst *ExtractToChange;
665 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
666 return false;
667
668 Value *ExtOp0 = Ext0->getVectorOperand();
669 Value *ExtOp1 = Ext1->getVectorOperand();
670
671 if (ExtractToChange) {
672 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
673 Value *NewExtOp =
674 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
675 if (!NewExtOp)
676 return false;
677 if (ExtractToChange == Ext0)
678 ExtOp0 = NewExtOp;
679 else
680 ExtOp1 = NewExtOp;
681 }
682
683 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
684 : Ext0->getIndexOperand();
685 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
686 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
687 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
688 Worklist.push(Ext0);
689 Worklist.push(Ext1);
690 replaceValue(I, *NewExt);
691 return true;
692}
693
694/// Try to replace an extract + scalar fneg + insert with a vector fneg +
695/// shuffle.
696bool VectorCombine::foldInsExtFNeg(Instruction &I) {
697 // Match an insert (op (extract)) pattern.
698 Value *DestVec;
700 Instruction *FNeg;
701 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
702 m_ConstantInt(Index))))
703 return false;
704
705 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
706 Value *SrcVec;
707 Instruction *Extract;
708 if (!match(FNeg, m_FNeg(m_CombineAnd(
709 m_Instruction(Extract),
710 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
711 return false;
712
713 auto *VecTy = cast<FixedVectorType>(I.getType());
714 auto *ScalarTy = VecTy->getScalarType();
715 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
716 if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
717 return false;
718
719 // Ignore bogus insert/extract index.
720 unsigned NumElts = VecTy->getNumElements();
721 if (Index >= NumElts)
722 return false;
723
724 // We are inserting the negated element into the same lane that we extracted
725 // from. This is equivalent to a select-shuffle that chooses all but the
726 // negated element from the destination vector.
727 SmallVector<int> Mask(NumElts);
728 std::iota(Mask.begin(), Mask.end(), 0);
729 Mask[Index] = Index + NumElts;
730 InstructionCost OldCost =
731 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
732 TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
733
734 // If the extract has one use, it will be eliminated, so count it in the
735 // original cost. If it has more than one use, ignore the cost because it will
736 // be the same before/after.
737 if (Extract->hasOneUse())
738 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
739
740 InstructionCost NewCost =
741 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
743 Mask, CostKind);
744
745 bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
746 // If the lengths of the two vectors are not equal,
747 // we need to add a length-change vector. Add this cost.
748 SmallVector<int> SrcMask;
749 if (NeedLenChg) {
750 SrcMask.assign(NumElts, PoisonMaskElem);
751 SrcMask[Index] = Index;
753 VecTy, SrcVecTy, SrcMask, CostKind);
754 }
755
756 if (NewCost > OldCost)
757 return false;
758
759 Value *NewShuf;
760 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
761 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
762 if (NeedLenChg) {
763 // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
764 Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
765 NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
766 } else {
767 // shuffle DestVec, (fneg SrcVec), Mask
768 NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
769 }
770
771 replaceValue(I, *NewShuf);
772 return true;
773}
774
/// Try to fold insert(binop(x,y),binop(a,b),idx)
/// --> binop(insert(x,a,idx),insert(y,b,idx))
bool VectorCombine::foldInsExtBinop(Instruction &I) {
  BinaryOperator *VecBinOp, *SclBinOp;
  // Match an insertelement of a one-use scalar binop into a one-use vector
  // binop at a constant index.
  // NOTE(review): the declaration of 'Index' appears truncated in this
  // excerpt - verify against the upstream source.
  if (!match(&I,
             m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
                         m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
    return false;

  // TODO: Add support for addlike etc.
  // Both binops must share an opcode so one vector binop can replace them.
  Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
  if (BinOpcode != SclBinOp->getOpcode())
    return false;

  // Only fixed-width vectors are handled; scalable insert costs are not
  // modelled here.
  auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
  if (!ResultTy)
    return false;

  // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
  // shuffle?

  // New sequence: two insertelements feeding a single vector binop.
  // NOTE(review): the 'OldCost' declaration/initializer appears truncated in
  // this excerpt.
      TTI.getInstructionCost(VecBinOp, CostKind) +
  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
      TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
                             Index, VecBinOp->getOperand(0),
                             SclBinOp->getOperand(0)) +
      TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
                             Index, VecBinOp->getOperand(1),
                             SclBinOp->getOperand(1));

  LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
                    << "\n");
  if (NewCost > OldCost)
    return false;

  // Insert each scalar operand into the matching vector operand, then apply
  // the binop once on the merged vectors.
  Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
                                               SclBinOp->getOperand(0), Index);
  Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
                                               SclBinOp->getOperand(1), Index);
  Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);

  // Intersect flags from the old binops.
  if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
    NewInst->copyIRFlags(VecBinOp);
    NewInst->andIRFlags(SclBinOp);
  }

  Worklist.pushValue(NewIns0);
  Worklist.pushValue(NewIns1);
  replaceValue(I, *NewBO);
  return true;
}
832
/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
/// Supports: bitcast, trunc, sext, zext
bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
  // Check if this is a bitwise logic operation (and/or/xor).
  auto *BinOp = dyn_cast<BinaryOperator>(&I);
  if (!BinOp || !BinOp->isBitwiseLogicOp())
    return false;

  // Get the cast instructions feeding the bitwise op.
  auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
  auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
  if (!LHSCast || !RHSCast) {
    LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
    return false;
  }

  // Both casts must be the same type
  Instruction::CastOps CastOpcode = LHSCast->getOpcode();
  if (CastOpcode != RHSCast->getOpcode())
    return false;

  // Only handle supported cast operations
  switch (CastOpcode) {
  case Instruction::BitCast:
  case Instruction::Trunc:
  case Instruction::SExt:
  case Instruction::ZExt:
    break;
  default:
    return false;
  }

  Value *LHSSrc = LHSCast->getOperand(0);
  Value *RHSSrc = RHSCast->getOperand(0);

  // Source types must match so a single cast of the new binop is valid.
  if (LHSSrc->getType() != RHSSrc->getType())
    return false;

  // Only handle vector types with integer elements
  auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
  auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
  if (!SrcVecTy || !DstVecTy)
    return false;

  if (!SrcVecTy->getScalarType()->isIntegerTy() ||
      !DstVecTy->getScalarType()->isIntegerTy())
    return false;

  // Cost Check :
  // OldCost = bitlogic + 2*casts
  // NewCost = bitlogic + cast

  // Calculate specific costs for each cast with instruction context
  // NOTE(review): the trailing arguments of these two getCastInstrCost calls
  // (cast-context hint, cost kind, context instruction) appear truncated in
  // this excerpt - verify against the upstream source.
  InstructionCost LHSCastCost =
      TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
  InstructionCost RHSCastCost =
      TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,

  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
      LHSCastCost + RHSCastCost;

  // For new cost, we can't provide an instruction (it doesn't exist yet)
  InstructionCost GenericCastCost = TTI.getCastInstrCost(
      CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);

  // Note: the new bitwise op runs on the (possibly narrower) source type.
  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
      GenericCastCost;

  // Account for multi-use casts using specific costs
  if (!LHSCast->hasOneUse())
    NewCost += LHSCastCost;
  if (!RHSCast->hasOneUse())
    NewCost += RHSCastCost;

  LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
                    << " NewCost=" << NewCost << "\n");

  if (NewCost > OldCost)
    return false;

  // Create the operation on the source type
  Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
                                     BinOp->getName() + ".inner");
  if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
    NewBinOp->copyIRFlags(BinOp);

  Worklist.pushValue(NewOp);

  // Create the cast operation directly to ensure we get a new instruction
  Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());

  // Preserve cast instruction flags
  NewCast->copyIRFlags(LHSCast);
  NewCast->andIRFlags(RHSCast);

  // Insert the new instruction
  Value *Result = Builder.Insert(NewCast);

  replaceValue(I, *Result);
  return true;
}
939
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
bool VectorCombine::foldBitcastShuffle(Instruction &I) {
  Value *V0, *V1;
  // NOTE(review): the declaration of the shuffle mask ('Mask') appears
  // truncated in this excerpt.
  if (!match(&I, m_BitCast(m_OneUse(
                     m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
    return false;

  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
  // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
  // mask for scalable type is a splat or not.
  // 2) Disallow non-vector casts.
  // TODO: We could allow any shuffle.
  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
  auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
  if (!DestTy || !SrcTy)
    return false;

  // The source's total bit width must be a whole multiple of the destination
  // element width, so the source can be reinterpreted element-wise.
  unsigned DestEltSize = DestTy->getScalarSizeInBits();
  unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
  if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
    return false;

  bool IsUnary = isa<UndefValue>(V1);

  // For binary shuffles, only fold bitcast(shuffle(X,Y))
  // if it won't increase the number of bitcasts.
  if (!IsUnary) {
    auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
    auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
    if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
        !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
      return false;
  }

  SmallVector<int, 16> NewMask;
  if (DestEltSize <= SrcEltSize) {
    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
    // always be expanded to the equivalent form choosing narrower elements.
    assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = SrcEltSize / DestEltSize;
    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
  } else {
    // The bitcast is from narrow elements to wide elements. The shuffle mask
    // must choose consecutive elements to allow casting first.
    assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = DestEltSize / SrcEltSize;
    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
      return false;
  }

  // Bitcast the shuffle src - keep its original width but using the destination
  // scalar type.
  unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
  auto *NewShuffleTy =
      FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
  auto *OldShuffleTy =
      FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
  // A unary shuffle only needs one source bitcast.
  unsigned NumOps = IsUnary ? 1 : 2;

  // The new shuffle must not cost more than the old shuffle.
  // NOTE(review): the declaration of the shuffle kind 'SK' appears truncated
  // in this excerpt.

  InstructionCost NewCost =
      TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
      (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
                                     TargetTransformInfo::CastContextHint::None,
                                     CostKind));
  InstructionCost OldCost =
      TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
      TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
                           TargetTransformInfo::CastContextHint::None,
                           CostKind);

  LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
                    << OldCost << " vs NewCost: " << NewCost << "\n");

  if (NewCost > OldCost || !NewCost.isValid())
    return false;

  // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
  ++NumShufOfBitcast;
  Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
  Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
  Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
  replaceValue(I, *Shuf);
  return true;
}
1032
/// VP Intrinsics whose vector operands are both splat values may be simplified
/// into the scalar version of the operation and the result splatted. This
/// can lead to scalarization down the line.
bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
  if (!isa<VPIntrinsic>(I))
    return false;
  VPIntrinsic &VPI = cast<VPIntrinsic>(I);
  Value *Op0 = VPI.getArgOperand(0);
  Value *Op1 = VPI.getArgOperand(1);

  // Both vector operands must be splats for the scalar rewrite to be valid.
  if (!isSplatValue(Op0) || !isSplatValue(Op1))
    return false;

  // Check getSplatValue early in this function, to avoid doing unnecessary
  // work.
  Value *ScalarOp0 = getSplatValue(Op0);
  Value *ScalarOp1 = getSplatValue(Op1);
  if (!ScalarOp0 || !ScalarOp1)
    return false;

  // For the binary VP intrinsics supported here, the result on disabled lanes
  // is a poison value. For now, only do this simplification if all lanes
  // are active.
  // TODO: Relax the condition that all lanes are active by using insertelement
  // on inactive lanes.
  auto IsAllTrueMask = [](Value *MaskVal) {
    if (Value *SplattedVal = getSplatValue(MaskVal))
      if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
        return ConstValue->isAllOnesValue();
    return false;
  };
  if (!IsAllTrueMask(VPI.getArgOperand(2)))
    return false;

  // Check to make sure we support scalarization of the intrinsic
  Intrinsic::ID IntrID = VPI.getIntrinsicID();
  if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
    return false;

  // Calculate cost of splatting both operands into vectors and the vector
  // intrinsic
  VectorType *VecTy = cast<VectorType>(VPI.getType());
  // NOTE(review): the declaration of 'Mask' appears truncated in this excerpt.
  if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
    Mask.resize(FVTy->getNumElements(), 0);
  // Splat cost = insertelement + broadcast shuffle.
  // NOTE(review): the shuffle-cost term of this sum appears truncated in this
  // excerpt.
  InstructionCost SplatCost =
      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
                         CostKind);

  // Calculate the cost of the VP Intrinsic
  // NOTE(review): the declaration of 'Args' appears truncated in this excerpt.
  for (Value *V : VPI.args())
    Args.push_back(V->getType());
  IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
  InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  InstructionCost OldCost = 2 * SplatCost + VectorOpCost;

  // Determine scalar opcode
  std::optional<unsigned> FunctionalOpcode =
      VPI.getFunctionalOpcode();
  std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
  if (!FunctionalOpcode) {
    // No plain IR opcode equivalent: fall back to a scalar intrinsic, if any.
    ScalarIntrID = VPI.getFunctionalIntrinsicID();
    if (!ScalarIntrID)
      return false;
  }

  // Calculate cost of scalarizing
  InstructionCost ScalarOpCost = 0;
  if (ScalarIntrID) {
    IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
    ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
                                              VecTy->getScalarType(), CostKind);
  }

  // The existing splats may be kept around if other instructions use them.
  InstructionCost CostToKeepSplats =
      (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
  InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;

  LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
                    << "\n");
  LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
                    << ", Cost of scalarizing:" << NewCost << "\n");

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Scalarize the intrinsic
  ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
  Value *EVL = VPI.getArgOperand(3);

  // If the VP op might introduce UB or poison, we can scalarize it provided
  // that we know the EVL > 0: If the EVL is zero, then the original VP op
  // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
  // scalarizing it.
  bool SafeToSpeculate;
  if (ScalarIntrID)
    SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
                          .hasAttribute(Attribute::AttrKind::Speculatable);
  else
    // NOTE(review): the head of this speculation check appears truncated in
    // this excerpt.
        *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
  if (!SafeToSpeculate &&
      !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
    return false;

  Value *ScalarVal =
      ScalarIntrID
          ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
                                    {ScalarOp0, ScalarOp1})
          : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
                                ScalarOp0, ScalarOp1);

  // Splat the scalar result back out to the original vector width.
  replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
  return true;
}
1154
/// Match a vector op/compare/intrinsic with at least one
/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
/// by insertelement.
bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
  auto *UO = dyn_cast<UnaryOperator>(&I);
  auto *BO = dyn_cast<BinaryOperator>(&I);
  auto *CI = dyn_cast<CmpInst>(&I);
  auto *II = dyn_cast<IntrinsicInst>(&I);
  if (!UO && !BO && !CI && !II)
    return false;

  // TODO: Allow intrinsics with different argument types
  if (II) {
    if (!isTriviallyVectorizable(II->getIntrinsicID()))
      return false;
    for (auto [Idx, Arg] : enumerate(II->args()))
      if (Arg->getType() != II->getType() &&
          !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
        return false;
  }

  // Do not convert the vector condition of a vector select into a scalar
  // condition. That may cause problems for codegen because of differences in
  // boolean formats and register-file transfers.
  // TODO: Can we account for that in the cost model?
  if (CI)
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;

  // Match constant vectors or scalars being inserted into constant vectors:
  // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
  // VecCs and ScalarOps are parallel arrays over the operands: VecCs holds the
  // constant base vectors, ScalarOps the inserted scalar (or nullptr for a
  // fully-constant operand).
  SmallVector<Value *> VecCs, ScalarOps;
  std::optional<uint64_t> Index;

  auto Ops = II ? II->args() : I.operands();
  for (auto [OpNum, Op] : enumerate(Ops)) {
    Constant *VecC;
    Value *V;
    uint64_t InsIdx = 0;
    if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
                                    m_ConstantInt(InsIdx)))) {
      // Bail if any inserts are out of bounds.
      VectorType *OpTy = cast<VectorType>(Op->getType());
      if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
        return false;
      // All inserts must have the same index.
      // TODO: Deal with mismatched index constants and variable indexes?
      if (!Index)
        Index = InsIdx;
      else if (InsIdx != *Index)
        return false;
      VecCs.push_back(VecC);
      ScalarOps.push_back(V);
    } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
                                                        OpNum, &TTI)) {
      // Scalar-typed intrinsic arguments pass through unchanged.
      VecCs.push_back(Op.get());
      ScalarOps.push_back(Op.get());
    } else if (match(Op.get(), m_Constant(VecC))) {
      VecCs.push_back(VecC);
      ScalarOps.push_back(nullptr);
    } else {
      return false;
    }
  }

  // Bail if all operands are constant.
  if (!Index.has_value())
    return false;

  VectorType *VecTy = cast<VectorType>(I.getType());
  Type *ScalarTy = VecTy->getScalarType();
  assert(VecTy->isVectorTy() &&
         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
          ScalarTy->isPointerTy()) &&
         "Unexpected types for insert element into binop or cmp");

  // Cost of the operation in scalar vs. vector form.
  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (CI) {
    CmpInst::Predicate Pred = CI->getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
  } else if (UO || BO) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
  } else {
    IntrinsicCostAttributes ScalarICA(
        II->getIntrinsicID(), ScalarTy,
        SmallVector<Type *>(II->arg_size(), ScalarTy));
    ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
    IntrinsicCostAttributes VectorICA(
        II->getIntrinsicID(), VecTy,
        SmallVector<Type *>(II->arg_size(), VecTy));
    VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
  }

  // Fold the vector constants in the original vectors into a new base vector to
  // get more accurate cost modelling.
  Value *NewVecC = nullptr;
  if (CI)
    NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
  else if (UO)
    NewVecC =
        simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
  else if (BO)
    NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
  else if (II)
    NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);

  if (!NewVecC)
    return false;

  // Get cost estimate for the insert element. This cost will factor into
  // both sequences.
  InstructionCost OldCost = VectorOpCost;
  InstructionCost NewCost =
      ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                            CostKind, *Index, NewVecC);

  // Account for the insertelements feeding the original vector op; they are
  // only needed in the new sequence if they have other users.
  for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
    if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
                              II->getIntrinsicID(), Idx, &TTI)))
      continue;
    // NOTE(review): the head of the 'InsertCost' initializer appears truncated
    // in this excerpt.
        Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
    OldCost += InsertCost;
    NewCost += !Op->hasOneUse() * InsertCost;
  }

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
  // inselt NewVecC, (scalar_op V0, V1), Index
  if (CI)
    ++NumScalarCmp;
  else if (UO || BO)
    ++NumScalarOps;
  else
    ++NumScalarIntrinsic;

  // For constant cases, extract the scalar element, this should constant fold.
  // NOTE(review): the assignment head of this extract appears truncated in
  // this excerpt.
  for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
    if (!Scalar)
        cast<Constant>(VecC), Builder.getInt64(*Index));

  Value *Scalar;
  if (CI)
    Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
  else if (UO || BO)
    Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
  else
    Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);

  Scalar->setName(I.getName() + ".scalar");

  // All IR flags are safe to back-propagate. There is no potential for extra
  // poison to be created by the scalar instruction.
  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
    ScalarInst->copyIRFlags(&I);

  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
  replaceValue(I, *Insert);
  return true;
}
1325
/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
bool VectorCombine::foldExtractedCmps(Instruction &I) {
  auto *BI = dyn_cast<BinaryOperator>(&I);

  // We are looking for a scalar binop of booleans.
  // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
  if (!BI || !I.getType()->isIntegerTy(1))
    return false;

  // The compare predicates should match, and each compare should have a
  // constant operand.
  // NOTE(review): the declarations of predicates 'P0'/'P1' appear truncated in
  // this excerpt.
  Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
  Instruction *I0, *I1;
  Constant *C0, *C1;
  if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
      !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
    return false;

  auto MatchingPred = CmpPredicate::getMatching(P0, P1);
  if (!MatchingPred)
    return false;

  // The compare operands must be extracts of the same vector with constant
  // extract indexes.
  Value *X;
  uint64_t Index0, Index1;
  if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
      !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
    return false;

  // Decide which of the two extracts should be rewritten as a shuffle.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
  if (!ConvertToShuf)
    return false;
  assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
         "Unknown ExtractElementInst");

  // The original scalar pattern is:
  // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
  CmpInst::Predicate Pred = *MatchingPred;
  unsigned CmpOpcode =
      CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
  auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
  if (!VecTy)
    return false;

  InstructionCost Ext0Cost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  InstructionCost Ext1Cost =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
  // NOTE(review): the head of the 'CmpCost' initializer appears truncated in
  // this excerpt.
      CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
      CostKind);

  InstructionCost OldCost =
      Ext0Cost + Ext1Cost + CmpCost * 2 +
      TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);

  // The proposed vector pattern is:
  // vcmp = cmp Pred X, VecC
  // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
  // NOTE(review): the heads of the 'NewCost' initializer and the shuffle-cost
  // addition appear truncated in this excerpt.
      CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
  ShufMask[CheapIndex] = ExpensiveIndex;
      CmpTy, ShufMask, CostKind);
  NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
  NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
  NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
  NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;

  // Aggressively form vector ops if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Create a vector constant from the 2 scalar constants.
  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
                                   PoisonValue::get(VecTy->getElementType()));
  CmpC[Index0] = C0;
  CmpC[Index1] = C1;
  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
  // Shift the expensive lane into the cheap lane, do the logic op on whole
  // vectors, and extract only the cheap lane.
  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
  Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
  Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
  Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
  replaceValue(I, *NewExt);
  ++NumVecCmpBO;
  return true;
}
1426
// Split the cost of a vector reduction intrinsic \p II into the cost of
// computing its reduced operand (\p CostBeforeReduction) and the cost of the
// reduction itself (\p CostAfterReduction). Recognizes reduce(ext(x)) and
// reduce.add(ext(mul(ext(A), ext(B)))) so the target's specialized extended /
// multiply-accumulate reduction costs can be used.
// NOTE(review): the first line(s) of this function's signature appear
// truncated in this excerpt.
                                      const TargetTransformInfo &TTI,
                                      InstructionCost &CostBeforeReduction,
                                      InstructionCost &CostAfterReduction) {
  Instruction *Op0, *Op1;
  auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
  auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
  unsigned ReductionOpc =
      getArithmeticReductionInstruction(II.getIntrinsicID());
  // reduce(zext/sext(x)): use the target's extended-reduction cost.
  if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());

    // NOTE(review): the trailing arguments of this getCastInstrCost call
    // appear truncated in this excerpt.
    CostBeforeReduction =
        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
    CostAfterReduction =
        TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
                                     ExtType, FastMathFlags(), CostKind);
    return;
  }
  // NOTE(review): part of this m_Mul match expression appears truncated in
  // this excerpt.
  if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
      match(RedOp,
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
    VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,

    CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
    CostAfterReduction =
        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
    return;
  }
  // Default: a plain arithmetic reduction; the operand cost output is left
  // unchanged.
  CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
                                                      std::nullopt, CostKind);
}
1478
1479bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1480 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1481 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1482 if (BinOpOpc == Instruction::Sub)
1483 ReductionIID = Intrinsic::vector_reduce_add;
1484 if (ReductionIID == Intrinsic::not_intrinsic)
1485 return false;
1486
1487 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1488 Intrinsic::ID IID) -> Value * {
1489 auto *II = dyn_cast<IntrinsicInst>(V);
1490 if (!II)
1491 return nullptr;
1492 if (II->getIntrinsicID() == IID && II->hasOneUse())
1493 return II->getArgOperand(0);
1494 return nullptr;
1495 };
1496
1497 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1498 if (!V0)
1499 return false;
1500 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1501 if (!V1)
1502 return false;
1503
1504 auto *VTy = cast<VectorType>(V0->getType());
1505 if (V1->getType() != VTy)
1506 return false;
1507 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1508 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1509 unsigned ReductionOpc =
1510 getArithmeticReductionInstruction(II0.getIntrinsicID());
1511
1512 InstructionCost OldCost = 0;
1513 InstructionCost NewCost = 0;
1514 InstructionCost CostOfRedOperand0 = 0;
1515 InstructionCost CostOfRed0 = 0;
1516 InstructionCost CostOfRedOperand1 = 0;
1517 InstructionCost CostOfRed1 = 0;
1518 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1519 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1520 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1521 NewCost =
1522 CostOfRedOperand0 + CostOfRedOperand1 +
1523 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1524 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1525 if (NewCost >= OldCost || !NewCost.isValid())
1526 return false;
1527
1528 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1529 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1530 << "\n");
1531 Value *VectorBO;
1532 if (BinOpOpc == Instruction::Or)
1533 VectorBO = Builder.CreateOr(V0, V1, "",
1534 cast<PossiblyDisjointInst>(I).isDisjoint());
1535 else
1536 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1537
1538 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1539 replaceValue(I, *Rdx);
1540 return true;
1541}
1542
// Check if memory loc modified between two instrs in the same BB
// Returns true if any instruction in [Begin, End) may write \p Loc, or if more
// than MaxInstrsToScan instructions were examined (conservatively treated as
// "modified" to bound compile time).
// NOTE(review): the opening line of the signature (name and the iterator
// parameters) appears truncated in this excerpt.
                                  const MemoryLocation &Loc, AAResults &AA) {
  unsigned NumScanned = 0;
  return std::any_of(Begin, End, [&](const Instruction &Instr) {
    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
           ++NumScanned > MaxInstrsToScan;
  });
}
1553
1554namespace {
1555/// Helper class to indicate whether a vector index can be safely scalarized and
1556/// if a freeze needs to be inserted.
1557class ScalarizationResult {
1558 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1559
1560 StatusTy Status;
1561 Value *ToFreeze;
1562
1563 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1564 : Status(Status), ToFreeze(ToFreeze) {}
1565
1566public:
1567 ScalarizationResult(const ScalarizationResult &Other) = default;
1568 ~ScalarizationResult() {
1569 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1570 }
1571
1572 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1573 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1574 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1575 return {StatusTy::SafeWithFreeze, ToFreeze};
1576 }
1577
1578 /// Returns true if the index can be scalarize without requiring a freeze.
1579 bool isSafe() const { return Status == StatusTy::Safe; }
1580 /// Returns true if the index cannot be scalarized.
1581 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1582 /// Returns true if the index can be scalarize, but requires inserting a
1583 /// freeze.
1584 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1585
1586 /// Reset the state of Unsafe and clear ToFreze if set.
1587 void discard() {
1588 ToFreeze = nullptr;
1589 Status = StatusTy::Unsafe;
1590 }
1591
1592 /// Freeze the ToFreeze and update the use in \p User to use it.
1593 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1594 assert(isSafeWithFreeze() &&
1595 "should only be used when freezing is required");
1596 assert(is_contained(ToFreeze->users(), &UserI) &&
1597 "UserI must be a user of ToFreeze");
1598 IRBuilder<>::InsertPointGuard Guard(Builder);
1599 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1600 Value *Frozen =
1601 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1602 for (Use &U : make_early_inc_range((UserI.operands())))
1603 if (U.get() == ToFreeze)
1604 U.set(Frozen);
1605
1606 ToFreeze = nullptr;
1607 }
1608};
1609} // namespace
1610
1611/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1612/// Idx. \p Idx must access a valid vector element.
1613static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1614 Instruction *CtxI,
1615 AssumptionCache &AC,
1616 const DominatorTree &DT) {
1617 // We do checks for both fixed vector types and scalable vector types.
1618 // This is the number of elements of fixed vector types,
1619 // or the minimum number of elements of scalable vector types.
1620 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1621 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1622
1623 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1624 if (C->getValue().ult(NumElements))
1625 return ScalarizationResult::safe();
1626 return ScalarizationResult::unsafe();
1627 }
1628
1629 // Always unsafe if the index type can't handle all inbound values.
1630 if (!llvm::isUIntN(IntWidth, NumElements))
1631 return ScalarizationResult::unsafe();
1632
1633 APInt Zero(IntWidth, 0);
1634 APInt MaxElts(IntWidth, NumElements);
1635 ConstantRange ValidIndices(Zero, MaxElts);
1636 ConstantRange IdxRange(IntWidth, true);
1637
1638 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1639 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1640 true, &AC, CtxI, &DT)))
1641 return ScalarizationResult::safe();
1642 return ScalarizationResult::unsafe();
1643 }
1644
1645 // If the index may be poison, check if we can insert a freeze before the
1646 // range of the index is restricted.
1647 Value *IdxBase;
1648 ConstantInt *CI;
1649 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1650 IdxRange = IdxRange.binaryAnd(CI->getValue());
1651 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1652 IdxRange = IdxRange.urem(CI->getValue());
1653 }
1654
1655 if (ValidIndices.contains(IdxRange))
1656 return ScalarizationResult::safeWithFreeze(IdxBase);
1657 return ScalarizationResult::unsafe();
1658}
1659
1660/// The memory operation on a vector of \p ScalarType had alignment of
1661/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1662/// alignment that will be valid for the memory operation on a single scalar
1663/// element of the same type with index \p Idx.
1665 Type *ScalarType, Value *Idx,
1666 const DataLayout &DL) {
1667 if (auto *C = dyn_cast<ConstantInt>(Idx))
1668 return commonAlignment(VectorAlignment,
1669 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1670 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1671}
1672
1673 // Combine patterns like:
1674 //  %0 = load <4 x i32>, <4 x i32>* %a
1675 //  %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1676 //  store <4 x i32> %1, <4 x i32>* %a
1677 // to:
1678 //  %0 = bitcast <4 x i32>* %a to i32*
1679 //  %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1680 //  store i32 %b, i32* %1
1681 bool VectorCombine::foldSingleElementStore(Instruction &I) {
// NOTE(review): this listing drops original line 1682 (the guard condition
// preceding the dangling `return false;` below) and line 1690 (declaring
// `Source`) -- confirm against the upstream file before relying on it.
1683     return false;
1684   auto *SI = cast<StoreInst>(&I);
     // Only simple (non-atomic, non-volatile) stores of vector values apply.
1685   if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1686     return false;
1687
1688   // TODO: Combine more complicated patterns (multiple insert) by referencing
1689   // TargetTransformInfo.
1691   Value *NewElement;
1692   Value *Idx;
1693   if (!match(SI->getValueOperand(),
1694              m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1695                          m_Value(Idx))))
1696     return false;
1697
1698   if (auto *Load = dyn_cast<LoadInst>(Source)) {
1699     auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1700     Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1701     // Don't optimize for atomic/volatile load or store. Ensure memory is not
1702     // modified between, vector type matches store size, and index is inbounds.
1703     if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1704         !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1705         SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1706       return false;
1707
       // Index must be provably in bounds and no writes may alias the vector
       // between the load and the store.
1708     auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1709     if (ScalarizableIdx.isUnsafe() ||
1710         isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1711                              MemoryLocation::get(SI), AA))
1712       return false;
1713
1714     // Ensure we add the load back to the worklist BEFORE its users so they can
1715     // erased in the correct order.
1716     Worklist.push(Load);
1717
       // Freeze a possibly-poison index first (see canScalarizeAccess).
1718     if (ScalarizableIdx.isSafeWithFreeze())
1719       ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1720     Value *GEP = Builder.CreateInBoundsGEP(
1721         SI->getValueOperand()->getType(), SI->getPointerOperand(),
1722         {ConstantInt::get(Idx->getType(), 0), Idx});
1723     StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1724     NSI->copyMetadata(*SI);
       // The scalar store may only claim the alignment implied by the wider of
       // the two original accesses at this element offset.
1725     Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1726         std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1727         *DL);
1728     NSI->setAlignment(ScalarOpAlignment);
1729     replaceValue(I, *NSI);
// NOTE(review): original line 1730 is absent from this listing.
1731     return true;
1732   }
1733
1734   return false;
1735 }
1736
1737 /// Try to scalarize vector loads feeding extractelement instructions.
1738 bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
// NOTE(review): this listing drops original lines 1739 (the guard preceding
// the dangling `return false;`) and 1758 (declaring the `NeedFreeze` map used
// below) -- confirm against the upstream file.
1740     return false;
1741
1742   Value *Ptr;
1743   if (!match(&I, m_Load(m_Value(Ptr))))
1744     return false;
1745
1746   auto *LI = cast<LoadInst>(&I);
1747   auto *VecTy = cast<VectorType>(LI->getType());
1748   if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1749     return false;
1750
1751   InstructionCost OriginalCost =
1752       TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1753                           LI->getPointerAddressSpace(), CostKind);
1754   InstructionCost ScalarizedCost = 0;
1755
1756   Instruction *LastCheckedInst = LI;
1757   unsigned NumInstChecked = 0;
     // Scope guard: on any early return below, the speculatively-recorded
     // freeze results are discarded so no stale state survives the bailout.
1759   auto FailureGuard = make_scope_exit([&]() {
1760     // If the transform is aborted, discard the ScalarizationResults.
1761     for (auto &Pair : NeedFreeze)
1762       Pair.second.discard();
1763   });
1764
1765   // Check if all users of the load are extracts with no memory modifications
1766   // between the load and the extract. Compute the cost of both the original
1767   // code and the scalarized version.
1768   for (User *U : LI->users()) {
1769     auto *UI = dyn_cast<ExtractElementInst>(U);
1770     if (!UI || UI->getParent() != LI->getParent())
1771       return false;
1772
1773     // If any extract is waiting to be erased, then bail out as this will
1774     // distort the cost calculation and possibly lead to infinite loops.
1775     if (UI->use_empty())
1776       return false;
1777
1778     // Check if any instruction between the load and the extract may modify
1779     // memory.
1780     if (LastCheckedInst->comesBefore(UI)) {
1781       for (Instruction &I :
1782            make_range(std::next(LI->getIterator()), UI->getIterator())) {
1783         // Bail out if we reached the check limit or the instruction may write
1784         // to memory.
1785         if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1786           return false;
1787         NumInstChecked++;
1788       }
           // Remember how far we scanned so each range is only walked once.
1789       LastCheckedInst = UI;
1790     }
1791
1792     auto ScalarIdx =
1793         canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1794     if (ScalarIdx.isUnsafe())
1795       return false;
1796     if (ScalarIdx.isSafeWithFreeze()) {
1797       NeedFreeze.try_emplace(UI, ScalarIdx);
1798       ScalarIdx.discard();
1799     }
1800
1801     auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1802     OriginalCost +=
1803         TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1804                                Index ? Index->getZExtValue() : -1);
1805     ScalarizedCost +=
1806         TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1807                             Align(1), LI->getPointerAddressSpace(), CostKind);
1808     ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1809                                                     nullptr, nullptr, CostKind);
1810   }
1811
1812   LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
1813                     << "\n LoadExtractCost: " << OriginalCost
1814                     << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1815
1816   if (ScalarizedCost >= OriginalCost)
1817     return false;
1818
1819   // Ensure we add the load back to the worklist BEFORE its users so they can
1820   // erased in the correct order.
1821   Worklist.push(LI);
1822
1823   Type *ElemType = VecTy->getElementType();
1824
1825   // Replace extracts with narrow scalar loads.
1826   for (User *U : LI->users()) {
1827     auto *EI = cast<ExtractElementInst>(U);
1828     Value *Idx = EI->getIndexOperand();
1829
1830     // Insert 'freeze' for poison indexes.
1831     auto It = NeedFreeze.find(EI);
1832     if (It != NeedFreeze.end())
1833       It->second.freeze(Builder, *cast<Instruction>(Idx));
1834
1835     Builder.SetInsertPoint(EI);
1836     Value *GEP =
1837         Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1838     auto *NewLoad = cast<LoadInst>(
1839         Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1840
1841     Align ScalarOpAlignment =
1842         computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1843     NewLoad->setAlignment(ScalarOpAlignment);
1844
       // For a constant index, the AA metadata can be rebased onto the exact
       // byte offset of the narrow access.
1845     if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1846       size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1847       AAMDNodes OldAAMD = LI->getAAMetadata();
1848       NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1849     }
1850
1851     replaceValue(*EI, *NewLoad, false);
1852   }
1853
     // Transform committed: keep the recorded freezes (disarm the scope guard).
1854   FailureGuard.release();
1855   return true;
1856 }
1857
/// Try to replace a vector zext whose only users are extractelement
/// instructions with scalar shift+mask operations on a bitcast of the source,
/// when the target cost model says the scalar form is cheaper.
// NOTE(review): this listing drops original lines 1859, 1879, 1884 and parts
// of 1897-1903 (guard, cast-context and operand-info arguments) -- confirm
// against the upstream file.
1858 bool VectorCombine::scalarizeExtExtract(Instruction &I) {
1860     return false;
1861   auto *Ext = dyn_cast<ZExtInst>(&I);
1862   if (!Ext)
1863     return false;
1864
1865   // Try to convert a vector zext feeding only extracts to a set of scalar
1866   //   (Src << ExtIdx *Size) & (Size -1)
1867   // if profitable   .
1868   auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
1869   if (!SrcTy)
1870     return false;
1871   auto *DstTy = cast<FixedVectorType>(Ext->getType());
1872
     // The whole source vector must fit in one scalar of the destination
     // element type so a single bitcast+shift can extract any lane.
1873   Type *ScalarDstTy = DstTy->getElementType();
1874   if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
1875     return false;
1876
1877   InstructionCost VectorCost =
1878       TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
1880   unsigned ExtCnt = 0;
1881   bool ExtLane0 = false;
1882   for (User *U : Ext->users()) {
1883     uint64_t Idx;
1885       return false;
1886     if (cast<Instruction>(U)->use_empty())
1887       continue;
1888     ExtCnt += 1;
       // Lane 0 needs no shift in the scalar form; track it for costing.
1889     ExtLane0 |= !Idx;
1890     VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
1891                                          CostKind, Idx, U);
1892   }
1893
1894   InstructionCost ScalarCost =
1895       ExtCnt * TTI.getArithmeticInstrCost(
1896                    Instruction::And, ScalarDstTy, CostKind,
1899       (ExtCnt - ExtLane0) *
1901                    Instruction::LShr, ScalarDstTy, CostKind,
1904   if (ScalarCost > VectorCost)
1905     return false;
1906
     // Freeze the source if it may be poison: the scalar shifts would smear
     // poison across all lanes otherwise.
1907   Value *ScalarV = Ext->getOperand(0);
1908   if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
1909                                  &DT))
1910     ScalarV = Builder.CreateFreeze(ScalarV);
1911   ScalarV = Builder.CreateBitCast(
1912       ScalarV,
1913       IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
1914   uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
1915   uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
1916   for (User *U : Ext->users()) {
1917     auto *Extract = cast<ExtractElementInst>(U);
1918     uint64_t Idx =
1919         cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
1920     Value *LShr = Builder.CreateLShr(ScalarV, Idx * SrcEltSizeInBits);
1921     Value *And = Builder.CreateAnd(LShr, EltBitMask);
1922     U->replaceAllUsesWith(And);
1923   }
1924   return true;
1925 }
1926
1927 /// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
1928 /// to "(bitcast (concat X, Y))"
1929 /// where X/Y are bitcasted from i1 mask vectors.
1930 bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
1931   Type *Ty = I.getType();
1932   if (!Ty->isIntegerTy())
1933     return false;
1934
1935   // TODO: Add big endian test coverage
1936   if (DL->isBigEndian())
1937     return false;
1938
1939   // Restrict to disjoint cases so the mask vectors aren't overlapping.
1940   Instruction *X, *Y;
// NOTE(review): this listing drops original lines 1941, 1950, 1958, 1995,
// 1997, 2000, 2003, 2006 (the disjoint-or match, m_Shl patterns, and
// cast-context/shuffle-kind arguments) -- confirm against the upstream file.
1942     return false;
1943
1944   // Allow both sources to contain shl, to handle more generic pattern:
1945   // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
1946   Value *SrcX;
1947   uint64_t ShAmtX = 0;
1948   if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
1949       !match(X, m_OneUse(
1951                     m_ConstantInt(ShAmtX)))))
1952     return false;
1953
1954   Value *SrcY;
1955   uint64_t ShAmtY = 0;
1956   if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
1957       !match(Y, m_OneUse(
1959                     m_ConstantInt(ShAmtY)))))
1960     return false;
1961
1962   // Canonicalize larger shift to the RHS.
1963   if (ShAmtX > ShAmtY) {
1964     std::swap(X, Y);
1965     std::swap(SrcX, SrcY);
1966     std::swap(ShAmtX, ShAmtY);
1967   }
1968
1969   // Ensure both sources are matching vXi1 bool mask types, and that the shift
1970   // difference is the mask width so they can be easily concatenated together.
1971   uint64_t ShAmtDiff = ShAmtY - ShAmtX;
1972   unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
1973   unsigned BitWidth = Ty->getPrimitiveSizeInBits();
1974   auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
1975   if (!MaskTy || SrcX->getType() != SrcY->getType() ||
1976       !MaskTy->getElementType()->isIntegerTy(1) ||
1977       MaskTy->getNumElements() != ShAmtDiff ||
1978       MaskTy->getNumElements() > (BitWidth / 2))
1979     return false;
1980
1981   auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
1982   auto *ConcatIntTy =
1983       Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
1984   auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
1985
     // Identity mask 0..2N-1: selects all lanes of X followed by all of Y.
1986   SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
1987   std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
1988
1989   // TODO: Is it worth supporting multi use cases?
1990   InstructionCost OldCost = 0;
1991   OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
1992   OldCost +=
1993       NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
1994   OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
1996   OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
1998
1999   InstructionCost NewCost = 0;
2001                                 MaskTy, ConcatMask, CostKind);
2002   NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2004   if (Ty != ConcatIntTy)
2005     NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2007   if (ShAmtX > 0)
2008     NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2009
2010   LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2011                     << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2012                     << "\n");
2013
2014   if (NewCost > OldCost)
2015     return false;
2016
2017   // Build bool mask concatenation, bitcast back to scalar integer, and perform
2018   // any residual zero-extension or shifting.
2019   Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2020   Worklist.pushValue(Concat);
2021
2022   Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2023
2024   if (Ty != ConcatIntTy) {
2025     Worklist.pushValue(Result);
2026     Result = Builder.CreateZExt(Result, Ty);
2027   }
2028
     // The residual shift is the smaller of the two original shifts (ShAmtX
     // after the canonicalization above).
2029   if (ShAmtX > 0) {
2030     Worklist.pushValue(Result);
2031     Result = Builder.CreateShl(Result, ShAmtX);
2032   }
2033
2034   replaceValue(I, *Result);
2035   return true;
2036 }
2037
2038 /// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2039 ///           -->  "binop (shuffle), (shuffle)".
2040 bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2041   BinaryOperator *BinOp;
2042   ArrayRef<int> OuterMask;
2043   if (!match(&I,
2044              m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2045     return false;
2046
2047   // Don't introduce poison into div/rem.
2048   if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2049     return false;
2050
// NOTE(review): this listing drops original lines 2106, 2122 and 2126
// (shuffle-kind arguments of the getShuffleCost calls below) -- confirm
// against the upstream file.
2051   Value *Op00, *Op01, *Op10, *Op11;
2052   ArrayRef<int> Mask0, Mask1;
2053   bool Match0 =
2054       match(BinOp->getOperand(0),
2055             m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2056   bool Match1 =
2057       match(BinOp->getOperand(1),
2058             m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2059   if (!Match0 && !Match1)
2060     return false;
2061
     // Unmatched operands fall back to an identity "shuffle" of themselves.
2062   Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2063   Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2064   Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2065   Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2066
2067   Instruction::BinaryOps Opcode = BinOp->getOpcode();
2068   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2069   auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2070   auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2071   auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2072   if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2073     return false;
2074
2075   unsigned NumSrcElts = BinOpTy->getNumElements();
2076
2077   // Don't accept shuffles that reference the second operand in
2078   // div/rem or if its an undef arg.
2079   if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2080       any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2081     return false;
2082
2083   // Merge outer / inner (or identity if no match) shuffles.
2084   SmallVector<int> NewMask0, NewMask1;
2085   for (int M : OuterMask) {
2086     if (M < 0 || M >= (int)NumSrcElts) {
2087       NewMask0.push_back(PoisonMaskElem);
2088       NewMask1.push_back(PoisonMaskElem);
2089     } else {
2090       NewMask0.push_back(Match0 ? Mask0[M] : M);
2091       NewMask1.push_back(Match1 ? Mask1[M] : M);
2092     }
2093   }
2094
     // Identity merged masks let us skip emitting (and costing) that shuffle.
2095   unsigned NumOpElts = Op0Ty->getNumElements();
2096   bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2097       all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2098       ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2099   bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2100       all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2101       ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2102
2103   // Try to merge shuffles across the binop if the new shuffles are not costly.
2104   InstructionCost OldCost =
2105       TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2107                          BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2108   if (Match0)
2109     OldCost += TTI.getShuffleCost(
2110         TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2111         0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2112   if (Match1)
2113     OldCost += TTI.getShuffleCost(
2114         TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2115         0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2116
2117   InstructionCost NewCost =
2118       TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2119
2120   if (!IsIdentity0)
2121     NewCost +=
2123                            Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2124   if (!IsIdentity1)
2125     NewCost +=
2127                            Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2128
2129   LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2130                     << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2131                     << "\n");
2132
2133   // If costs are equal, still fold as we reduce instruction count.
2134   if (NewCost > OldCost)
2135     return false;
2136
2137   Value *LHS =
2138       IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2139   Value *RHS =
2140       IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2141   Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2142
2143   // Intersect flags from the old binops.
2144   if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2145     NewInst->copyIRFlags(BinOp);
2146
2147   Worklist.pushValue(LHS);
2148   Worklist.pushValue(RHS);
2149   replaceValue(I, *NewBO);
2150   return true;
2151 }
2152
2153 /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2154 /// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2155 bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2156   ArrayRef<int> OldMask;
2157   Instruction *LHS, *RHS;
2158   if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2159                            m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2160     return false;
2161
2162   // TODO: Add support for addlike etc.
2163   if (LHS->getOpcode() != RHS->getOpcode())
2164     return false;
2165
// NOTE(review): this listing drops original lines 2168-2169 (declarations of
// the compare predicates PredLHS/PredRHS), 2202/2205/2210/2213 (shuffle-kind
// selections SK0/SK1) and 2219-2221 (OldCost terms) -- confirm upstream.
2166   Value *X, *Y, *Z, *W;
2167   bool IsCommutative = false;
2170   if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2171       match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2172     auto *BO = cast<BinaryOperator>(LHS);
2173     // Don't introduce poison into div/rem.
2174     if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2175       return false;
2176     IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2177   } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2178              match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2179              (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2180     IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2181   } else
2182     return false;
2183
2184   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2185   auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2186   auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2187   if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2188     return false;
2189
2190   unsigned NumSrcElts = BinOpTy->getNumElements();
2191
2192   // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2193   if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2194     std::swap(X, Y);
2195
2196   auto ConvertToUnary = [NumSrcElts](int &M) {
2197     if (M >= (int)NumSrcElts)
2198       M -= NumSrcElts;
2199   };
2200
     // NewMask0 selects between {X, Z}, NewMask1 between {Y, W} (see the
     // CreateShuffleVector calls at the bottom). Shared operands collapse a
     // two-source shuffle into a single-source one.
2201   SmallVector<int> NewMask0(OldMask);
2203   if (X == Z) {
2204     llvm::for_each(NewMask0, ConvertToUnary);
2206     Z = PoisonValue::get(BinOpTy);
2207   }
2208
2209   SmallVector<int> NewMask1(OldMask);
2211   if (Y == W) {
2212     llvm::for_each(NewMask1, ConvertToUnary);
2214     W = PoisonValue::get(BinOpTy);
2215   }
2216
2217   // Try to replace a binop with a shuffle if the shuffle is not costly.
2218   InstructionCost OldCost =
2222                          BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2223                          &I);
2224
2225   // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2226   // where one use shuffles have gotten split across the binop/cmp. These
2227   // often allow a major reduction in total cost that wouldn't happen as
2228   // individual folds.
2229   auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2230                         TTI::TargetCostKind CostKind) -> bool {
2231     Value *InnerOp;
2232     ArrayRef<int> InnerMask;
2233     if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2234                                      m_Mask(InnerMask)))) &&
2235         InnerOp->getType() == Op->getType() &&
2236         all_of(InnerMask,
2237                [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
         // Compose the inner mask into the outer one, preserving the source
         // half (Offset) each lane came from.
2238       for (int &M : Mask)
2239         if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2240           M = InnerMask[M - Offset];
2241           M = 0 <= M ? M + Offset : M;
2242         }
2243       OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2244       Op = InnerOp;
2245       return true;
2246     }
2247     return false;
2248   };
2249   bool ReducedInstCount = false;
2250   ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2251   ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2252   ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2253   ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2254
2255   auto *ShuffleCmpTy =
2256       FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2257   InstructionCost NewCost =
2258       TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2259                          nullptr, {X, Z}) +
2260       TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2261                          nullptr, {Y, W});
2262
     // BAD_ICMP_PREDICATE doubles as the "this is a binop, not a cmp" marker.
2263   if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2264     NewCost +=
2265         TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2266   } else {
2267     NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2268                                       ShuffleDstTy, PredLHS, CostKind);
2269   }
2270
2271   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2272                     << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2273                     << "\n");
2274
2275   // If either shuffle will constant fold away, then fold for the same cost as
2276   // we will reduce the instruction count.
2277   ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2278                       (isa<Constant>(Y) && isa<Constant>(W));
2279   if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2280     return false;
2281
2282   Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2283   Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2284   Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2285                      ? Builder.CreateBinOp(
2286                            cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2287                      : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2288
2289   // Intersect flags from the old binops.
2290   if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2291     NewInst->copyIRFlags(LHS);
2292     NewInst->andIRFlags(RHS);
2293   }
2294
2295   Worklist.pushValue(Shuf0);
2296   Worklist.pushValue(Shuf1);
2297   replaceValue(I, *NewBO);
2298   return true;
2299 }
2300
2301 /// Try to convert,
2302 /// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2303 /// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2304 bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
// NOTE(review): this listing drops original lines 2305 (declaring `Mask`),
// 2328/2330/2333/2338/2348 (shuffle-kind `SK`, OldCost/NewCost initializers
// and predicate/cost arguments) -- confirm against the upstream file.
2306   Value *C1, *T1, *F1, *C2, *T2, *F2;
2307   if (!match(&I, m_Shuffle(
2308                      m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2309                      m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2310                      m_Mask(Mask))))
2311     return false;
2312
     // Both selects must use vector conditions of the same fixed type so the
     // conditions themselves can be shuffled together.
2313   auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2314   auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2315   if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2316     return false;
2317
2318   auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2319   auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2320   // SelectInsts must have the same FMF.
2321   if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2322       ((SI0FOp != nullptr) &&
2323        (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2324     return false;
2325
2326   auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2327   auto *DstVecTy = cast<FixedVectorType>(I.getType());
2329   auto SelOp = Instruction::Select;
2331       SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2332   OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2334   OldCost +=
2335       TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2336                          {I.getOperand(0), I.getOperand(1)}, &I);
2337
     // New form: three shuffles (cond, true, false) plus one wider select.
2339       SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2340       Mask, CostKind, 0, nullptr, {C1, C2});
2341   NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2342                                 nullptr, {T1, T2});
2343   NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2344                                 nullptr, {F1, F2});
2345   auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2346       toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2347   NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2349
2350   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2351                     << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2352                     << "\n");
2353   if (NewCost > OldCost)
2354     return false;
2355
2356   Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2357   Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2358   Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2359   Value *NewSel;
2360   // We presuppose that the SelectInsts have the same FMF.
2361   if (SI0FOp)
2362     NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2363                                      SI0FOp->getFastMathFlags());
2364   else
2365     NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2366
2367   Worklist.pushValue(ShuffleCmp);
2368   Worklist.pushValue(ShuffleTrue);
2369   Worklist.pushValue(ShuffleFalse);
2370   replaceValue(I, *NewSel);
2371   return true;
2372 }
2373
2374 /// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2375 /// into "castop (shuffle)".
2376 bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
// NOTE(review): this listing drops original lines 2438/2441/2444/2448/2451
// (cast-context-hint and shuffle-kind arguments of the cost calls below) --
// confirm against the upstream file.
2377   Value *V0, *V1;
2378   ArrayRef<int> OldMask;
2379   if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2380     return false;
2381
2382   auto *C0 = dyn_cast<CastInst>(V0);
2383   auto *C1 = dyn_cast<CastInst>(V1);
2384   if (!C0 || !C1)
2385     return false;
2386
2387   Instruction::CastOps Opcode = C0->getOpcode();
2388   if (C0->getSrcTy() != C1->getSrcTy())
2389     return false;
2390
2391   // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2392   if (Opcode != C1->getOpcode()) {
       // m_SExtLike accepts both sext and zext-with-nneg, which agree on
       // non-negative inputs, so the merged cast can be a plain sext.
2393     if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2394       Opcode = Instruction::SExt;
2395     else
2396       return false;
2397   }
2398
2399   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2400   auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2401   auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2402   if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2403     return false;
2404
2405   unsigned NumSrcElts = CastSrcTy->getNumElements();
2406   unsigned NumDstElts = CastDstTy->getNumElements();
2407   assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2408          "Only bitcasts expected to alter src/dst element counts");
2409
2410   // Check for bitcasting of unscalable vector types.
2411   // e.g. <32 x i40> -> <40 x i32>
2412   if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2413       (NumDstElts % NumSrcElts) != 0)
2414     return false;
2415
2416   SmallVector<int, 16> NewMask;
2417   if (NumSrcElts >= NumDstElts) {
2418     // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2419     // always be expanded to the equivalent form choosing narrower elements.
2420     assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2421     unsigned ScaleFactor = NumSrcElts / NumDstElts;
2422     narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2423   } else {
2424     // The bitcast is from narrow elements to wide elements. The shuffle mask
2425     // must choose consecutive elements to allow casting first.
2426     assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2427     unsigned ScaleFactor = NumDstElts / NumSrcElts;
2428     if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2429       return false;
2430   }
2431
2432   auto *NewShuffleDstTy =
2433       FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2434
2435   // Try to replace a castop with a shuffle if the shuffle is not costly.
2436   InstructionCost CostC0 =
2437       TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2439   InstructionCost CostC1 =
2440       TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2442   InstructionCost OldCost = CostC0 + CostC1;
2443   OldCost +=
2445                          CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
2446
2447   InstructionCost NewCost =
2449                          CastSrcTy, NewMask, CostKind);
2450   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
     // Multi-use casts survive the fold, so their cost stays in the new form.
2452   if (!C0->hasOneUse())
2453     NewCost += CostC0;
2454   if (!C1->hasOneUse())
2455     NewCost += CostC1;
2456
2457   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2458                     << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2459                     << "\n");
2460   if (NewCost > OldCost)
2461     return false;
2462
2463   Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
2464                                             C1->getOperand(0), NewMask);
2465   Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2466
2467   // Intersect flags from the old casts.
2468   if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2469     NewInst->copyIRFlags(C0);
2470     NewInst->andIRFlags(C1);
2471   }
2472
2473   Worklist.pushValue(Shuf);
2474   replaceValue(I, *Cast);
2475   return true;
2476 }
2477
2478/// Try to convert any of:
2479/// "shuffle (shuffle x, y), (shuffle y, x)"
2480/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2481/// "shuffle (shuffle x, undef), y"
2482/// "shuffle x, (shuffle y, undef)"
2483/// into "shuffle x, y".
2484bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
  // Match the outer shuffle: I == shuffle OuterV0, OuterV1, OuterMask.
2485 ArrayRef<int> OuterMask;
2486 Value *OuterV0, *OuterV1;
2487 if (!match(&I,
2488 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2489 return false;
2490
  // Match each outer operand against an inner shuffle; at least one side must
  // itself be a shuffle for the fold to apply.
2491 ArrayRef<int> InnerMask0, InnerMask1;
2492 Value *X0, *X1, *Y0, *Y1;
2493 bool Match0 =
2494 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2495 bool Match1 =
2496 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2497 if (!Match0 && !Match1)
2498 return false;
2499
2500 // If the outer shuffle is a permute, then create a fake inner all-poison
2501 // shuffle. This is easier than accounting for length-changing shuffles below.
2502 SmallVector<int, 16> PoisonMask1;
2503 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2504 X1 = X0;
2505 Y1 = Y0;
2506 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2507 InnerMask1 = PoisonMask1;
2508 Match1 = true; // fake match
2509 }
2510
  // For an unmatched side, treat the outer operand itself as both of its
  // "inner" sources so the per-lane loop below has uniform handling.
2511 X0 = Match0 ? X0 : OuterV0;
2512 Y0 = Match0 ? Y0 : OuterV0;
2513 X1 = Match1 ? X1 : OuterV1;
2514 Y1 = Match1 ? Y1 : OuterV1;
2515 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2516 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2517 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2518 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2519 X0->getType() != X1->getType())
2520 return false;
2521
2522 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2523 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2524
2525 // Attempt to merge shuffles, matching up to 2 source operands.
2526 // Replace index to a poison arg with PoisonMaskElem.
2527 // Bail if either inner masks reference an undef arg.
2528 SmallVector<int, 16> NewMask(OuterMask);
2529 Value *NewX = nullptr, *NewY = nullptr;
2530 for (int &M : NewMask) {
2531 Value *Src = nullptr;
2532 if (0 <= M && M < (int)NumImmElts) {
      // Lane comes from the first outer operand - translate through InnerMask0.
2533 Src = OuterV0;
2534 if (Match0) {
2535 M = InnerMask0[M];
2536 Src = M >= (int)NumSrcElts ? Y0 : X0;
2537 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2538 }
2539 } else if (M >= (int)NumImmElts) {
      // Lane comes from the second outer operand - translate through InnerMask1.
2540 Src = OuterV1;
2541 M -= NumImmElts;
2542 if (Match1) {
2543 M = InnerMask1[M];
2544 Src = M >= (int)NumSrcElts ? Y1 : X1;
2545 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2546 }
2547 }
2548 if (Src && M != PoisonMaskElem) {
2549 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2550 if (isa<UndefValue>(Src)) {
2551 // We've referenced an undef element - if it's poison, update the shuffle
2552 // mask, else bail.
2553 if (!isa<PoisonValue>(Src))
2554 return false;
2555 M = PoisonMaskElem;
2556 continue;
2557 }
      // Assign the (at most two) distinct live sources to NewX/NewY; a third
      // distinct source means we cannot express the result as one shuffle.
2558 if (!NewX || NewX == Src) {
2559 NewX = Src;
2560 continue;
2561 }
2562 if (!NewY || NewY == Src) {
2563 M += NumSrcElts;
2564 NewY = Src;
2565 continue;
2566 }
2567 return false;
2568 }
2569 }
2570
  // NOTE(review): this returns a Value* from a bool-returning function; the
  // implicit pointer-to-bool conversion yields 'true' without any call to
  // replaceValue(I, ...) - confirm this is the intended behavior.
2571 if (!NewX)
2572 return PoisonValue::get(ShuffleDstTy);
2573 if (!NewY)
2574 NewY = PoisonValue::get(ShuffleSrcTy);
2575
2576 // Have we folded to an Identity shuffle?
2577 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2578 replaceValue(I, *NewX);
2579 return true;
2580 }
2581
2582 // Try to merge the shuffles if the new shuffle is not costly.
2583 InstructionCost InnerCost0 = 0;
2584 if (Match0)
2585 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2586
2587 InstructionCost InnerCost1 = 0;
2588 if (Match1)
2589 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2590
2592
  // Old cost: both inner shuffles (when matched) plus the outer shuffle.
2593 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2594
2595 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2599 InstructionCost NewCost =
2600 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2601 nullptr, {NewX, NewY});
  // If an inner shuffle has other users it must stay alive, so its cost still
  // counts against the new sequence.
2602 if (!OuterV0->hasOneUse())
2603 NewCost += InnerCost0;
2604 if (!OuterV1->hasOneUse())
2605 NewCost += InnerCost1;
2606
2607 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2608 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2609 << "\n");
2610 if (NewCost > OldCost)
2611 return false;
2612
2613 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2614 replaceValue(I, *Shuf);
2615 return true;
2616}
2617
2618/// Try to convert
2619/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2620bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
  // Match a shuffle whose two (single-use) operands are calls to the same
  // trivially-vectorizable intrinsic.
2621 Value *V0, *V1;
2622 ArrayRef<int> OldMask;
2623 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2624 m_Mask(OldMask))))
2625 return false;
2626
2627 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2628 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2629 if (!II0 || !II1)
2630 return false;
2631
2632 Intrinsic::ID IID = II0->getIntrinsicID();
2633 if (IID != II1->getIntrinsicID())
2634 return false;
2635
2636 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2637 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2638 if (!ShuffleDstTy || !II0Ty)
2639 return false;
2640
2641 if (!isTriviallyVectorizable(IID))
2642 return false;
2643
  // Scalar operands must be identical between the two calls - they cannot be
  // shuffled, so the merged call must reuse the shared value.
2644 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2646 II0->getArgOperand(I) != II1->getArgOperand(I))
2647 return false;
2648
2649 InstructionCost OldCost =
2653 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2654
  // New cost: one shuffle per (vector) argument plus a single intrinsic call
  // on the widened/shuffled argument types.
2655 SmallVector<Type *> NewArgsTy;
2656 InstructionCost NewCost = 0;
2657 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2659 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2660 } else {
2661 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2662 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2663 ShuffleDstTy->getNumElements());
2664 NewArgsTy.push_back(ArgTy);
2666 ArgTy, VecTy, OldMask, CostKind);
2667 }
2668 }
2669 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2670 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2671
2672 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2673 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2674 << "\n");
2675
2676 if (NewCost > OldCost)
2677 return false;
2678
  // Build the new argument list: shared scalar args are reused directly,
  // vector args are shuffled pairwise with the original mask.
2679 SmallVector<Value *> NewArgs;
2680 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2682 NewArgs.push_back(II0->getArgOperand(I));
2683 } else {
2684 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2685 II1->getArgOperand(I), OldMask);
2686 NewArgs.push_back(Shuf);
2687 Worklist.pushValue(Shuf);
2688 }
2689 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2690
2691 // Intersect flags from the old intrinsics.
2692 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2693 NewInst->copyIRFlags(II0);
2694 NewInst->andIRFlags(II1);
2695 }
2696
2697 replaceValue(I, *NewIntrinsic);
2698 return true;
2699}
2700
2701using InstLane = std::pair<Use *, int>;
2702
2703static InstLane lookThroughShuffles(Use *U, int Lane) {
2704 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2705 unsigned NumElts =
2706 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2707 int M = SV->getMaskValue(Lane);
2708 if (M < 0)
2709 return {nullptr, PoisonMaskElem};
2710 if (static_cast<unsigned>(M) < NumElts) {
2711 U = &SV->getOperandUse(0);
2712 Lane = M;
2713 } else {
2714 U = &SV->getOperandUse(1);
2715 Lane = M - NumElts;
2716 }
2717 }
2718 return InstLane{U, Lane};
2719}
2720
  // For each tracked lane, step from its instruction to that instruction's
  // Op'th operand, looking through any shufflevectors to find the underlying
  // (Use, Lane) pair. Undef lanes (null Use) stay undef.
2724 for (InstLane IL : Item) {
2725 auto [U, Lane] = IL;
2726 InstLane OpLane =
2727 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2728 Lane)
2729 : InstLane{nullptr, PoisonMaskElem};
2730 NItem.emplace_back(OpLane);
2731 }
2732 return NItem;
2733}
2734
2735/// Detect concat of multiple values into a vector
2737 const TargetTransformInfo &TTI) {
2738 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
2739 unsigned NumElts = Ty->getNumElements();
  // The item must span an integral number (>1) of source vectors, and each
  // source must have more than one element.
2740 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
2741 return false;
2742
2743 // Check that the concat is free, usually meaning that the type will be split
2744 // during legalization.
2745 SmallVector<int, 16> ConcatMask(NumElts * 2);
2746 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2748 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
2749 Ty, ConcatMask, CostKind) != 0)
2750 return false;
2751
2752 unsigned NumSlices = Item.size() / NumElts;
2753 // Currently we generate a tree of shuffles for the concats, which limits us
2754 // to a power2.
2755 if (!isPowerOf2_32(NumSlices))
2756 return false;
  // Each slice must be a full identity run (lane i == element i) over a
  // single source vector of the expected type.
2757 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
2758 Use *SliceV = Item[Slice * NumElts].first;
2759 if (!SliceV || SliceV->get()->getType() != Ty)
2760 return false;
2761 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
2762 auto [V, Lane] = Item[Slice * NumElts + Elt];
2763 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
2764 return false;
2765 }
2766 }
2767 return true;
2768}
2769
2771 const SmallPtrSet<Use *, 4> &IdentityLeafs,
2772 const SmallPtrSet<Use *, 4> &SplatLeafs,
2773 const SmallPtrSet<Use *, 4> &ConcatLeafs,
2774 IRBuilderBase &Builder,
2775 const TargetTransformInfo *TTI) {
2776 auto [FrontU, FrontLane] = Item.front();
2777
  // Identity leaf: the lanes already line up with the source - reuse it as is.
2778 if (IdentityLeafs.contains(FrontU)) {
2779 return FrontU->get();
2780 }
  // Splat leaf: broadcast the front lane across the result width.
2781 if (SplatLeafs.contains(FrontU)) {
2782 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
2783 return Builder.CreateShuffleVector(FrontU->get(), Mask);
2784 }
  // Concat leaf: rebuild the wide vector by concatenating the slice sources
  // with a balanced tree of identity shuffles (slice count is a power of 2).
2785 if (ConcatLeafs.contains(FrontU)) {
2786 unsigned NumElts =
2787 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
2788 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
2789 for (unsigned S = 0; S < Values.size(); ++S)
2790 Values[S] = Item[S * NumElts].first->get();
2791
2792 while (Values.size() > 1) {
2793 NumElts *= 2;
2794 SmallVector<int, 16> Mask(NumElts, 0);
2795 std::iota(Mask.begin(), Mask.end(), 0);
2796 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
2797 for (unsigned S = 0; S < NewValues.size(); ++S)
2798 NewValues[S] =
2799 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
2800 Values = NewValues;
2801 }
2802 return Values[0];
2803 }
2804
  // Interior node: recreate the front instruction on recursively-generated
  // operands. For intrinsics, scalar operands are passed through unchanged.
2805 auto *I = cast<Instruction>(FrontU->get());
2806 auto *II = dyn_cast<IntrinsicInst>(I);
2807 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
2808 SmallVector<Value *> Ops(NumOps);
2809 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
2810 if (II &&
2811 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
2812 Ops[Idx] = II->getOperand(Idx);
2813 continue;
2814 }
2816 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
2817 Builder, TTI);
2818 }
2819
  // Gather the per-lane source instructions so IR flags can be intersected
  // across all of them on the freshly created instruction.
2820 SmallVector<Value *, 8> ValueList;
2821 for (const auto &Lane : Item)
2822 if (Lane.first)
2823 ValueList.push_back(Lane.first->get());
2824
2825 Type *DstTy =
2826 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
2827 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
2828 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
2829 Ops[0], Ops[1]);
2830 propagateIRFlags(Value, ValueList);
2831 return Value;
2832 }
2833 if (auto *CI = dyn_cast<CmpInst>(I)) {
2834 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
2835 propagateIRFlags(Value, ValueList);
2836 return Value;
2837 }
2838 if (auto *SI = dyn_cast<SelectInst>(I)) {
2839 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
2840 propagateIRFlags(Value, ValueList);
2841 return Value;
2842 }
2843 if (auto *CI = dyn_cast<CastInst>(I)) {
2844 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
2845 propagateIRFlags(Value, ValueList);
2846 return Value;
2847 }
2848 if (II) {
2849 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
2850 propagateIRFlags(Value, ValueList);
2851 return Value;
2852 }
2853 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
2854 auto *Value =
2855 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
2856 propagateIRFlags(Value, ValueList);
2857 return Value;
2858}
2859
2860// Starting from a shuffle, look up through operands tracking the shuffled index
2861// of each lane. If we can simplify away the shuffles to identities then
2862// do so.
2863bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
2864 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
2865 if (!Ty || I.use_empty())
2866 return false;
2867
  // Seed one (Use, Lane) pair per result lane, looking through the shuffle
  // chain starting at I's first use.
2868 SmallVector<InstLane> Start(Ty->getNumElements());
2869 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
2870 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
2871
2873 Worklist.push_back(Start);
2874 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
2875 unsigned NumVisited = 0;
2876
  // Breadth of the search is bounded by MaxInstrsToScan to keep compile time
  // in check.
2877 while (!Worklist.empty()) {
2878 if (++NumVisited > MaxInstrsToScan)
2879 return false;
2880
2881 SmallVector<InstLane> Item = Worklist.pop_back_val();
2882 auto [FrontU, FrontLane] = Item.front();
2883
2884 // If we found an undef first lane then bail out to keep things simple.
2885 if (!FrontU)
2886 return false;
2887
2888 // Helper to peek through bitcasts to the same value.
2889 auto IsEquiv = [&](Value *X, Value *Y) {
2890 return X->getType() == Y->getType() &&
2892 };
2893
2894 // Look for an identity value.
2895 if (FrontLane == 0 &&
2896 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
2897 Ty->getNumElements() &&
2898 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
2899 Value *FrontV = Item.front().first->get();
2900 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
2901 E.value().second == (int)E.index());
2902 })) {
2903 IdentityLeafs.insert(FrontU);
2904 continue;
2905 }
2906 // Look for constants, for the moment only supporting constant splats.
2907 if (auto *C = dyn_cast<Constant>(FrontU);
2908 C && C->getSplatValue() &&
2909 all_of(drop_begin(Item), [Item](InstLane &IL) {
2910 Value *FrontV = Item.front().first->get();
2911 Use *U = IL.first;
2912 return !U || (isa<Constant>(U->get()) &&
2913 cast<Constant>(U->get())->getSplatValue() ==
2914 cast<Constant>(FrontV)->getSplatValue());
2915 })) {
2916 SplatLeafs.insert(FrontU);
2917 continue;
2918 }
2919 // Look for a splat value.
2920 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
2921 auto [FrontU, FrontLane] = Item.front();
2922 auto [U, Lane] = IL;
2923 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
2924 })) {
2925 SplatLeafs.insert(FrontU);
2926 continue;
2927 }
2928
2929 // We need each element to be the same type of value, and check that each
2930 // element has a single use.
2931 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
2932 Value *FrontV = Item.front().first->get();
2933 if (!IL.first)
2934 return true;
2935 Value *V = IL.first->get();
2936 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
2937 return false;
2938 if (V->getValueID() != FrontV->getValueID())
2939 return false;
2940 if (auto *CI = dyn_cast<CmpInst>(V))
2941 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
2942 return false;
2943 if (auto *CI = dyn_cast<CastInst>(V))
2944 if (CI->getSrcTy()->getScalarType() !=
2945 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
2946 return false;
2947 if (auto *SI = dyn_cast<SelectInst>(V))
2948 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
2949 SI->getOperand(0)->getType() !=
2950 cast<SelectInst>(FrontV)->getOperand(0)->getType())
2951 return false;
2952 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
2953 return false;
2954 auto *II = dyn_cast<IntrinsicInst>(V);
2955 return !II || (isa<IntrinsicInst>(FrontV) &&
2956 II->getIntrinsicID() ==
2957 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
2958 !II->hasOperandBundles());
2959 };
2960 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
2961 // Check the operator is one that we support.
2962 if (isa<BinaryOperator, CmpInst>(FrontU)) {
2963 // We exclude div/rem in case they hit UB from poison lanes.
2964 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
2965 BO && BO->isIntDivRem())
2966 return false;
2969 continue;
2971 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
2973 continue;
2974 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
2975 // TODO: Handle vector widening/narrowing bitcasts.
2976 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
2977 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
2978 if (DstTy && SrcTy &&
2979 SrcTy->getNumElements() == DstTy->getNumElements()) {
2981 continue;
2982 }
2983 } else if (isa<SelectInst>(FrontU)) {
2987 continue;
2988 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
2989 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
2990 !II->hasOperandBundles()) {
        // Scalar intrinsic operands cannot be recursed into - they must be
        // identical across all lanes instead.
2991 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
2992 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
2993 &TTI)) {
2994 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
2995 Value *FrontV = Item.front().first->get();
2996 Use *U = IL.first;
2997 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
2998 cast<Instruction>(FrontV)->getOperand(Op));
2999 }))
3000 return false;
3001 continue;
3002 }
3004 }
3005 continue;
3006 }
3007 }
3008
3009 if (isFreeConcat(Item, CostKind, TTI)) {
3010 ConcatLeafs.insert(FrontU);
3011 continue;
3012 }
3013
3014 return false;
3015 }
3016
  // A single visited item means the shuffle chain was trivial - nothing to do.
3017 if (NumVisited <= 1)
3018 return false;
3019
3020 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3021
3022 // If we got this far, we know the shuffles are superfluous and can be
3023 // removed. Scan through again and generate the new tree of instructions.
3024 Builder.SetInsertPoint(&I);
3025 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3026 ConcatLeafs, Builder, &TTI);
3027 replaceValue(I, *V);
3028 return true;
3029}
3030
3031/// Given a commutative reduction, the order of the input lanes does not alter
3032/// the results. We can use this to remove certain shuffles feeding the
3033/// reduction, removing the need to shuffle at all.
3034bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3035 auto *II = dyn_cast<IntrinsicInst>(&I);
3036 if (!II)
3037 return false;
  // Only commutative/associative reductions are lane-order invariant.
3038 switch (II->getIntrinsicID()) {
3039 case Intrinsic::vector_reduce_add:
3040 case Intrinsic::vector_reduce_mul:
3041 case Intrinsic::vector_reduce_and:
3042 case Intrinsic::vector_reduce_or:
3043 case Intrinsic::vector_reduce_xor:
3044 case Intrinsic::vector_reduce_smin:
3045 case Intrinsic::vector_reduce_smax:
3046 case Intrinsic::vector_reduce_umin:
3047 case Intrinsic::vector_reduce_umax:
3048 break;
3049 default:
3050 return false;
3051 }
3052
3053 // Find all the inputs when looking through operations that do not alter the
3054 // lane order (binops, for example). Currently we look for a single shuffle,
3055 // and can ignore splat values.
3056 std::queue<Value *> Worklist;
3058 ShuffleVectorInst *Shuffle = nullptr;
3059 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3060 Worklist.push(Op);
3061
3062 while (!Worklist.empty()) {
3063 Value *CV = Worklist.front();
3064 Worklist.pop();
3065 if (Visited.contains(CV))
3066 continue;
3067
3068 // Splats don't change the order, so can be safely ignored.
3069 if (isSplatValue(CV))
3070 continue;
3071
3072 Visited.insert(CV);
3073
3074 if (auto *CI = dyn_cast<Instruction>(CV)) {
3075 if (CI->isBinaryOp()) {
3076 for (auto *Op : CI->operand_values())
3077 Worklist.push(Op);
3078 continue;
3079 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
        // Only a single shuffle may feed the reduction.
3080 if (Shuffle && Shuffle != SV)
3081 return false;
3082 Shuffle = SV;
3083 continue;
3084 }
3085 }
3086
3087 // Anything else is currently an unknown node.
3088 return false;
3089 }
3090
3091 if (!Shuffle)
3092 return false;
3093
3094 // Check all uses of the binary ops and shuffles are also included in the
3095 // lane-invariant operations (Visited should be the list of lanewise
3096 // instructions, including the shuffle that we found).
3097 for (auto *V : Visited)
3098 for (auto *U : V->users())
3099 if (!Visited.contains(U) && U != &I)
3100 return false;
3101
3102 FixedVectorType *VecType =
3103 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3104 if (!VecType)
3105 return false;
3106 FixedVectorType *ShuffleInputType =
3107 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3108 if (!ShuffleInputType)
3109 return false;
3110 unsigned NumInputElts = ShuffleInputType->getNumElements();
3111
3112 // Find the mask from sorting the lanes into order. This is most likely to
3113 // become an identity or concat mask. Undef elements are pushed to the end.
3114 SmallVector<int> ConcatMask;
3115 Shuffle->getShuffleMask(ConcatMask);
3116 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3117 bool UsesSecondVec =
3118 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3119
3121 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3122 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3124 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3125 ShuffleInputType, ConcatMask, CostKind);
3126
3127 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3128 << "\n");
3129 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3130 << "\n");
3131 bool MadeChanges = false;
  // Replace the shuffle with the cheaper, sorted-mask equivalent if
  // profitable.
3132 if (NewCost < OldCost) {
3133 Builder.SetInsertPoint(Shuffle);
3134 Value *NewShuffle = Builder.CreateShuffleVector(
3135 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3136 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3137 replaceValue(*Shuffle, *NewShuffle);
3138 return true;
3139 }
3140
3141 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3142 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3143 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3144 return MadeChanges;
3145}
3146
3147/// For a given chain of patterns of the following form:
3148///
3149/// ```
3150/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3151///
3152/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3153/// ty1> %1)
3154/// OR
3155/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3156///
3157/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3158/// ...
3159/// ...
3160/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3161/// 3), <n x ty1> %(i - 2)
3162/// OR
3163/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3164///
3165/// %(i) = extractelement <n x ty1> %(i - 1), 0
3166/// ```
3167///
3168/// Where:
3169/// `mask` follows a partition pattern:
3170///
3171/// Ex:
3172/// [n = 8, p = poison]
3173///
3174/// 4 5 6 7 | p p p p
3175/// 2 3 | p p p p p p
3176/// 1 | p p p p p p p
3177///
3178/// For powers of 2, there's a consistent pattern, but for other cases
3179/// the parity of the current half value at each step decides the
3180/// next partition half (see `ExpectedParityMask` for more logical details
3181/// in generalising this).
3182///
3183/// Ex:
3184/// [n = 6]
3185///
3186/// 3 4 5 | p p p
3187/// 1 2 | p p p p
3188/// 1 | p p p p p
3189bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3190 // Going bottom-up for the pattern.
3191 std::queue<Value *> InstWorklist;
3192 InstructionCost OrigCost = 0;
3193
3194 // Common instruction operation after each shuffle op.
3195 std::optional<unsigned int> CommonCallOp = std::nullopt;
3196 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3197
  // The chain must strictly alternate between a call/binop and a shuffle.
3198 bool IsFirstCallOrBinInst = true;
3199 bool ShouldBeCallOrBinInst = true;
3200
3201 // This stores the last used instructions for shuffle/common op.
3202 //
3203 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3204 // instructions from either shuffle/common op.
3205 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3206
  // The pattern must terminate in an extractelement of lane 0.
3207 Value *VecOpEE;
3208 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3209 return false;
3210
3211 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3212 if (!FVT)
3213 return false;
3214
3215 int64_t VecSize = FVT->getNumElements();
3216 if (VecSize < 2)
3217 return false;
3218
3219 // Number of levels would be ~log2(n), considering we always partition
3220 // by half for this fold pattern.
3221 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3222 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3223
3224 // This is how we generalise for all element sizes.
3225 // At each step, if vector size is odd, we need non-poison
3226 // values to cover the dominant half so we don't miss out on any element.
3227 //
3228 // This mask will help us retrieve this as we go from bottom to top:
3229 //
3230 // Mask Set -> N = N * 2 - 1
3231 // Mask Unset -> N = N * 2
3232 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3233 Cur = (Cur + 1) / 2, --Mask) {
3234 if (Cur & 1)
3235 ExpectedParityMask |= (1ll << Mask);
3236 }
3237
3238 InstWorklist.push(VecOpEE);
3239
3240 while (!InstWorklist.empty()) {
3241 Value *CI = InstWorklist.front();
3242 InstWorklist.pop();
3243
3244 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3245 if (!ShouldBeCallOrBinInst)
3246 return false;
3247
3248 if (!IsFirstCallOrBinInst &&
3249 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3250 return false;
3251
3252 // For the first found call/bin op, the vector has to come from the
3253 // extract element op.
3254 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3255 return false;
3256 IsFirstCallOrBinInst = false;
3257
      // All call ops in the chain must share one min/max intrinsic.
3258 if (!CommonCallOp)
3259 CommonCallOp = II->getIntrinsicID();
3260 if (II->getIntrinsicID() != *CommonCallOp)
3261 return false;
3262
3263 switch (II->getIntrinsicID()) {
3264 case Intrinsic::umin:
3265 case Intrinsic::umax:
3266 case Intrinsic::smin:
3267 case Intrinsic::smax: {
3268 auto *Op0 = II->getOperand(0);
3269 auto *Op1 = II->getOperand(1);
3270 PrevVecV[0] = Op0;
3271 PrevVecV[1] = Op1;
3272 break;
3273 }
3274 default:
3275 return false;
3276 }
3277 ShouldBeCallOrBinInst ^= 1;
3278
3280 *CommonCallOp, II->getType(),
3281 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3282 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3283
3284 // We may need a swap here since it can be (a, b) or (b, a)
3285 // and accordingly change as we go up.
3286 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3287 std::swap(PrevVecV[0], PrevVecV[1]);
3288 InstWorklist.push(PrevVecV[1]);
3289 InstWorklist.push(PrevVecV[0]);
3290 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3291 // Similar logic for bin ops.
3292
3293 if (!ShouldBeCallOrBinInst)
3294 return false;
3295
3296 if (!IsFirstCallOrBinInst &&
3297 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3298 return false;
3299
3300 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3301 return false;
3302 IsFirstCallOrBinInst = false;
3303
      // All bin ops in the chain must share one commutative opcode.
3304 if (!CommonBinOp)
3305 CommonBinOp = BinOp->getOpcode();
3306
3307 if (BinOp->getOpcode() != *CommonBinOp)
3308 return false;
3309
3310 switch (*CommonBinOp) {
3311 case BinaryOperator::Add:
3312 case BinaryOperator::Mul:
3313 case BinaryOperator::Or:
3314 case BinaryOperator::And:
3315 case BinaryOperator::Xor: {
3316 auto *Op0 = BinOp->getOperand(0);
3317 auto *Op1 = BinOp->getOperand(1);
3318 PrevVecV[0] = Op0;
3319 PrevVecV[1] = Op1;
3320 break;
3321 }
3322 default:
3323 return false;
3324 }
3325 ShouldBeCallOrBinInst ^= 1;
3326
3327 OrigCost +=
3328 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3329
3330 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3331 std::swap(PrevVecV[0], PrevVecV[1]);
3332 InstWorklist.push(PrevVecV[1]);
3333 InstWorklist.push(PrevVecV[0]);
3334 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3335 // We shouldn't have any null values in the previous vectors,
3336 // if so, there was a mismatch in pattern.
3337 if (ShouldBeCallOrBinInst ||
3338 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3339 return false;
3340
3341 if (SVInst != PrevVecV[1])
3342 return false;
3343
3344 ArrayRef<int> CurMask;
3345 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3346 m_Mask(CurMask))))
3347 return false;
3348
3349 // Subtract the parity mask when checking the condition.
3350 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3351 if (Mask < ShuffleMaskHalf &&
3352 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3353 return false;
3354 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3355 return false;
3356 }
3357
3358 // Update mask values.
3359 ShuffleMaskHalf *= 2;
3360 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3361 ExpectedParityMask >>= 1;
3362
3364 SVInst->getType(), SVInst->getType(),
3365 CurMask, CostKind);
3366
      // Stop once every partition level of the reduction tree was matched.
3367 VisitedCnt += 1;
3368 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3369 break;
3370
3371 ShouldBeCallOrBinInst ^= 1;
3372 } else {
3373 return false;
3374 }
3375 }
3376
3377 // Pattern should end with a shuffle op.
3378 if (ShouldBeCallOrBinInst)
3379 return false;
3380
3381 assert(VecSize != -1 && "Expected Match for Vector Size");
3382
3383 Value *FinalVecV = PrevVecV[0];
3384 if (!FinalVecV)
3385 return false;
3386
3387 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3388
  // Map the common op onto the matching vector-reduce intrinsic.
3389 Intrinsic::ID ReducedOp =
3390 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3391 : getReductionForBinop(*CommonBinOp));
3392 if (!ReducedOp)
3393 return false;
3394
3395 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3397
3398 if (NewCost >= OrigCost)
3399 return false;
3400
3401 auto *ReducedResult =
3402 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3403 replaceValue(I, *ReducedResult);
3404
3405 return true;
3406}
3407
3408/// Determine if its more efficient to fold:
3409/// reduce(trunc(x)) -> trunc(reduce(x)).
3410/// reduce(sext(x)) -> sext(reduce(x)).
3411/// reduce(zext(x)) -> zext(reduce(x)).
3412bool VectorCombine::foldCastFromReductions(Instruction &I) {
3413 auto *II = dyn_cast<IntrinsicInst>(&I);
3414 if (!II)
3415 return false;
3416
  // add/mul only commute with trunc; the bitwise reductions also commute
  // with zext/sext.
3417 bool TruncOnly = false;
3418 Intrinsic::ID IID = II->getIntrinsicID();
3419 switch (IID) {
3420 case Intrinsic::vector_reduce_add:
3421 case Intrinsic::vector_reduce_mul:
3422 TruncOnly = true;
3423 break;
3424 case Intrinsic::vector_reduce_and:
3425 case Intrinsic::vector_reduce_or:
3426 case Intrinsic::vector_reduce_xor:
3427 break;
3428 default:
3429 return false;
3430 }
3431
3432 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3433 Value *ReductionSrc = I.getOperand(0);
3434
  // The cast feeding the reduction must have no other users, or removing it
  // would not pay off.
3435 Value *Src;
3436 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3437 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3438 return false;
3439
3440 auto CastOpc =
3441 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3442
3443 auto *SrcTy = cast<VectorType>(Src->getType());
3444 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3445 Type *ResultTy = I.getType();
3446
  // Compare: (vector cast + reduce) vs (reduce + scalar cast).
3448 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3449 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3451 cast<CastInst>(ReductionSrc));
3452 InstructionCost NewCost =
3453 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3454 CostKind) +
3455 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3457
3458 if (OldCost <= NewCost || !NewCost.isValid())
3459 return false;
3460
3461 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3462 II->getIntrinsicID(), {Src});
3463 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3464 replaceValue(I, *NewCast);
3465 return true;
3466}
3467
3468/// Returns true if this ShuffleVectorInst eventually feeds into a
3469/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3470/// chains of shuffles and binary operators (in any combination/order).
3471/// The search does not go deeper than the given Depth.
  // NOTE(review): the visible body bounds the search by MaxVisited (32) nodes
  // rather than an explicit 'Depth' parameter - confirm the doc comment above
  // still matches the signature.
3473 constexpr unsigned MaxVisited = 32;
3476 bool FoundReduction = false;
3477
  // BFS over the users of the shuffle; only shuffles and binary operators may
  // appear between it and a single reduction intrinsic.
3478 WorkList.push_back(SVI);
3479 while (!WorkList.empty()) {
3480 Instruction *I = WorkList.pop_back_val();
3481 for (User *U : I->users()) {
3482 auto *UI = cast<Instruction>(U);
3483 if (!UI || !Visited.insert(UI).second)
3484 continue;
3485 if (Visited.size() > MaxVisited)
3486 return false;
3487 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3488 // More than one reduction reached
3489 if (FoundReduction)
3490 return false;
3491 switch (II->getIntrinsicID()) {
3492 case Intrinsic::vector_reduce_add:
3493 case Intrinsic::vector_reduce_mul:
3494 case Intrinsic::vector_reduce_and:
3495 case Intrinsic::vector_reduce_or:
3496 case Intrinsic::vector_reduce_xor:
3497 case Intrinsic::vector_reduce_smin:
3498 case Intrinsic::vector_reduce_smax:
3499 case Intrinsic::vector_reduce_umin:
3500 case Intrinsic::vector_reduce_umax:
3501 FoundReduction = true;
3502 continue;
3503 default:
3504 return false;
3505 }
3506 }
3507
3508 if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
3509 return false;
3510
3511 WorkList.emplace_back(UI);
3512 }
3513 }
3514 return FoundReduction;
3515}
3516
3517/// This method looks for groups of shuffles acting on binops, of the form:
3518/// %x = shuffle ...
3519/// %y = shuffle ...
3520/// %a = binop %x, %y
3521/// %b = binop %x, %y
3522/// shuffle %a, %b, selectmask
3523/// We may, especially if the shuffle is wider than legal, be able to convert
3524/// the shuffle to a form where only parts of a and b need to be computed. On
3525/// architectures with no obvious "select" shuffle, this can reduce the total
3526/// number of operations if the target reports them as cheaper.
// NOTE(review): this extraction drops several interior lines wherever the
// embedded numbering is non-consecutive (e.g. 3558, 3596, 3601, 3620,
// 3737-3738, 3741-3742, 3750-3751, 3756, 3761, 3774, 3795) - these look
// like dropped local declarations and call arguments; verify against
// upstream VectorCombine.cpp before editing.
3527bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
// The root must be a select-like shuffle of two distinct binops of type VT.
3528 auto *SVI = cast<ShuffleVectorInst>(&I);
3529 auto *VT = cast<FixedVectorType>(I.getType());
3530 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3531 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3532 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3533 VT != Op0->getType())
3534 return false;
3535
// The four operands feeding the two binops; these are the "input shuffles"
// that the transform will rebuild with packed masks.
3536 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3537 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3538 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3539 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3540 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
// Returns true (i.e. "reject") if an input has users other than the two
// binops, another input shuffle, or trivially dead instructions.
3541 auto checkSVNonOpUses = [&](Instruction *I) {
3542 if (!I || I->getOperand(0)->getType() != VT)
3543 return true;
3544 return any_of(I->users(), [&](User *U) {
3545 return U != Op0 && U != Op1 &&
3546 !(isa<ShuffleVectorInst>(U) &&
3547 (InputShuffles.contains(cast<Instruction>(U)) ||
3548 isInstructionTriviallyDead(cast<Instruction>(U))));
3549 });
3550 };
3551 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3552 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3553 return false;
3554
3555 // Collect all the uses that are shuffles that we can transform together. We
3556 // may not have a single shuffle, but a group that can all be transformed
3557 // together profitably.
3559 auto collectShuffles = [&](Instruction *I) {
3560 for (auto *U : I->users()) {
3561 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3562 if (!SV || SV->getType() != VT)
3563 return false;
3564 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3565 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3566 return false;
3567 if (!llvm::is_contained(Shuffles, SV))
3568 Shuffles.push_back(SV);
3569 }
3570 return true;
3571 };
3572 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3573 return false;
3574 // From a reduction, we need to be processing a single shuffle, otherwise the
3575 // other uses will not be lane-invariant.
3576 if (FromReduction && Shuffles.size() > 1)
3577 return false;
3578
3579 // Add any shuffle uses for the shuffles we have found, to include them in our
3580 // cost calculations.
3581 if (!FromReduction) {
3582 for (ShuffleVectorInst *SV : Shuffles) {
3583 for (auto *U : SV->users()) {
3584 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3585 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3586 Shuffles.push_back(SSV);
3587 }
3588 }
3589 }
3590
3591 // For each of the output shuffles, we try to sort all the first vector
3592 // elements to the beginning, followed by the second array elements at the
3593 // end. If the binops are legalized to smaller vectors, this may reduce total
3594 // number of binops. We compute the ReconstructMask mask needed to convert
3595 // back to the original lane order.
3597 SmallVector<SmallVector<int>> OrigReconstructMasks;
3598 int MaxV1Elt = 0, MaxV2Elt = 0;
3599 unsigned NumElts = VT->getNumElements();
3600 for (ShuffleVectorInst *SVN : Shuffles) {
3602 SVN->getShuffleMask(Mask);
3603
3604 // Check the operands are the same as the original, or reversed (in which
3605 // case we need to commute the mask).
3606 Value *SVOp0 = SVN->getOperand(0);
3607 Value *SVOp1 = SVN->getOperand(1);
// A single-operand shuffle-of-shuffle: fold its mask into Mask so we can
// reason about the underlying Op0/Op1 operands directly.
3608 if (isa<UndefValue>(SVOp1)) {
3609 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3610 SVOp0 = SSV->getOperand(0);
3611 SVOp1 = SSV->getOperand(1);
3612 for (int &Elem : Mask) {
3613 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3614 return false;
3615 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3616 }
3617 }
3618 if (SVOp0 == Op1 && SVOp1 == Op0) {
3619 std::swap(SVOp0, SVOp1);
3621 }
3622 if (SVOp0 != Op0 || SVOp1 != Op1)
3623 return false;
3624
3625 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3626 // take the packed values from Op0/Op1 and reconstructing to the original
3627 // order.
// V1/V2 accumulate (original lane, packed position) pairs for the first
// and second operand respectively; existing entries are reused so each
// source lane is packed only once.
3628 SmallVector<int> ReconstructMask;
3629 for (unsigned I = 0; I < Mask.size(); I++) {
3630 if (Mask[I] < 0) {
3631 ReconstructMask.push_back(-1);
3632 } else if (Mask[I] < static_cast<int>(NumElts)) {
3633 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3634 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3635 return Mask[I] == A.first;
3636 });
3637 if (It != V1.end())
3638 ReconstructMask.push_back(It - V1.begin());
3639 else {
3640 ReconstructMask.push_back(V1.size());
3641 V1.emplace_back(Mask[I], V1.size());
3642 }
3643 } else {
3644 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3645 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3646 return Mask[I] - static_cast<int>(NumElts) == A.first;
3647 });
3648 if (It != V2.end())
3649 ReconstructMask.push_back(NumElts + It - V2.begin());
3650 else {
3651 ReconstructMask.push_back(NumElts + V2.size());
3652 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3653 }
3654 }
3655 }
3656
3657 // For reductions, we know that the lane ordering out doesn't alter the
3658 // result. In-order can help simplify the shuffle away.
3659 if (FromReduction)
3660 sort(ReconstructMask);
3661 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3662 }
3663
3664 // If the Maximum element used from V1 and V2 are not larger than the new
3665 // vectors, the vectors are already packes and performing the optimization
3666 // again will likely not help any further. This also prevents us from getting
3667 // stuck in a cycle in case the costs do not also rule it out.
3668 if (V1.empty() || V2.empty() ||
3669 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3670 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3671 return false;
3672
3673 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3674 // shuffle of another shuffle, or not a shuffle (that is treated like a
3675 // identity shuffle).
3676 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3677 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3678 if (!SV)
3679 return M;
3680 if (isa<UndefValue>(SV->getOperand(1)))
3681 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3682 if (InputShuffles.contains(SSV))
3683 return SSV->getMaskValue(SV->getMaskValue(M));
3684 return SV->getMaskValue(M);
3685 };
3686
3687 // Attempt to sort the inputs my ascending mask values to make simpler input
3688 // shuffles and push complex shuffles down to the uses. We sort on the first
3689 // of the two input shuffle orders, to try and get at least one input into a
3690 // nice order.
3691 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3692 std::pair<int, int> Y) {
3693 int MXA = GetBaseMaskValue(A, X.first);
3694 int MYA = GetBaseMaskValue(A, Y.first);
3695 return MXA < MYA;
3696 };
3697 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3698 return SortBase(SVI0A, A, B);
3699 });
3700 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3701 return SortBase(SVI1A, A, B);
3702 });
3703 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3704 // modified order of the input shuffles.
3705 SmallVector<SmallVector<int>> ReconstructMasks;
3706 for (const auto &Mask : OrigReconstructMasks) {
3707 SmallVector<int> ReconstructMask;
3708 for (int M : Mask) {
// Map a packed position recorded earlier to its index after sorting.
3709 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3710 auto It = find_if(V, [M](auto A) { return A.second == M; });
3711 assert(It != V.end() && "Expected all entries in Mask");
3712 return std::distance(V.begin(), It);
3713 };
3714 if (M < 0)
3715 ReconstructMask.push_back(-1);
3716 else if (M < static_cast<int>(NumElts)) {
3717 ReconstructMask.push_back(FindIndex(V1, M));
3718 } else {
3719 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3720 }
3721 }
3722 ReconstructMasks.push_back(std::move(ReconstructMask));
3723 }
3724
3725 // Calculate the masks needed for the new input shuffles, which get padded
3726 // with undef
3727 SmallVector<int> V1A, V1B, V2A, V2B;
3728 for (unsigned I = 0; I < V1.size(); I++) {
3729 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3730 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3731 }
3732 for (unsigned I = 0; I < V2.size(); I++) {
3733 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3734 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3735 }
// NOTE(review): the padding push_back statements inside these two loops
// (embedded lines 3737-3738 and 3741-3742) were dropped by the extraction.
3736 while (V1A.size() < NumElts) {
3739 }
3740 while (V2A.size() < NumElts) {
3743 }
3744
// Cost helper: adds the target cost of an existing input shuffle (no-op
// for non-shuffle inputs).
3745 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
3746 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3747 if (!SV)
3748 return C;
3749 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
3752 VT, VT, SV->getShuffleMask(), CostKind);
3753 };
3754 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3755 return C +
3757 };
3758
3759 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
3760 unsigned MaxVectorSize =
3762 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
3763 // When there are multiple shufflevector operations on the same input,
3764 // especially when the vector length is larger than the register size,
3765 // identical shuffle patterns may occur across different groups of elements.
3766 // To avoid overestimating the cost by counting these repeated shuffles more
3767 // than once, we only account for unique shuffle patterns. This adjustment
3768 // prevents inflated costs in the cost model for wide vectors split into
3769 // several register-sized groups.
3770 std::set<SmallVector<int, 4>> UniqueShuffles;
3771 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3772 // Compute the cost for performing the shuffle over the full vector.
3773 auto ShuffleCost =
3775 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
3776 if (NumFullVectors < 2)
3777 return C + ShuffleCost;
3778 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
3779 unsigned NumUniqueGroups = 0;
3780 unsigned NumGroups = Mask.size() / MaxElementsInVector;
3781 // For each group of MaxElementsInVector contiguous elements,
3782 // collect their shuffle pattern and insert into the set of unique patterns.
3783 for (unsigned I = 0; I < NumFullVectors; ++I) {
3784 for (unsigned J = 0; J < MaxElementsInVector; ++J)
3785 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
3786 if (UniqueShuffles.insert(SubShuffle).second)
3787 NumUniqueGroups += 1;
3788 }
// Scale the full-vector cost by the fraction of register-sized groups
// whose pattern has not been seen before.
3789 return C + ShuffleCost * NumUniqueGroups / NumGroups;
3790 };
3791 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
3792 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3793 if (!SV)
3794 return C;
3796 SV->getShuffleMask(Mask);
3797 return AddShuffleMaskAdjustedCost(C, Mask);
3798 };
3799 // Check that input consists of ShuffleVectors applied to the same input
3800 auto AllShufflesHaveSameOperands =
3801 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
3802 if (InputShuffles.size() < 2)
3803 return false;
3804 ShuffleVectorInst *FirstSV =
3805 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
3806 if (!FirstSV)
3807 return false;
3808
3809 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
3810 return std::all_of(
3811 std::next(InputShuffles.begin()), InputShuffles.end(),
3812 [&](Instruction *I) {
3813 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
3814 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
3815 });
3816 };
3817
3818 // Get the costs of the shuffles + binops before and after with the new
3819 // shuffle masks.
3820 InstructionCost CostBefore =
3821 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
3822 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
3823 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
3824 InstructionCost(0), AddShuffleCost);
// Use the deduplicated cost only when all inputs shuffle the same operands.
3825 if (AllShufflesHaveSameOperands(InputShuffles)) {
3826 UniqueShuffles.clear();
3827 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3828 InstructionCost(0), AddShuffleAdjustedCost);
3829 } else {
3830 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3831 InstructionCost(0), AddShuffleCost);
3832 }
3833
3834 // The new binops will be unused for lanes past the used shuffle lengths.
3835 // These types attempt to get the correct cost for that from the target.
3836 FixedVectorType *Op0SmallVT =
3837 FixedVectorType::get(VT->getScalarType(), V1.size());
3838 FixedVectorType *Op1SmallVT =
3839 FixedVectorType::get(VT->getScalarType(), V2.size());
3840 InstructionCost CostAfter =
3841 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
3842 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
3843 UniqueShuffles.clear();
3844 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
3845 InstructionCost(0), AddShuffleMaskAdjustedCost);
// std::set deduplicates identical new input masks before costing them.
3846 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
3847 CostAfter +=
3848 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
3849 InstructionCost(0), AddShuffleMaskCost);
3850
3851 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
3852 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
3853 << " vs CostAfter: " << CostAfter << "\n");
// Equal-cost transforms are only taken when the result feeds a reduction.
3854 if (CostBefore < CostAfter ||
3855 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
3856 return false;
3857
3858 // The cost model has passed, create the new instructions.
// Resolve the real operand behind a (possibly nested) input shuffle.
3859 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
3860 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3861 if (!SV)
3862 return I;
3863 if (isa<UndefValue>(SV->getOperand(1)))
3864 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3865 if (InputShuffles.contains(SSV))
3866 return SSV->getOperand(Op);
3867 return SV->getOperand(Op);
3868 };
// Build the four packed input shuffles, each right after its source def.
3869 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
3870 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
3871 GetShuffleOperand(SVI0A, 1), V1A);
3872 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
3873 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
3874 GetShuffleOperand(SVI0B, 1), V1B);
3875 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
3876 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
3877 GetShuffleOperand(SVI1A, 1), V2A);
3878 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
3879 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
3880 GetShuffleOperand(SVI1B, 1), V2B);
// Recreate the binops on the packed operands, preserving IR flags.
3881 Builder.SetInsertPoint(Op0);
3882 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
3883 NSV0A, NSV0B);
3884 if (auto *I = dyn_cast<Instruction>(NOp0))
3885 I->copyIRFlags(Op0, true);
3886 Builder.SetInsertPoint(Op1);
3887 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
3888 NSV1A, NSV1B);
3889 if (auto *I = dyn_cast<Instruction>(NOp1))
3890 I->copyIRFlags(Op1, true);
3891
// Replace every collected output shuffle with its reconstruction shuffle.
3892 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
3893 Builder.SetInsertPoint(Shuffles[S]);
3894 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
3895 replaceValue(*Shuffles[S], *NSV, false);
3896 }
3897
3898 Worklist.pushValue(NSV0A);
3899 Worklist.pushValue(NSV0B);
3900 Worklist.pushValue(NSV1A);
3901 Worklist.pushValue(NSV1B);
3902 return true;
3903}
3904
3905/// Check if instruction depends on ZExt and this ZExt can be moved after the
3906/// instruction. Move ZExt if it is profitable. For example:
3907/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
3908/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
3909/// Cost model calculations takes into account if zext(x) has other users and
3910/// whether it can be propagated through them too.
3911bool VectorCombine::shrinkType(Instruction &I) {
// Match either a commutative bitwise-logic op or an lshr with a zext operand.
3912 Value *ZExted, *OtherOperand;
3913 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
3914 m_Value(OtherOperand))) &&
3915 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
3916 return false;
3917
// The operand of I that is the zext itself (whichever side it is on).
3918 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
3919
3920 auto *BigTy = cast<FixedVectorType>(I.getType());
3921 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
// BW = element bit-width of the narrow (pre-zext) type.
3922 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
3923
3924 if (I.getOpcode() == Instruction::LShr) {
3925 // Check that the shift amount is less than the number of bits in the
3926 // smaller type. Otherwise, the smaller lshr will return a poison value.
3927 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
3928 if (ShAmtKB.getMaxValue().uge(BW))
3929 return false;
3930 } else {
3931 // Check that the expression overall uses at most the same number of bits as
3932 // ZExted
3933 KnownBits KB = computeKnownBits(&I, *DL);
3934 if (KB.countMaxActiveBits() > BW)
3935 return false;
3936 }
3937
3938 // Calculate costs of leaving current IR as it is and moving ZExt operation
3939 // later, along with adding truncates if needed
// NOTE(review): the declaration line for ZExtCost (embedded line 3940,
// a TTI.getCastInstrCost call) was dropped by this extraction.
3941 Instruction::ZExt, BigTy, SmallTy,
3942 TargetTransformInfo::CastContextHint::None, CostKind);
3943 InstructionCost CurrentCost = ZExtCost;
3944 InstructionCost ShrinkCost = 0;
3945
3946 // Calculate total cost and check that we can propagate through all ZExt users
3947 for (User *U : ZExtOperand->users()) {
3948 auto *UI = cast<Instruction>(U);
3949 if (UI == &I) {
3950 CurrentCost +=
3951 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
3952 ShrinkCost +=
3953 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
3954 ShrinkCost += ZExtCost;
3955 continue;
3956 }
3957
// Non-binary users of the zext block the transform entirely.
3958 if (!Instruction::isBinaryOp(UI->getOpcode()))
3959 return false;
3960
3961 // Check if we can propagate ZExt through its other users
3962 KnownBits KB = computeKnownBits(UI, *DL);
3963 if (KB.countMaxActiveBits() > BW)
3964 return false;
3965
3966 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
3967 ShrinkCost +=
3968 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
3969 ShrinkCost += ZExtCost;
3970 }
3971
3972 // If the other instruction operand is not a constant, we'll need to
3973 // generate a truncate instruction. So we have to adjust cost
3974 if (!isa<Constant>(OtherOperand))
3975 ShrinkCost += TTI.getCastInstrCost(
3976 Instruction::Trunc, SmallTy, BigTy,
3977 TargetTransformInfo::CastContextHint::None, CostKind);
3978
3979 // If the cost of shrinking types and leaving the IR is the same, we'll lean
3980 // towards modifying the IR because shrinking opens opportunities for other
3981 // shrinking optimisations.
3982 if (ShrinkCost > CurrentCost)
3983 return false;
3984
// Rebuild as zext(binop(narrow, trunc(other))) in place of the wide binop.
3985 Builder.SetInsertPoint(&I);
3986 Value *Op0 = ZExted;
3987 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
3988 // Keep the order of operands the same
3989 if (I.getOperand(0) == OtherOperand)
3990 std::swap(Op0, Op1);
3991 Value *NewBinOp =
3992 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
3993 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
3994 cast<Instruction>(NewBinOp)->copyMetadata(I);
3995 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
3996 replaceValue(I, *NewZExtr);
3997 return true;
3998}
3999
4000/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4001/// shuffle (DstVec, SrcVec, Mask)
// NOTE(review): this extraction drops interior lines where the embedded
// numbering is non-consecutive (4024-4025: the SK shuffle-kind / Mask
// declarations; 4031, 4038: Mask initialisation; 4073: the
// TTI.getShuffleCost call start); verify against upstream before editing.
4002bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
// Match insertelement(DstVec, extractelement(SrcVec, ExtIdx), InsIdx)
// with both indices as constants.
4003 Value *DstVec, *SrcVec;
4004 uint64_t ExtIdx, InsIdx;
4005 if (!match(&I,
4006 m_InsertElt(m_Value(DstVec),
4007 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4008 m_ConstantInt(InsIdx))))
4009 return false;
4010
4011 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4012 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4013 // We can try combining vectors with different element sizes.
4014 if (!DstVecTy || !SrcVecTy ||
4015 SrcVecTy->getElementType() != DstVecTy->getElementType())
4016 return false;
4017
4018 unsigned NumDstElts = DstVecTy->getNumElements();
4019 unsigned NumSrcElts = SrcVecTy->getNumElements();
// Out-of-bounds indices or a single-element destination are not worth it.
4020 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4021 return false;
4022
4023 // Insertion into poison is a cheaper single operand shuffle.
4026
4027 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4028 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4029 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4030 if (NeedDstSrcSwap) {
4032 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4033 Mask[InsIdx] = 0;
4034 else
4035 Mask[InsIdx] = ExtIdx;
4036 std::swap(DstVec, SrcVec);
4037 } else {
// Identity mask over DstVec with the inserted lane taken from SrcVec
// (second shuffle operand, hence the NumDstElts offset).
4039 std::iota(Mask.begin(), Mask.end(), 0);
4040 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4041 Mask[InsIdx] = NumDstElts;
4042 else
4043 Mask[InsIdx] = ExtIdx + NumDstElts;
4044 }
4045
4046 // Cost
4047 auto *Ins = cast<InsertElementInst>(&I);
4048 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4049 InstructionCost InsCost =
4050 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4051 InstructionCost ExtCost =
4052 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4053 InstructionCost OldCost = ExtCost + InsCost;
4054
4055 InstructionCost NewCost = 0;
4056 SmallVector<int> ExtToVecMask;
4057 if (!NeedExpOrNarrow) {
4058 // Ignore 'free' identity insertion shuffle.
4059 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4060 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4061 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4062 nullptr, {DstVec, SrcVec});
4063 } else {
4064 // When creating length-changing-vector, always create with a Mask whose
4065 // first element has an ExtIdx, so that the first element of the vector
4066 // being created is always the target to be extracted.
4067 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4068 if (IsExtIdxInBounds)
4069 ExtToVecMask[ExtIdx] = ExtIdx;
4070 else
4071 ExtToVecMask[0] = ExtIdx;
4072 // Add cost for expanding or narrowing
4074 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4075 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4076 }
4077
// If the extract has other users it must stay, so charge it to the new cost.
4078 if (!Ext->hasOneUse())
4079 NewCost += ExtCost;
4080
4081 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4082 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4083 << "\n");
4084
4085 if (OldCost < NewCost)
4086 return false;
4087
// Length-changing case: first bring the source to the destination width.
4088 if (NeedExpOrNarrow) {
4089 if (!NeedDstSrcSwap)
4090 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4091 else
4092 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4093 }
4094
4095 // Canonicalize undef param to RHS to help further folds.
4096 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4097 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4098 std::swap(DstVec, SrcVec);
4099 }
4100
4101 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4102 replaceValue(I, *Shuf);
4103
4104 return true;
4105}
4106
4107/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>`
4108/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4109/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4110/// before casting it back into `<vscale x 16 x i32>`.
4111bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
// Match vector_interleave2 whose two arguments are constant integer splats.
4112 const APInt *SplatVal0, *SplatVal1;
4113 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4114 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4115 return false;
4116
4117 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4118 << "\n");
4119
// ExtVTy is the same element count with double-width elements.
4120 auto *VTy =
4121 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4122 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4123 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4124
4125 // Just in case the cost of interleave2 intrinsic and bitcast are both
4126 // invalid, in which case we want to bail out, we use <= rather
4127 // than < here. Even they both have valid and equal costs, it's probably
4128 // not a good idea to emit a high-cost constant splat.
// NOTE(review): the opening line of this cost comparison (embedded line
// 4129, a TTI.getInstructionCost call) and its closing line (4131) were
// dropped by this extraction.
4130 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4132 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4133 << *I.getType() << " is too high.\n");
4134 return false;
4135 }
4136
// Pack the two splat values into one double-width value:
// high half = SplatVal1, low half = SplatVal0.
4137 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4138 NewSplatVal <<= Width;
4139 NewSplatVal |= SplatVal0->zext(Width * 2);
4140 auto *NewSplat = ConstantVector::getSplat(
4141 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4142
4143 IRBuilder<> Builder(&I);
4144 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4145 return true;
4146}
4147
4148// Attempt to shrink loads that are only used by shufflevector instructions.
// NOTE(review): this extraction drops interior lines where the embedded
// numbering is non-consecutive (4170: the Mask declaration; 4210: the
// OldCost declaration; 4218: the NewUses container declaration; 4236,
// 4239: the TTI.getShuffleCost call starts); verify against upstream.
4149bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
// Only simple (non-volatile, non-atomic) fixed-width vector loads qualify.
4150 auto *OldLoad = dyn_cast<LoadInst>(&I);
4151 if (!OldLoad || !OldLoad->isSimple())
4152 return false;
4153
4154 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4155 if (!OldLoadTy)
4156 return false;
4157
4158 unsigned const OldNumElements = OldLoadTy->getNumElements();
4159
4160 // Search all uses of load. If all uses are shufflevector instructions, and
4161 // the second operands are all poison values, find the minimum and maximum
4162 // indices of the vector elements referenced by all shuffle masks.
4163 // Otherwise return `std::nullopt`.
4164 using IndexRange = std::pair<int, int>;
4165 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
// Start with an inverted (empty) range so any real index narrows it.
4166 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4167 for (llvm::Use &Use : I.uses()) {
4168 // Ensure all uses match the required pattern.
4169 User *Shuffle = Use.getUser();
4171
4172 if (!match(Shuffle,
4173 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4174 return std::nullopt;
4175
4176 // Ignore shufflevector instructions that have no uses.
4177 if (Shuffle->use_empty())
4178 continue;
4179
4180 // Find the min and max indices used by the shufflevector instruction.
4181 for (int Index : Mask) {
4182 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4183 OutputRange.first = std::min(Index, OutputRange.first);
4184 OutputRange.second = std::max(Index, OutputRange.second);
4185 }
4186 }
4187 }
4188
// No live lane was referenced by any shuffle mask.
4189 if (OutputRange.second < OutputRange.first)
4190 return std::nullopt;
4191
4192 return OutputRange;
4193 };
4194
4195 // Get the range of vector elements used by shufflevector instructions.
4196 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4197 unsigned const NewNumElements = Indices->second + 1u;
4198
4199 // If the range of vector elements is smaller than the full load, attempt
4200 // to create a smaller load.
4201 if (NewNumElements < OldNumElements) {
4202 IRBuilder Builder(&I);
4203 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4204
4205 // Calculate costs of old and new ops.
4206 Type *ElemTy = OldLoadTy->getElementType();
4207 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4208 Value *PtrOp = OldLoad->getPointerOperand();
4209
4211 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4212 OldLoad->getPointerAddressSpace(), CostKind);
4213 InstructionCost NewCost =
4214 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4215 OldLoad->getPointerAddressSpace(), CostKind);
4216
4217 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
// MaxIndex bounds valid mask values for the two-operand new shuffle.
4219 unsigned const MaxIndex = NewNumElements * 2u;
4220
4221 for (llvm::Use &Use : I.uses()) {
4222 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4223 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4224
4225 // Create entry for new use.
4226 NewUses.push_back({Shuffle, OldMask});
4227
4228 // Validate mask indices.
4229 for (int Index : OldMask) {
4230 if (Index >= static_cast<int>(MaxIndex))
4231 return false;
4232 }
4233
4234 // Update costs.
4235 OldCost +=
4237 OldLoadTy, OldMask, CostKind);
4238 NewCost +=
4240 NewLoadTy, OldMask, CostKind);
4241 }
4242
4243 LLVM_DEBUG(
4244 dbgs() << "Found a load used only by shufflevector instructions: "
4245 << I << "\n OldCost: " << OldCost
4246 << " vs NewCost: " << NewCost << "\n");
4247
4248 if (OldCost < NewCost || !NewCost.isValid())
4249 return false;
4250
4251 // Create new load of smaller vector.
4252 auto *NewLoad = cast<LoadInst>(
4253 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4254 NewLoad->copyMetadata(I);
4255
4256 // Replace all uses.
4257 for (UseEntry &Use : NewUses) {
4258 ShuffleVectorInst *Shuffle = Use.first;
4259 std::vector<int> &NewMask = Use.second;
4260
4261 Builder.SetInsertPoint(Shuffle);
4262 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4263 Value *NewShuffle = Builder.CreateShuffleVector(
4264 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4265
4266 replaceValue(*Shuffle, *NewShuffle, false);
4267 }
4268
4269 return true;
4270 }
4271 }
4272 return false;
4273}
4274
4275// Attempt to narrow a phi of shufflevector instructions where the two incoming
4276// values have the same operands but different masks. If the two shuffle masks
4277// are offsets of one another we can use one branch to rotate the incoming
4278// vector and perform one larger shuffle after the phi.
4279bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
// Only two-input phis are handled.
4280 auto *Phi = dyn_cast<PHINode>(&I);
4281 if (!Phi || Phi->getNumIncomingValues() != 2u)
4282 return false;
4283
4284 Value *Op = nullptr;
4285 ArrayRef<int> Mask0;
4286 ArrayRef<int> Mask1;
4287
// Both incoming values must be one-use single-source shuffles of the same
// operand Op.
// NOTE(review): the second m_Shuffle match line (embedded line 4291,
// capturing Mask1) was dropped by this extraction; verify upstream.
4288 if (!match(Phi->getOperand(0u),
4289 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4290 !match(Phi->getOperand(1u),
4292 return false;
4293
4294 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4295
4296 // Ensure result vectors are wider than the argument vector.
4297 auto *InputVT = cast<FixedVectorType>(Op->getType());
4298 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4299 auto const InputNumElements = InputVT->getNumElements();
4300
4301 if (InputNumElements >= ResultVT->getNumElements())
4302 return false;
4303
4304 // Take the difference of the two shuffle masks at each index. Ignore poison
4305 // values at the same index in both masks.
4306 SmallVector<int, 16> NewMask;
4307 NewMask.reserve(Mask0.size());
4308
4309 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4310 if (M0 >= 0 && M1 >= 0)
4311 NewMask.push_back(M0 - M1);
4312 else if (M0 == -1 && M1 == -1)
4313 continue;
// Poison in only one of the two masks defeats the offset analysis.
4314 else
4315 return false;
4316 }
4317
4318 // Ensure all elements of the new mask are equal. If the difference between
4319 // the incoming mask elements is the same, the two must be constant offsets
4320 // of one another.
4321 if (NewMask.empty() || !all_equal(NewMask))
4322 return false;
4323
4324 // Create new mask using difference of the two incoming masks.
// Rebuild NewMask as a rotation of the input lanes by MaskOffset.
4325 int MaskOffset = NewMask[0u];
4326 unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
4327 NewMask.clear();
4328
4329 for (unsigned I = 0u; I < InputNumElements; ++I) {
4330 NewMask.push_back(Index);
4331 Index = (Index + 1u) % InputNumElements;
4332 }
4333
4334 // Calculate costs for worst cases and compare.
4335 auto const Kind = TTI::SK_PermuteSingleSrc;
4336 auto OldCost =
4337 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4338 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4339 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4340 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4341
4342 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4343 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4344 << "\n");
4345
4346 if (NewCost > OldCost)
4347 return false;
4348
4349 // Create new shuffles and narrowed phi.
// Rotate on the first incoming edge only; the second edge passes Op through.
4350 auto Builder = IRBuilder(Shuf);
4351 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4352 auto *PoisonVal = PoisonValue::get(InputVT);
4353 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4354 Worklist.push(cast<Instruction>(NewShuf0));
4355
4356 Builder.SetInsertPoint(Phi);
4357 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4358 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4359 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4360 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4361
// One widening shuffle (Mask1) after the phi replaces the two originals.
4362 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4363 PoisonVal = PoisonValue::get(NewPhi->getType());
4364 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4365
4366 replaceValue(*Phi, *NewShuf1);
4367 return true;
4368}
4369
4370/// This is the entry point for all transforms. Pass manager differences are
4371/// handled in the callers of this function.
4372bool VectorCombine::run() {
4374 return false;
4375
4376 // Don't attempt vectorization if the target does not support vectors.
4377 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4378 return false;
4379
4380 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4381
4382 auto FoldInst = [this](Instruction &I) {
4383 Builder.SetInsertPoint(&I);
4384 bool IsVectorType = isa<VectorType>(I.getType());
4385 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4386 auto Opcode = I.getOpcode();
4387
4388 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4389
4390 // These folds should be beneficial regardless of when this pass is run
4391 // in the optimization pipeline.
4392 // The type checking is for run-time efficiency. We can avoid wasting time
4393 // dispatching to folding functions if there's no chance of matching.
4394 if (IsFixedVectorType) {
4395 switch (Opcode) {
4396 case Instruction::InsertElement:
4397 if (vectorizeLoadInsert(I))
4398 return true;
4399 break;
4400 case Instruction::ShuffleVector:
4401 if (widenSubvectorLoad(I))
4402 return true;
4403 break;
4404 default:
4405 break;
4406 }
4407 }
4408
4409 // This transform works with scalable and fixed vectors
4410 // TODO: Identify and allow other scalable transforms
4411 if (IsVectorType) {
4412 if (scalarizeOpOrCmp(I))
4413 return true;
4414 if (scalarizeLoadExtract(I))
4415 return true;
4416 if (scalarizeExtExtract(I))
4417 return true;
4418 if (scalarizeVPIntrinsic(I))
4419 return true;
4420 if (foldInterleaveIntrinsics(I))
4421 return true;
4422 }
4423
4424 if (Opcode == Instruction::Store)
4425 if (foldSingleElementStore(I))
4426 return true;
4427
4428 // If this is an early pipeline invocation of this pass, we are done.
4429 if (TryEarlyFoldsOnly)
4430 return false;
4431
4432 // Otherwise, try folds that improve codegen but may interfere with
4433 // early IR canonicalizations.
4434 // The type checking is for run-time efficiency. We can avoid wasting time
4435 // dispatching to folding functions if there's no chance of matching.
4436 if (IsFixedVectorType) {
4437 switch (Opcode) {
4438 case Instruction::InsertElement:
4439 if (foldInsExtFNeg(I))
4440 return true;
4441 if (foldInsExtBinop(I))
4442 return true;
4443 if (foldInsExtVectorToShuffle(I))
4444 return true;
4445 break;
4446 case Instruction::ShuffleVector:
4447 if (foldPermuteOfBinops(I))
4448 return true;
4449 if (foldShuffleOfBinops(I))
4450 return true;
4451 if (foldShuffleOfSelects(I))
4452 return true;
4453 if (foldShuffleOfCastops(I))
4454 return true;
4455 if (foldShuffleOfShuffles(I))
4456 return true;
4457 if (foldShuffleOfIntrinsics(I))
4458 return true;
4459 if (foldSelectShuffle(I))
4460 return true;
4461 if (foldShuffleToIdentity(I))
4462 return true;
4463 break;
4464 case Instruction::Load:
4465 if (shrinkLoadForShuffles(I))
4466 return true;
4467 break;
4468 case Instruction::BitCast:
4469 if (foldBitcastShuffle(I))
4470 return true;
4471 break;
4472 case Instruction::And:
4473 case Instruction::Or:
4474 case Instruction::Xor:
4475 if (foldBitOpOfCastops(I))
4476 return true;
4477 break;
4478 case Instruction::PHI:
4479 if (shrinkPhiOfShuffles(I))
4480 return true;
4481 break;
4482 default:
4483 if (shrinkType(I))
4484 return true;
4485 break;
4486 }
4487 } else {
4488 switch (Opcode) {
4489 case Instruction::Call:
4490 if (foldShuffleFromReductions(I))
4491 return true;
4492 if (foldCastFromReductions(I))
4493 return true;
4494 break;
4495 case Instruction::ExtractElement:
4496 if (foldShuffleChainsToReduce(I))
4497 return true;
4498 break;
4499 case Instruction::ICmp:
4500 case Instruction::FCmp:
4501 if (foldExtractExtract(I))
4502 return true;
4503 break;
4504 case Instruction::Or:
4505 if (foldConcatOfBoolMasks(I))
4506 return true;
4507 [[fallthrough]];
4508 default:
4509 if (Instruction::isBinaryOp(Opcode)) {
4510 if (foldExtractExtract(I))
4511 return true;
4512 if (foldExtractedCmps(I))
4513 return true;
4514 if (foldBinopOfReductions(I))
4515 return true;
4516 }
4517 break;
4518 }
4519 }
4520 return false;
4521 };
4522
4523 bool MadeChange = false;
4524 for (BasicBlock &BB : F) {
4525 // Ignore unreachable basic blocks.
4526 if (!DT.isReachableFromEntry(&BB))
4527 continue;
4528 // Use early increment range so that we can erase instructions in loop.
4529 // make_early_inc_range is not applicable here, as the next iterator may
4530 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4531 // We manually maintain the next instruction and update it when it is about
4532 // to be deleted.
4533 Instruction *I = &BB.front();
4534 while (I) {
4535 NextInst = I->getNextNode();
4536 if (!I->isDebugOrPseudoInst())
4537 MadeChange |= FoldInst(*I);
4538 I = NextInst;
4539 }
4540 }
4541
4542 NextInst = nullptr;
4543
4544 while (!Worklist.isEmpty()) {
4545 Instruction *I = Worklist.removeOne();
4546 if (!I)
4547 continue;
4548
4551 continue;
4552 }
4553
4554 MadeChange |= FoldInst(*I);
4555 }
4556
4557 return MadeChange;
4558}
4559
4562 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4566 const DataLayout *DL = &F.getDataLayout();
4567 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4568 TryEarlyFoldsOnly);
4569 if (!Combiner.run())
4570 return PreservedAnalyses::all();
4573 return PA;
4574}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1328
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1451
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
FunctionAnalysisManager FAM
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
static Value * generateNewInstTree(ArrayRef< InstLane > Item, FixedVectorType *Ty, const SmallPtrSet< Use *, 4 > &IdentityLeafs, const SmallPtrSet< Use *, 4 > &SplatLeafs, const SmallPtrSet< Use *, 4 > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Value * peekThroughBitcasts(Value *V)
Return the source operand of a potentially bitcasted value.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static InstLane lookThroughShuffles(Use *U, int Lane)
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
std::pair< Use *, int > InstLane
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
A private abstract base class describing the concept of an individual alias analysis implementation.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
BinaryOps getOpcode() const
Definition: InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1283
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
bool isFPPredicate() const
Definition: InstrTypes.h:784
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition: Combiner.h:34
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
Definition: Constants.cpp:2564
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
This class represents a range of values.
Definition: ConstantRange.h:47
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1474
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
iterator end()
Definition: DenseMap.h:87
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:334
This instruction extracts a single (scalar) element from a VectorType value.
This class represents a cast from floating point to signed integer.
This class represents a cast from floating point to unsigned integer.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
Definition: DerivedTypes.h:627
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2559
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1010
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1115
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2637
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1513
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2238
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1931
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2263
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:527
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1809
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:172
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1847
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Definition: IRBuilder.cpp:922
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1551
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1860
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1708
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1795
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
InstructionWorklist - This is the worklist management logic for InstCombine and other simplification ...
void pushUsersToWorkList(Instruction &I)
When an instruction is simplified, add all users of the instruction to the work lists because they mi...
void push(Instruction *I)
Push the instruction onto the worklist stack.
void remove(Instruction *I)
Remove I from the worklist if it exists.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
Definition: Instruction.h:318
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:56
An instruction for reading from memory.
Definition: Instructions.h:180
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1885
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151
This class represents a sign extension of integer types.
This class represents a cast from signed integer to floating point.
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
void setAlignment(Align Align)
Definition: Instructions.h:342
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
LLVM_ABI InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ None
The cast is not used with a load/store of any kind.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
This class represents a cast unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
op_range operands()
Definition: User.h:292
Value * getOperand(unsigned i) const
Definition: User.h:232
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
This is the common base class for vector predication intrinsics.
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:759
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:953
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
bool use_empty() const
Definition: Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
This class represents zero extension of integer types.
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
Definition: Intrinsics.cpp:743
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
Definition: PatternMatch.h:160
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:165
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:862
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:245
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1737
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:355
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:975
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:252
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition: Loads.cpp:416
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
unsigned M1(unsigned Val)
Definition: VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:402
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition: Loads.cpp:431
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1393
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
Definition: LoopUtils.cpp:1005
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:376
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:2127
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
Definition: LoopUtils.cpp:959
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:760
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:289
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:138
Matching combinators.