VectorCombine.cpp
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
44#include "llvm/Transforms/Utils/InstructionWorklist.h"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoadExtract(Instruction &I);
133 bool scalarizeExtExtract(Instruction &I);
134 bool foldConcatOfBoolMasks(Instruction &I);
135 bool foldPermuteOfBinops(Instruction &I);
136 bool foldShuffleOfBinops(Instruction &I);
137 bool foldShuffleOfSelects(Instruction &I);
138 bool foldShuffleOfCastops(Instruction &I);
139 bool foldShuffleOfShuffles(Instruction &I);
140 bool foldShuffleOfIntrinsics(Instruction &I);
141 bool foldShuffleToIdentity(Instruction &I);
142 bool foldShuffleFromReductions(Instruction &I);
143 bool foldShuffleChainsToReduce(Instruction &I);
144 bool foldCastFromReductions(Instruction &I);
145 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
146 bool foldInterleaveIntrinsics(Instruction &I);
147 bool shrinkType(Instruction &I);
148 bool shrinkLoadForShuffles(Instruction &I);
149 bool shrinkPhiOfShuffles(Instruction &I);
150
151 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
152 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
153 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
154 Old.replaceAllUsesWith(&New);
155 if (auto *NewI = dyn_cast<Instruction>(&New)) {
156 New.takeName(&Old);
157 Worklist.pushUsersToWorkList(*NewI);
158 Worklist.pushValue(NewI);
159 }
160 if (Erase && isInstructionTriviallyDead(&Old)) {
161 eraseInstruction(Old);
162 } else {
163 Worklist.push(&Old);
164 }
165 }
166
167 void eraseInstruction(Instruction &I) {
168 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
169 SmallVector<Value *> Ops(I.operands());
170 Worklist.remove(&I);
171 I.eraseFromParent();
172
173 // Push remaining users of the operands and then the operand itself - allows
174 // further folds that were hindered by OneUse limits.
175 SmallPtrSet<Value *, 4> Visited;
176 for (Value *Op : Ops) {
177 if (!Visited.contains(Op)) {
178 if (auto *OpI = dyn_cast<Instruction>(Op)) {
179 if (!RecursivelyDeleteTriviallyDeadInstructions(
180 OpI, nullptr, nullptr, [&](Value *V) {
181 if (auto *I = dyn_cast<Instruction>(V)) {
182 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
183 Worklist.remove(I);
184 if (I == NextInst)
185 NextInst = NextInst->getNextNode();
186 Visited.insert(I);
187 }
188 }))
189 continue;
190 Worklist.pushUsersToWorkList(*OpI);
191 Worklist.pushValue(OpI);
192 }
193 }
194 }
195 }
196};
197} // namespace
198
199/// Return the source operand of a potentially bitcasted value. If there is no
200/// bitcast, return the input value itself.
201static Value *peekThroughBitcasts(Value *V) {
202 while (auto *BitCast = dyn_cast<BitCastInst>(V))
203 V = BitCast->getOperand(0);
204 return V;
205}
206
207static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
208 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
209 // The widened load may load data from dirty regions or create data races
210 // non-existent in the source.
211 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
212 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
213 mustSuppressSpeculation(*Load))
214 return false;
215
216 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
217 // sure we have all of our type-based constraints in place for this target.
218 Type *ScalarTy = Load->getType()->getScalarType();
219 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
220 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
221 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
222 ScalarSize % 8 != 0)
223 return false;
224
225 return true;
226}
227
228bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
229 // Match insert into fixed vector of scalar value.
230 // TODO: Handle non-zero insert index.
231 Value *Scalar;
232 if (!match(&I,
233 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
234 return false;
235
236 // Optionally match an extract from another vector.
237 Value *X;
238 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
239 if (!HasExtract)
240 X = Scalar;
241
242 auto *Load = dyn_cast<LoadInst>(X);
243 if (!canWidenLoad(Load, TTI))
244 return false;
245
246 Type *ScalarTy = Scalar->getType();
247 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
248 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
249
250 // Check safety of replacing the scalar load with a larger vector load.
251 // We use minimal alignment (maximum flexibility) because we only care about
252 // the dereferenceable region. When calculating cost and creating a new op,
253 // we may use a larger value based on alignment attributes.
254 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
255 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
256
257 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
258 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
259 unsigned OffsetEltIndex = 0;
260 Align Alignment = Load->getAlign();
261 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
262 &DT)) {
263 // It is not safe to load directly from the pointer, but we can still peek
264 // through gep offsets and check if it safe to load from a base address with
265 // updated alignment. If it is, we can shuffle the element(s) into place
266 // after loading.
267 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
268 APInt Offset(OffsetBitWidth, 0);
269 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
270
271 // We want to shuffle the result down from a high element of a vector, so
272 // the offset must be positive.
273 if (Offset.isNegative())
274 return false;
275
276 // The offset must be a multiple of the scalar element to shuffle cleanly
277 // in the element's size.
278 uint64_t ScalarSizeInBytes = ScalarSize / 8;
279 if (Offset.urem(ScalarSizeInBytes) != 0)
280 return false;
281
282 // If we load MinVecNumElts, will our target element still be loaded?
283 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
284 if (OffsetEltIndex >= MinVecNumElts)
285 return false;
286
287 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
288 &DT))
289 return false;
290
291 // Update alignment with offset value. Note that the offset could be negated
292 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
293 // negation does not change the result of the alignment calculation.
294 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
295 }
296
297 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
298 // Use the greater of the alignment on the load or its source pointer.
299 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
300 Type *LoadTy = Load->getType();
301 unsigned AS = Load->getPointerAddressSpace();
302 InstructionCost OldCost =
303 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
304 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
305 OldCost +=
306 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
307 /* Insert */ true, HasExtract, CostKind);
308
309 // New pattern: load VecPtr
310 InstructionCost NewCost =
311 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
312 // Optionally, we are shuffling the loaded vector element(s) into place.
313 // For the mask set everything but element 0 to undef to prevent poison from
314 // propagating from the extra loaded memory. This will also optionally
315 // shrink/grow the vector from the loaded size to the output size.
316 // We assume this operation has no cost in codegen if there was no offset.
317 // Note that we could use freeze to avoid poison problems, but then we might
318 // still need a shuffle to change the vector size.
319 auto *Ty = cast<FixedVectorType>(I.getType());
320 unsigned OutputNumElts = Ty->getNumElements();
321 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
322 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
323 Mask[0] = OffsetEltIndex;
324 if (OffsetEltIndex)
325 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
326 CostKind);
327
328 // We can aggressively convert to the vector form because the backend can
329 // invert this transform if it does not result in a performance win.
330 if (OldCost < NewCost || !NewCost.isValid())
331 return false;
332
333 // It is safe and potentially profitable to load a vector directly:
334 // inselt undef, load Scalar, 0 --> load VecPtr
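  // Illustrative IR sketch (not from the original source); %p, %s and the
  // <4 x float> types are hypothetical and assume a 128-bit minimum vector
  // register with a sufficiently dereferenceable pointer:
  //   %s = load float, ptr %p, align 4
  //   %i = insertelement <4 x float> poison, float %s, i64 0
  // -->
  //   %ld = load <4 x float>, ptr %p, align 4
  //   %i  = shufflevector <4 x float> %ld, <4 x float> poison,
  //                       <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>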
335 IRBuilder<> Builder(Load);
336 Value *CastedPtr =
337 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
338 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
339 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
340
341 replaceValue(I, *VecLd);
342 ++NumVecLoad;
343 return true;
344}
345
346/// If we are loading a vector and then inserting it into a larger vector with
347/// undefined elements, try to load the larger vector and eliminate the insert.
348/// This removes a shuffle in IR and may allow combining of other loaded values.
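/// Illustrative IR sketch (not from the original source); %p and the types are
/// hypothetical and assume %p is dereferenceable for the wider vector:
///   %v    = load <2 x float>, ptr %p, align 8
///   %wide = shufflevector <2 x float> %v, <2 x float> poison,
///                         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// -->
///   %wide = load <4 x float>, ptr %p, align 8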
349bool VectorCombine::widenSubvectorLoad(Instruction &I) {
350 // Match subvector insert of fixed vector.
351 auto *Shuf = cast<ShuffleVectorInst>(&I);
352 if (!Shuf->isIdentityWithPadding())
353 return false;
354
355 // Allow a non-canonical shuffle mask that is choosing elements from op1.
356 unsigned NumOpElts =
357 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
358 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
359 return M >= (int)(NumOpElts);
360 });
361
362 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
363 if (!canWidenLoad(Load, TTI))
364 return false;
365
366 // We use minimal alignment (maximum flexibility) because we only care about
367 // the dereferenceable region. When calculating cost and creating a new op,
368 // we may use a larger value based on alignment attributes.
369 auto *Ty = cast<FixedVectorType>(I.getType());
370 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
371 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
372 Align Alignment = Load->getAlign();
373 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
374 return false;
375
376 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
377 Type *LoadTy = Load->getType();
378 unsigned AS = Load->getPointerAddressSpace();
379
380 // Original pattern: insert_subvector (load PtrOp)
381 // This conservatively assumes that the cost of a subvector insert into an
382 // undef value is 0. We could add that cost if the cost model accurately
383 // reflects the real cost of that operation.
384 InstructionCost OldCost =
385 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
386
387 // New pattern: load PtrOp
388 InstructionCost NewCost =
389 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
390
391 // We can aggressively convert to the vector form because the backend can
392 // invert this transform if it does not result in a performance win.
393 if (OldCost < NewCost || !NewCost.isValid())
394 return false;
395
396 IRBuilder<> Builder(Load);
397 Value *CastedPtr =
398 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
399 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
400 replaceValue(I, *VecLd);
401 ++NumVecLoad;
402 return true;
403}
404
405/// Determine which, if any, of the inputs should be replaced by a shuffle
406/// followed by extract from a different index.
407ExtractElementInst *VectorCombine::getShuffleExtract(
408 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
409 unsigned PreferredExtractIndex = InvalidIndex) const {
410 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
411 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
412 assert(Index0C && Index1C && "Expected constant extract indexes");
413
414 unsigned Index0 = Index0C->getZExtValue();
415 unsigned Index1 = Index1C->getZExtValue();
416
417 // If the extract indexes are identical, no shuffle is needed.
418 if (Index0 == Index1)
419 return nullptr;
420
421 Type *VecTy = Ext0->getVectorOperand()->getType();
422 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
423 InstructionCost Cost0 =
424 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
425 InstructionCost Cost1 =
426 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
427
428 // If both costs are invalid, no shuffle is needed.
429 if (!Cost0.isValid() && !Cost1.isValid())
430 return nullptr;
431
432 // We are extracting from 2 different indexes, so one operand must be shuffled
433 // before performing a vector operation and/or extract. The more expensive
434 // extract will be replaced by a shuffle.
435 if (Cost0 > Cost1)
436 return Ext0;
437 if (Cost1 > Cost0)
438 return Ext1;
439
440 // If the costs are equal and there is a preferred extract index, shuffle the
441 // opposite operand.
442 if (PreferredExtractIndex == Index0)
443 return Ext1;
444 if (PreferredExtractIndex == Index1)
445 return Ext0;
446
447 // Otherwise, replace the extract with the higher index.
448 return Index0 > Index1 ? Ext0 : Ext1;
449}
450
451/// Compare the relative costs of 2 extracts followed by scalar operation vs.
452/// vector operation(s) followed by extract. Return true if the existing
453/// instructions are cheaper than a vector alternative. Otherwise, return false
454/// and if one of the extracts should be transformed to a shufflevector, set
455/// \p ConvertToShuffle to that extract instruction.
456bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
457 ExtractElementInst *Ext1,
458 const Instruction &I,
459 ExtractElementInst *&ConvertToShuffle,
460 unsigned PreferredExtractIndex) {
461 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
462 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
463 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
464
465 unsigned Opcode = I.getOpcode();
466 Value *Ext0Src = Ext0->getVectorOperand();
467 Value *Ext1Src = Ext1->getVectorOperand();
468 Type *ScalarTy = Ext0->getType();
469 auto *VecTy = cast<VectorType>(Ext0Src->getType());
470 InstructionCost ScalarOpCost, VectorOpCost;
471
472 // Get cost estimates for scalar and vector versions of the operation.
473 bool IsBinOp = Instruction::isBinaryOp(Opcode);
474 if (IsBinOp) {
475 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
476 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
477 } else {
478 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
479 "Expected a compare");
480 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
481 ScalarOpCost = TTI.getCmpSelInstrCost(
482 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
483 VectorOpCost = TTI.getCmpSelInstrCost(
484 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
485 }
486
487 // Get cost estimates for the extract elements. These costs will factor into
488 // both sequences.
489 unsigned Ext0Index = Ext0IndexC->getZExtValue();
490 unsigned Ext1Index = Ext1IndexC->getZExtValue();
491
492 InstructionCost Extract0Cost =
493 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
494 InstructionCost Extract1Cost =
495 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
496
497 // A more expensive extract will always be replaced by a splat shuffle.
498 // For example, if Ext0 is more expensive:
499 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
500 // extelt (opcode (splat V0, Ext0), V1), Ext1
501 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
502 // check the cost of creating a broadcast shuffle and shuffling both
503 // operands to element 0.
504 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
505 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
506 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
507
508 // Extra uses of the extracts mean that we include those costs in the
509 // vector total because those instructions will not be eliminated.
510 InstructionCost OldCost, NewCost;
511 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
512 // Handle a special case. If the 2 extracts are identical, adjust the
513 // formulas to account for that. The extra use charge allows for either the
514 // CSE'd pattern or an unoptimized form with identical values:
515 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
516 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
517 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
518 OldCost = CheapExtractCost + ScalarOpCost;
519 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
520 } else {
521 // Handle the general case. Each extract is actually a different value:
522 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
523 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
524 NewCost = VectorOpCost + CheapExtractCost +
525 !Ext0->hasOneUse() * Extract0Cost +
526 !Ext1->hasOneUse() * Extract1Cost;
527 }
528
529 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
530 if (ConvertToShuffle) {
531 if (IsBinOp && DisableBinopExtractShuffle)
532 return true;
533
534 // If we are extracting from 2 different indexes, then one operand must be
535 // shuffled before performing the vector operation. The shuffle mask is
536 // poison except for 1 lane that is being translated to the remaining
537 // extraction lane. Therefore, it is a splat shuffle. Ex:
538 // ShufMask = { poison, poison, 0, poison }
539 // TODO: The cost model has an option for a "broadcast" shuffle
540 // (splat-from-element-0), but no option for a more general splat.
541 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
542 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
543 PoisonMaskElem);
544 ShuffleMask[BestInsIndex] = BestExtIndex;
545 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
546 VecTy, VecTy, ShuffleMask, CostKind, 0,
547 nullptr, {ConvertToShuffle});
548 } else {
549 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
550 VecTy, VecTy, {}, CostKind, 0, nullptr,
551 {ConvertToShuffle});
552 }
553 }
554
555 // Aggressively form a vector op if the cost is equal because the transform
556 // may enable further optimization.
557 // Codegen can reverse this transform (scalarize) if it was not profitable.
558 return OldCost < NewCost;
559}
560
561/// Create a shuffle that translates (shifts) 1 element from the input vector
562/// to a new element location.
563static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
564 unsigned NewIndex, IRBuilderBase &Builder) {
565 // The shuffle mask is poison except for 1 lane that is being translated
566 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
567 // ShufMask = { 2, poison, poison, poison }
568 auto *VecTy = cast<FixedVectorType>(Vec->getType());
569 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
570 ShufMask[NewIndex] = OldIndex;
571 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
572}
573
574/// Given an extract element instruction with constant index operand, shuffle
575/// the source vector (shift the scalar element) to a NewIndex for extraction.
576/// Return null if the input can be constant folded, so that we are not creating
577/// unnecessary instructions.
578static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
579 IRBuilderBase &Builder) {
580 // Shufflevectors can only be created for fixed-width vectors.
581 Value *X = ExtElt->getVectorOperand();
582 if (!isa<FixedVectorType>(X->getType()))
583 return nullptr;
584
585 // If the extract can be constant-folded, this code is unsimplified. Defer
586 // to other passes to handle that.
587 Value *C = ExtElt->getIndexOperand();
588 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
589 if (isa<Constant>(X))
590 return nullptr;
591
592 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
593 NewIndex, Builder);
594 return Shuf;
595}
596
597/// Try to reduce extract element costs by converting scalar compares to vector
598/// compares followed by extract.
599/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
600Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
601 Instruction &I) {
602 assert(isa<CmpInst>(&I) && "Expected a compare");
603
604 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
605 // --> extelt (cmp Pred V0, V1), ExtIndex
606 ++NumVecCmp;
607 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
608 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
609 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
610}
611
612/// Try to reduce extract element costs by converting scalar binops to vector
613/// binops followed by extract.
614/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
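/// Illustrative IR sketch (not from the original source); names are
/// hypothetical:
///   %a = extractelement <4 x i32> %v0, i64 2
///   %b = extractelement <4 x i32> %v1, i64 2
///   %r = add i32 %a, %b
/// -->
///   %v = add <4 x i32> %v0, %v1
///   %r = extractelement <4 x i32> %v, i64 2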
615Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
616 Instruction &I) {
617 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
618
619 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
620 // --> extelt (bo V0, V1), ExtIndex
621 ++NumVecBO;
622 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
623 V1, "foldExtExtBinop");
624
625 // All IR flags are safe to back-propagate because any potential poison
626 // created in unused vector elements is discarded by the extract.
627 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
628 VecBOInst->copyIRFlags(&I);
629
630 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
631}
632
633/// Match an instruction with extracted vector operands.
634bool VectorCombine::foldExtractExtract(Instruction &I) {
635 // It is not safe to transform things like div, urem, etc. because we may
636 // create undefined behavior when executing those on unknown vector elements.
637 if (!isSafeToSpeculativelyExecute(&I))
638 return false;
639
640 Instruction *I0, *I1;
641 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
642 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
643 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
644 return false;
645
646 Value *V0, *V1;
647 uint64_t C0, C1;
648 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
649 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
650 V0->getType() != V1->getType())
651 return false;
652
653 // If the scalar value 'I' is going to be re-inserted into a vector, then try
654 // to create an extract to that same element. The extract/insert can be
655 // reduced to a "select shuffle".
656 // TODO: If we add a larger pattern match that starts from an insert, this
657 // probably becomes unnecessary.
658 auto *Ext0 = cast<ExtractElementInst>(I0);
659 auto *Ext1 = cast<ExtractElementInst>(I1);
660 uint64_t InsertIndex = InvalidIndex;
661 if (I.hasOneUse())
662 match(I.user_back(),
663 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
664
665 ExtractElementInst *ExtractToChange;
666 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
667 return false;
668
669 Value *ExtOp0 = Ext0->getVectorOperand();
670 Value *ExtOp1 = Ext1->getVectorOperand();
671
672 if (ExtractToChange) {
673 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
674 Value *NewExtOp =
675 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
676 if (!NewExtOp)
677 return false;
678 if (ExtractToChange == Ext0)
679 ExtOp0 = NewExtOp;
680 else
681 ExtOp1 = NewExtOp;
682 }
683
684 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
685 : Ext0->getIndexOperand();
686 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
687 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
688 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
689 Worklist.push(Ext0);
690 Worklist.push(Ext1);
691 replaceValue(I, *NewExt);
692 return true;
693}
694
695/// Try to replace an extract + scalar fneg + insert with a vector fneg +
696/// shuffle.
697bool VectorCombine::foldInsExtFNeg(Instruction &I) {
698 // Match an insert (op (extract)) pattern.
699 Value *DestVec;
700 uint64_t Index;
701 Instruction *FNeg;
702 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
703 m_ConstantInt(Index))))
704 return false;
705
706 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
707 Value *SrcVec;
708 Instruction *Extract;
709 if (!match(FNeg, m_FNeg(m_CombineAnd(
710 m_Instruction(Extract),
711 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
712 return false;
713
714 auto *VecTy = cast<FixedVectorType>(I.getType());
715 auto *ScalarTy = VecTy->getScalarType();
716 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
717 if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
718 return false;
719
720 // Ignore bogus insert/extract index.
721 unsigned NumElts = VecTy->getNumElements();
722 if (Index >= NumElts)
723 return false;
724
725 // We are inserting the negated element into the same lane that we extracted
726 // from. This is equivalent to a select-shuffle that chooses all but the
727 // negated element from the destination vector.
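  // Illustrative example (not from the original source): for a <4 x float>
  // destination and Index == 1, the select-shuffle mask built below is
  // <0, 5, 2, 3>, i.e. lane 1 is taken from the negated source vector and all
  // other lanes come from DestVec.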
728 SmallVector<int> Mask(NumElts);
729 std::iota(Mask.begin(), Mask.end(), 0);
730 Mask[Index] = Index + NumElts;
731 InstructionCost OldCost =
732 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
733 TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
734
735 // If the extract has one use, it will be eliminated, so count it in the
736 // original cost. If it has more than one use, ignore the cost because it will
737 // be the same before/after.
738 if (Extract->hasOneUse())
739 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
740
741 InstructionCost NewCost =
742 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
743 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
744 Mask, CostKind);
745
746 bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
747 // If the lengths of the two vectors are not equal,
748 // we need to add a length-change vector. Add this cost.
749 SmallVector<int> SrcMask;
750 if (NeedLenChg) {
751 SrcMask.assign(NumElts, PoisonMaskElem);
752 SrcMask[Index] = Index;
753 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
754 VecTy, SrcVecTy, SrcMask, CostKind);
755 }
756
757 if (NewCost > OldCost)
758 return false;
759
760 Value *NewShuf;
761 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
762 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
763 if (NeedLenChg) {
764 // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
765 Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
766 NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
767 } else {
768 // shuffle DestVec, (fneg SrcVec), Mask
769 NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
770 }
771
772 replaceValue(I, *NewShuf);
773 return true;
774}
775
776/// Try to fold insert(binop(x,y),binop(a,b),idx)
777/// --> binop(insert(x,a,idx),insert(y,b,idx))
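/// Illustrative IR sketch (not from the original source); names are
/// hypothetical:
///   %vbo = add <4 x i32> %x, %y
///   %sbo = add i32 %a, %b
///   %r   = insertelement <4 x i32> %vbo, i32 %sbo, i64 1
/// -->
///   %i0 = insertelement <4 x i32> %x, i32 %a, i64 1
///   %i1 = insertelement <4 x i32> %y, i32 %b, i64 1
///   %r  = add <4 x i32> %i0, %i1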
778bool VectorCombine::foldInsExtBinop(Instruction &I) {
779 BinaryOperator *VecBinOp, *SclBinOp;
780 uint64_t Index;
781 if (!match(&I,
782 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
783 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
784 return false;
785
786 // TODO: Add support for addlike etc.
787 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
788 if (BinOpcode != SclBinOp->getOpcode())
789 return false;
790
791 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
792 if (!ResultTy)
793 return false;
794
795 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
796 // shuffle?
797
798 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
799 TTI.getInstructionCost(VecBinOp, CostKind) +
800 TTI.getInstructionCost(SclBinOp, CostKind);
801 InstructionCost NewCost =
802 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
803 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
804 Index, VecBinOp->getOperand(0),
805 SclBinOp->getOperand(0)) +
806 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
807 Index, VecBinOp->getOperand(1),
808 SclBinOp->getOperand(1));
809
810 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
811 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
812 << "\n");
813 if (NewCost > OldCost)
814 return false;
815
816 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
817 SclBinOp->getOperand(0), Index);
818 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1), Index);
820 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
821
822 // Intersect flags from the old binops.
823 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
824 NewInst->copyIRFlags(VecBinOp);
825 NewInst->andIRFlags(SclBinOp);
826 }
827
828 Worklist.pushValue(NewIns0);
829 Worklist.pushValue(NewIns1);
830 replaceValue(I, *NewBO);
831 return true;
832}
833
834/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
835/// Supports: bitcast, trunc, sext, zext
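/// Illustrative IR sketch (not from the original source); names are
/// hypothetical:
///   %zx = zext <4 x i8> %x to <4 x i32>
///   %zy = zext <4 x i8> %y to <4 x i32>
///   %r  = and <4 x i32> %zx, %zy
/// -->
///   %a = and <4 x i8> %x, %y
///   %r = zext <4 x i8> %a to <4 x i32>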
836bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
837 // Check if this is a bitwise logic operation
838 auto *BinOp = dyn_cast<BinaryOperator>(&I);
839 if (!BinOp || !BinOp->isBitwiseLogicOp())
840 return false;
841
842 // Get the cast instructions
843 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
844 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
845 if (!LHSCast || !RHSCast) {
846 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
847 return false;
848 }
849
850 // Both casts must be the same type
851 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
852 if (CastOpcode != RHSCast->getOpcode())
853 return false;
854
855 // Only handle supported cast operations
856 switch (CastOpcode) {
857 case Instruction::BitCast:
858 case Instruction::Trunc:
859 case Instruction::SExt:
860 case Instruction::ZExt:
861 break;
862 default:
863 return false;
864 }
865
866 Value *LHSSrc = LHSCast->getOperand(0);
867 Value *RHSSrc = RHSCast->getOperand(0);
868
869 // Source types must match
870 if (LHSSrc->getType() != RHSSrc->getType())
871 return false;
872
873 auto *SrcTy = LHSSrc->getType();
874 auto *DstTy = I.getType();
875 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
876 // Other casts only handle vector types with integer elements.
877 if (CastOpcode != Instruction::BitCast &&
878 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
879 return false;
880
881 // Only integer scalar/vector values are legal for bitwise logic operations.
882 if (!SrcTy->getScalarType()->isIntegerTy() ||
883 !DstTy->getScalarType()->isIntegerTy())
884 return false;
885
886 // Cost Check :
887 // OldCost = bitlogic + 2*casts
888 // NewCost = bitlogic + cast
889
890 // Calculate specific costs for each cast with instruction context
891 InstructionCost LHSCastCost = TTI.getCastInstrCost(
892 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
893 InstructionCost RHSCastCost = TTI.getCastInstrCost(
894 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
895
896 InstructionCost OldCost =
897 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
898 LHSCastCost + RHSCastCost;
899
900 // For new cost, we can't provide an instruction (it doesn't exist yet)
901 InstructionCost GenericCastCost = TTI.getCastInstrCost(
902 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
903
904 InstructionCost NewCost =
905 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
906 GenericCastCost;
907
908 // Account for multi-use casts using specific costs
909 if (!LHSCast->hasOneUse())
910 NewCost += LHSCastCost;
911 if (!RHSCast->hasOneUse())
912 NewCost += RHSCastCost;
913
914 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
915 << " NewCost=" << NewCost << "\n");
916
917 if (NewCost > OldCost)
918 return false;
919
920 // Create the operation on the source type
921 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
922 BinOp->getName() + ".inner");
923 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
924 NewBinOp->copyIRFlags(BinOp);
925
926 Worklist.pushValue(NewOp);
927
928 // Create the cast operation directly to ensure we get a new instruction
929 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
930
931 // Preserve cast instruction flags
932 NewCast->copyIRFlags(LHSCast);
933 NewCast->andIRFlags(RHSCast);
934
935 // Insert the new instruction
936 Value *Result = Builder.Insert(NewCast);
937
938 replaceValue(I, *Result);
939 return true;
940}
941
942/// Match:
943// bitop(castop(x), C) ->
944// bitop(castop(x), castop(InvC)) ->
945// castop(bitop(x, InvC))
946// Supports: bitcast
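// Illustrative IR sketch (not from the original source); names are
// hypothetical and InvC is simply the constant re-expressed in the source
// type (here <2 x i32> <i32 -1, i32 -1>):
//   %b = bitcast <2 x i32> %x to <4 x i16>
//   %r = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// -->
//   %a = xor <2 x i32> %x, <i32 -1, i32 -1>
//   %r = bitcast <2 x i32> %a to <4 x i16>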
947bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
948 Value *LHS;
949 Constant *C;
950
951 // Check if this is a bitwise logic operation
952 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
953 return false;
954
955 // Get the cast instructions
956 auto *LHSCast = dyn_cast<CastInst>(LHS);
957 if (!LHSCast)
958 return false;
959
960 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
961
962 // Only handle supported cast operations
963 switch (CastOpcode) {
964 case Instruction::BitCast:
965 case Instruction::ZExt:
966 case Instruction::SExt:
967 case Instruction::Trunc:
968 break;
969 default:
970 return false;
971 }
972
973 Value *LHSSrc = LHSCast->getOperand(0);
974
975 auto *SrcTy = LHSSrc->getType();
976 auto *DstTy = I.getType();
977 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
978 // Other casts only handle vector types with integer elements.
979 if (CastOpcode != Instruction::BitCast &&
980 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
981 return false;
982
983 // Only integer scalar/vector values are legal for bitwise logic operations.
984 if (!SrcTy->getScalarType()->isIntegerTy() ||
985 !DstTy->getScalarType()->isIntegerTy())
986 return false;
987
988 // Find the constant InvC, such that castop(InvC) equals to C.
989 PreservedCastFlags RHSFlags;
990 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
991 if (!InvC)
992 return false;
993
994 // Cost Check :
995 // OldCost = bitlogic + cast
996 // NewCost = bitlogic + cast
997
998 // Calculate specific costs for each cast with instruction context
999 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1000 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1001
1002 InstructionCost OldCost =
1003 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1004
1005 // For new cost, we can't provide an instruction (it doesn't exist yet)
1006 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1007 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1008
1009 InstructionCost NewCost =
1010 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1011 GenericCastCost;
1012
1013 // Account for multi-use casts using specific costs
1014 if (!LHSCast->hasOneUse())
1015 NewCost += LHSCastCost;
1016
1017 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1018 << " NewCost=" << NewCost << "\n");
1019
1020 if (NewCost > OldCost)
1021 return false;
1022
1023 // Create the operation on the source type
1024 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1025 LHSSrc, InvC, I.getName() + ".inner");
1026 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1027 NewBinOp->copyIRFlags(&I);
1028
1029 Worklist.pushValue(NewOp);
1030
1031 // Create the cast operation directly to ensure we get a new instruction
1032 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1033
1034 // Insert the new instruction
1035 Value *Result = Builder.Insert(NewCast);
1036
1037 replaceValue(I, *Result);
1038 return true;
1039}
1040
1041/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1042/// destination type followed by shuffle. This can enable further transforms by
1043/// moving bitcasts or shuffles together.
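/// Illustrative IR sketch (not from the original source); names are
/// hypothetical, with the mask narrowed for the i64 -> i32 element split:
///   %s = shufflevector <2 x i64> %v, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
///   %r = bitcast <2 x i64> %s to <4 x i32>
/// -->
///   %b = bitcast <2 x i64> %v to <4 x i32>
///   %r = shufflevector <4 x i32> %b, <4 x i32> poison,
///                      <4 x i32> <i32 2, i32 3, i32 0, i32 1>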
1044bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1045 Value *V0, *V1;
1046 ArrayRef<int> Mask;
1047 if (!match(&I, m_BitCast(m_OneUse(
1048 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1049 return false;
1050
1051 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1052 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1053 // mask for scalable type is a splat or not.
1054 // 2) Disallow non-vector casts.
1055 // TODO: We could allow any shuffle.
1056 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1057 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1058 if (!DestTy || !SrcTy)
1059 return false;
1060
1061 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1062 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1063 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1064 return false;
1065
1066 bool IsUnary = isa<UndefValue>(V1);
1067
1068 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1069 // if it won't increase the number of bitcasts.
1070 if (!IsUnary) {
1071 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1072 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1073 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1074 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1075 return false;
1076 }
1077
1078 SmallVector<int, 16> NewMask;
1079 if (DestEltSize <= SrcEltSize) {
1080 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1081 // always be expanded to the equivalent form choosing narrower elements.
1082 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1083 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1084 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1085 } else {
1086 // The bitcast is from narrow elements to wide elements. The shuffle mask
1087 // must choose consecutive elements to allow casting first.
1088 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1089 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1090 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1091 return false;
1092 }
1093
1094 // Bitcast the shuffle src - keep its original width but using the destination
1095 // scalar type.
1096 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1097 auto *NewShuffleTy =
1098 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1099 auto *OldShuffleTy =
1100 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1101 unsigned NumOps = IsUnary ? 1 : 2;
1102
1103 // The new shuffle must not cost more than the old shuffle.
1104 TargetTransformInfo::ShuffleKind SK =
1105 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1106 : TargetTransformInfo::SK_PermuteTwoSrc;
1107
1108 InstructionCost NewCost =
1109 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1110 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1111 TargetTransformInfo::CastContextHint::None,
1112 CostKind));
1113 InstructionCost OldCost =
1114 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1115 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1116 TargetTransformInfo::CastContextHint::None,
1117 CostKind);
1118
1119 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1120 << OldCost << " vs NewCost: " << NewCost << "\n");
1121
1122 if (NewCost > OldCost || !NewCost.isValid())
1123 return false;
1124
1125 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1126 ++NumShufOfBitcast;
1127 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1128 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1129 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1130 replaceValue(I, *Shuf);
1131 return true;
1132}
1133
1134/// VP Intrinsics whose vector operands are both splat values may be simplified
1135/// into the scalar version of the operation and the result splatted. This
1136/// can lead to scalarization down the line.
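/// Illustrative IR sketch (not from the original source); names are
/// hypothetical, %xs/%ys are splats of the scalars %x/%y, and the mask is
/// assumed to be all-true:
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xs, <4 x i32> %ys,
///                                          <4 x i1> %alltrue, i32 %evl)
/// -->
///   %s = add i32 %x, %y
///   %r = <4 x i32> splat of %s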
1137bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1138 if (!isa<VPIntrinsic>(I))
1139 return false;
1140 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1141 Value *Op0 = VPI.getArgOperand(0);
1142 Value *Op1 = VPI.getArgOperand(1);
1143
1144 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1145 return false;
1146
1147 // Check getSplatValue early in this function, to avoid doing unnecessary
1148 // work.
1149 Value *ScalarOp0 = getSplatValue(Op0);
1150 Value *ScalarOp1 = getSplatValue(Op1);
1151 if (!ScalarOp0 || !ScalarOp1)
1152 return false;
1153
1154 // For the binary VP intrinsics supported here, the result on disabled lanes
1155 // is a poison value. For now, only do this simplification if all lanes
1156 // are active.
1157 // TODO: Relax the condition that all lanes are active by using insertelement
1158 // on inactive lanes.
1159 auto IsAllTrueMask = [](Value *MaskVal) {
1160 if (Value *SplattedVal = getSplatValue(MaskVal))
1161 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1162 return ConstValue->isAllOnesValue();
1163 return false;
1164 };
1165 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1166 return false;
1167
1168 // Check to make sure we support scalarization of the intrinsic
1169 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1170 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1171 return false;
1172
1173 // Calculate cost of splatting both operands into vectors and the vector
1174 // intrinsic
1175 VectorType *VecTy = cast<VectorType>(VPI.getType());
1176 SmallVector<int> Mask;
1177 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1178 Mask.resize(FVTy->getNumElements(), 0);
1179 InstructionCost SplatCost =
1180 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1181 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1182 CostKind);
1183
1184 // Calculate the cost of the VP Intrinsic
1185 SmallVector<Type *> Args;
1186 for (Value *V : VPI.args())
1187 Args.push_back(V->getType());
1188 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1189 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1190 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1191
1192 // Determine scalar opcode
1193 std::optional<unsigned> FunctionalOpcode =
1194 VPI.getFunctionalOpcode();
1195 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1196 if (!FunctionalOpcode) {
1197 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1198 if (!ScalarIntrID)
1199 return false;
1200 }
1201
1202 // Calculate cost of scalarizing
1203 InstructionCost ScalarOpCost = 0;
1204 if (ScalarIntrID) {
1205 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1206 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1207 } else {
1208 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1209 VecTy->getScalarType(), CostKind);
1210 }
1211
1212 // The existing splats may be kept around if other instructions use them.
1213 InstructionCost CostToKeepSplats =
1214 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1215 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1216
1217 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1218 << "\n");
1219 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1220 << ", Cost of scalarizing:" << NewCost << "\n");
1221
1222 // We want to scalarize unless the vector variant actually has lower cost.
1223 if (OldCost < NewCost || !NewCost.isValid())
1224 return false;
1225
1226 // Scalarize the intrinsic
1227 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1228 Value *EVL = VPI.getArgOperand(3);
1229
1230 // If the VP op might introduce UB or poison, we can scalarize it provided
1231 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1232 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1233 // scalarizing it.
1234 bool SafeToSpeculate;
1235 if (ScalarIntrID)
1236 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1237 .hasAttribute(Attribute::AttrKind::Speculatable);
1238 else
1239 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1240 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1241 if (!SafeToSpeculate &&
1242 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1243 return false;
1244
1245 Value *ScalarVal =
1246 ScalarIntrID
1247 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1248 {ScalarOp0, ScalarOp1})
1249 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1250 ScalarOp0, ScalarOp1);
1251
1252 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1253 return true;
1254}
1255
1256/// Match a vector op/compare/intrinsic with at least one
1257/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1258/// by insertelement.
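/// Illustrative IR sketch (not from the original source); names are
/// hypothetical:
///   %i0 = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %x, i64 0
///   %i1 = insertelement <4 x i32> <i32 4, i32 5, i32 6, i32 7>, i32 %y, i64 0
///   %r  = add <4 x i32> %i0, %i1
/// -->
///   %s = add i32 %x, %y
///   %r = insertelement <4 x i32> <i32 4, i32 6, i32 8, i32 10>, i32 %s, i64 0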
1259bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1260 auto *UO = dyn_cast<UnaryOperator>(&I);
1261 auto *BO = dyn_cast<BinaryOperator>(&I);
1262 auto *CI = dyn_cast<CmpInst>(&I);
1263 auto *II = dyn_cast<IntrinsicInst>(&I);
1264 if (!UO && !BO && !CI && !II)
1265 return false;
1266
1267 // TODO: Allow intrinsics with different argument types
1268 if (II) {
1269 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1270 return false;
1271 for (auto [Idx, Arg] : enumerate(II->args()))
1272 if (Arg->getType() != II->getType() &&
1273 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1274 return false;
1275 }
1276
1277 // Do not convert the vector condition of a vector select into a scalar
1278 // condition. That may cause problems for codegen because of differences in
1279 // boolean formats and register-file transfers.
1280 // TODO: Can we account for that in the cost model?
1281 if (CI)
1282 for (User *U : I.users())
1283 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1284 return false;
1285
1286 // Match constant vectors or scalars being inserted into constant vectors:
1287 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1288 SmallVector<Value *> VecCs, ScalarOps;
1289 std::optional<uint64_t> Index;
1290
1291 auto Ops = II ? II->args() : I.operands();
1292 for (auto [OpNum, Op] : enumerate(Ops)) {
1293 Constant *VecC;
1294 Value *V;
1295 uint64_t InsIdx = 0;
1296 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1297 m_ConstantInt(InsIdx)))) {
1298 // Bail if any inserts are out of bounds.
1299 VectorType *OpTy = cast<VectorType>(Op->getType());
1300 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1301 return false;
1302 // All inserts must have the same index.
1303 // TODO: Deal with mismatched index constants and variable indexes?
1304 if (!Index)
1305 Index = InsIdx;
1306 else if (InsIdx != *Index)
1307 return false;
1308 VecCs.push_back(VecC);
1309 ScalarOps.push_back(V);
1310 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1311 OpNum, &TTI)) {
1312 VecCs.push_back(Op.get());
1313 ScalarOps.push_back(Op.get());
1314 } else if (match(Op.get(), m_Constant(VecC))) {
1315 VecCs.push_back(VecC);
1316 ScalarOps.push_back(nullptr);
1317 } else {
1318 return false;
1319 }
1320 }
1321
1322 // Bail if all operands are constant.
1323 if (!Index.has_value())
1324 return false;
1325
1326 VectorType *VecTy = cast<VectorType>(I.getType());
1327 Type *ScalarTy = VecTy->getScalarType();
1328 assert(VecTy->isVectorTy() &&
1329 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1330 ScalarTy->isPointerTy()) &&
1331 "Unexpected types for insert element into binop or cmp");
1332
1333 unsigned Opcode = I.getOpcode();
1334 InstructionCost ScalarOpCost, VectorOpCost;
1335 if (CI) {
1336 CmpInst::Predicate Pred = CI->getPredicate();
1337 ScalarOpCost = TTI.getCmpSelInstrCost(
1338 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1339 VectorOpCost = TTI.getCmpSelInstrCost(
1340 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1341 } else if (UO || BO) {
1342 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1343 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1344 } else {
1345 IntrinsicCostAttributes ScalarICA(
1346 II->getIntrinsicID(), ScalarTy,
1347 SmallVector<Type *>(II->arg_size(), ScalarTy));
1348 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1349 IntrinsicCostAttributes VectorICA(
1350 II->getIntrinsicID(), VecTy,
1351 SmallVector<Type *>(II->arg_size(), VecTy));
1352 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1353 }
1354
1355 // Fold the vector constants in the original vectors into a new base vector to
1356 // get more accurate cost modelling.
1357 Value *NewVecC = nullptr;
1358 if (CI)
1359 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1360 else if (UO)
1361 NewVecC =
1362 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1363 else if (BO)
1364 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1365 else if (II)
1366 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1367
1368 if (!NewVecC)
1369 return false;
1370
1371 // Get cost estimate for the insert element. This cost will factor into
1372 // both sequences.
1373 InstructionCost OldCost = VectorOpCost;
1374 InstructionCost NewCost =
1375 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1376 CostKind, *Index, NewVecC);
1377
1378 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1379 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1380 II->getIntrinsicID(), Idx, &TTI)))
1381 continue;
1382 InstructionCost InsertCost = TTI.getVectorInstrCost(
1383 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1384 OldCost += InsertCost;
1385 NewCost += !Op->hasOneUse() * InsertCost;
1386 }
1387
1388 // We want to scalarize unless the vector variant actually has lower cost.
1389 if (OldCost < NewCost || !NewCost.isValid())
1390 return false;
1391
1392 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1393 // inselt NewVecC, (scalar_op V0, V1), Index
1394 if (CI)
1395 ++NumScalarCmp;
1396 else if (UO || BO)
1397 ++NumScalarOps;
1398 else
1399 ++NumScalarIntrinsic;
1400
1401 // For constant cases, extract the scalar element, this should constant fold.
1402 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1403 if (!Scalar)
1404 ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
1405 cast<Constant>(VecC), Builder.getInt64(*Index));
1406
1407 Value *Scalar;
1408 if (CI)
1409 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1410 else if (UO || BO)
1411 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1412 else
1413 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1414
1415 Scalar->setName(I.getName() + ".scalar");
1416
1417 // All IR flags are safe to back-propagate. There is no potential for extra
1418 // poison to be created by the scalar instruction.
1419 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1420 ScalarInst->copyIRFlags(&I);
1421
1422 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1423 replaceValue(I, *Insert);
1424 return true;
1425}
1426
1427/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1428/// a vector into vector operations followed by extract. Note: The SLP pass
1429/// may miss this pattern because of implementation problems.
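/// Illustrative IR sketch (not from the original source); names and the choice
/// of which extract gets shuffled are hypothetical:
///   %e0 = extractelement <4 x i32> %x, i64 0
///   %e1 = extractelement <4 x i32> %x, i64 1
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 99
///   %r  = and i1 %c0, %c1
/// -->
///   %vc = icmp sgt <4 x i32> %x, <i32 42, i32 99, i32 poison, i32 poison>
///   %sh = shufflevector <4 x i1> %vc, <4 x i1> poison,
///                       <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %va = and <4 x i1> %vc, %sh
///   %r  = extractelement <4 x i1> %va, i64 0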
1430bool VectorCombine::foldExtractedCmps(Instruction &I) {
1431 auto *BI = dyn_cast<BinaryOperator>(&I);
1432
1433 // We are looking for a scalar binop of booleans.
1434 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1435 if (!BI || !I.getType()->isIntegerTy(1))
1436 return false;
1437
1438 // The compare predicates should match, and each compare should have a
1439 // constant operand.
1440 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1441 Instruction *I0, *I1;
1442 Constant *C0, *C1;
1443 CmpPredicate P0, P1;
1444 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1445 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1446 return false;
1447
1448 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1449 if (!MatchingPred)
1450 return false;
1451
1452 // The compare operands must be extracts of the same vector with constant
1453 // extract indexes.
1454 Value *X;
1455 uint64_t Index0, Index1;
1456 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1457 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1458 return false;
1459
1460 auto *Ext0 = cast<ExtractElementInst>(I0);
1461 auto *Ext1 = cast<ExtractElementInst>(I1);
1462 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1463 if (!ConvertToShuf)
1464 return false;
1465 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1466 "Unknown ExtractElementInst");
1467
1468 // The original scalar pattern is:
1469 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1470 CmpInst::Predicate Pred = *MatchingPred;
1471 unsigned CmpOpcode =
1472 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1473 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1474 if (!VecTy)
1475 return false;
1476
1477 InstructionCost Ext0Cost =
1478 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1479 InstructionCost Ext1Cost =
1480 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1481 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1482 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1483 CostKind);
1484
1485 InstructionCost OldCost =
1486 Ext0Cost + Ext1Cost + CmpCost * 2 +
1487 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1488
1489 // The proposed vector pattern is:
1490 // vcmp = cmp Pred X, VecC
1491 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1492 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1493 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1494 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
1495 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1496 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1497 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1498 ShufMask[CheapIndex] = ExpensiveIndex;
1499 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1500 CmpTy, ShufMask, CostKind);
1501 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1502 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1503 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1504 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1505
1506 // Aggressively form vector ops if the cost is equal because the transform
1507 // may enable further optimization.
1508 // Codegen can reverse this transform (scalarize) if it was not profitable.
1509 if (OldCost < NewCost || !NewCost.isValid())
1510 return false;
1511
1512 // Create a vector constant from the 2 scalar constants.
1513 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1514 PoisonValue::get(VecTy->getElementType()));
1515 CmpC[Index0] = C0;
1516 CmpC[Index1] = C1;
1517 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1518 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1519 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1520 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1521 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1522 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1523 replaceValue(I, *NewExt);
1524 ++NumVecCmpBO;
1525 return true;
1526}
1527
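// Editorial summary of the helper below: CostBeforeReduction receives the cost
// of the instructions that produce the reduced vector (extends and/or the
// multiply), and CostAfterReduction receives the cost of the reduction itself,
// using the fused extended-reduction or multiply-accumulate reduction costs
// where the operand pattern allows it.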
1528static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1529 TTI::TargetCostKind CostKind,
1530 const TargetTransformInfo &TTI,
1531 InstructionCost &CostBeforeReduction,
1532 InstructionCost &CostAfterReduction) {
1533 Instruction *Op0, *Op1;
1534 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1535 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1536 unsigned ReductionOpc =
1537 getArithmeticReductionInstruction(II.getIntrinsicID());
1538 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1539 bool IsUnsigned = isa<ZExtInst>(RedOp);
1540 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1541
1542 CostBeforeReduction =
1543 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1544 TTI::CastContextHint::None, CostKind);
1545 CostAfterReduction =
1546 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1547 ExtType, FastMathFlags(), CostKind);
1548 return;
1549 }
1550 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1551 match(RedOp,
1552 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1553 match(Op0, m_ZExtOrSExt(m_Value())) &&
1554 Op0->getOpcode() == Op1->getOpcode() &&
1555 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1556 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1557 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1558 bool IsUnsigned = isa<ZExtInst>(Op0);
1559 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1560 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1561
1562 InstructionCost ExtCost =
1563 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1564 TTI::CastContextHint::None, CostKind);
1565 InstructionCost MulCost =
1566 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1567 InstructionCost Ext2Cost =
1568 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1569 TTI::CastContextHint::None, CostKind);
1570
1571 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1572 CostAfterReduction = TTI.getMulAccReductionCost(
1573 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1574 return;
1575 }
1576 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1577 std::nullopt, CostKind);
1578}
1579
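// Fold a binary op of two same-kind reductions over equally typed vectors into
// a single reduction of a vector binary op. Illustrative sketch (the values
// are made up):
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %s  = add i32 %r0, %r1
// -->
//   %v  = add <4 x i32> %a, %b
//   %s  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
// A scalar 'sub' of two reduce.add calls is handled the same way, with a
// vector 'sub' feeding the reduce.add.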
1580bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1581 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1582 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1583 if (BinOpOpc == Instruction::Sub)
1584 ReductionIID = Intrinsic::vector_reduce_add;
1585 if (ReductionIID == Intrinsic::not_intrinsic)
1586 return false;
1587
1588 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1589 Intrinsic::ID IID) -> Value * {
1590 auto *II = dyn_cast<IntrinsicInst>(V);
1591 if (!II)
1592 return nullptr;
1593 if (II->getIntrinsicID() == IID && II->hasOneUse())
1594 return II->getArgOperand(0);
1595 return nullptr;
1596 };
1597
1598 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1599 if (!V0)
1600 return false;
1601 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1602 if (!V1)
1603 return false;
1604
1605 auto *VTy = cast<VectorType>(V0->getType());
1606 if (V1->getType() != VTy)
1607 return false;
1608 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1609 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1610 unsigned ReductionOpc =
1611 getArithmeticReductionInstruction(II0.getIntrinsicID());
1612
1613 InstructionCost OldCost = 0;
1614 InstructionCost NewCost = 0;
1615 InstructionCost CostOfRedOperand0 = 0;
1616 InstructionCost CostOfRed0 = 0;
1617 InstructionCost CostOfRedOperand1 = 0;
1618 InstructionCost CostOfRed1 = 0;
1619 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1620 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1621 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1622 NewCost =
1623 CostOfRedOperand0 + CostOfRedOperand1 +
1624 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1625 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1626 if (NewCost >= OldCost || !NewCost.isValid())
1627 return false;
1628
1629 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1630 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1631 << "\n");
1632 Value *VectorBO;
1633 if (BinOpOpc == Instruction::Or)
1634 VectorBO = Builder.CreateOr(V0, V1, "",
1635 cast<PossiblyDisjointInst>(I).isDisjoint());
1636 else
1637 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1638
1639 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1640 replaceValue(I, *Rdx);
1641 return true;
1642}
1643
1644// Check if the memory location is modified between two instructions in the same BB.
1645static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1646 BasicBlock::iterator End,
1647 const MemoryLocation &Loc, AAResults &AA) {
1648 unsigned NumScanned = 0;
1649 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1650 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1651 ++NumScanned > MaxInstrsToScan;
1652 });
1653}
1654
1655namespace {
1656/// Helper class to indicate whether a vector index can be safely scalarized and
1657/// if a freeze needs to be inserted.
1658class ScalarizationResult {
1659 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1660
1661 StatusTy Status;
1662 Value *ToFreeze;
1663
1664 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1665 : Status(Status), ToFreeze(ToFreeze) {}
1666
1667public:
1668 ScalarizationResult(const ScalarizationResult &Other) = default;
1669 ~ScalarizationResult() {
1670 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1671 }
1672
1673 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1674 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1675 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1676 return {StatusTy::SafeWithFreeze, ToFreeze};
1677 }
1678
1679 /// Returns true if the index can be scalarized without requiring a freeze.
1680 bool isSafe() const { return Status == StatusTy::Safe; }
1681 /// Returns true if the index cannot be scalarized.
1682 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1683 /// Returns true if the index can be scalarized, but requires inserting a
1684 /// freeze.
1685 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1686
1687 /// Reset the state to Unsafe and clear ToFreeze if set.
1688 void discard() {
1689 ToFreeze = nullptr;
1690 Status = StatusTy::Unsafe;
1691 }
1692
1693 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1694 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1695 assert(isSafeWithFreeze() &&
1696 "should only be used when freezing is required");
1697 assert(is_contained(ToFreeze->users(), &UserI) &&
1698 "UserI must be a user of ToFreeze");
1699 IRBuilder<>::InsertPointGuard Guard(Builder);
1700 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1701 Value *Frozen =
1702 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1703 for (Use &U : make_early_inc_range((UserI.operands())))
1704 if (U.get() == ToFreeze)
1705 U.set(Frozen);
1706
1707 ToFreeze = nullptr;
1708 }
1709};
1710} // namespace
1711
1712/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1713/// Idx. \p Idx must access a valid vector element.
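/// For example (illustrative): an index "urem i64 %i, 4" into a <4 x i32>
/// vector is safe-with-freeze, because once %i is frozen the urem restricts
/// the index to the constant range [0, 4), which stays in bounds.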
1714static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1715 Instruction *CtxI,
1716 AssumptionCache &AC,
1717 const DominatorTree &DT) {
1718 // We do checks for both fixed vector types and scalable vector types.
1719 // This is the number of elements of fixed vector types,
1720 // or the minimum number of elements of scalable vector types.
1721 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1722 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1723
1724 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1725 if (C->getValue().ult(NumElements))
1726 return ScalarizationResult::safe();
1727 return ScalarizationResult::unsafe();
1728 }
1729
1730 // Always unsafe if the index type can't handle all inbound values.
1731 if (!llvm::isUIntN(IntWidth, NumElements))
1732 return ScalarizationResult::unsafe();
1733
1734 APInt Zero(IntWidth, 0);
1735 APInt MaxElts(IntWidth, NumElements);
1736 ConstantRange ValidIndices(Zero, MaxElts);
1737 ConstantRange IdxRange(IntWidth, true);
1738
1739 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1740 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1741 true, &AC, CtxI, &DT)))
1742 return ScalarizationResult::safe();
1743 return ScalarizationResult::unsafe();
1744 }
1745
1746 // If the index may be poison, check if we can insert a freeze before the
1747 // range of the index is restricted.
1748 Value *IdxBase;
1749 ConstantInt *CI;
1750 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1751 IdxRange = IdxRange.binaryAnd(CI->getValue());
1752 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1753 IdxRange = IdxRange.urem(CI->getValue());
1754 }
1755
1756 if (ValidIndices.contains(IdxRange))
1757 return ScalarizationResult::safeWithFreeze(IdxBase);
1758 return ScalarizationResult::unsafe();
1759}
1760
1761/// The memory operation on a vector of \p ScalarType had alignment of
1762/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1763/// alignment that will be valid for the memory operation on a single scalar
1764/// element of the same type with index \p Idx.
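/// For example (illustrative): a <4 x i32> access with alignment 16 that is
/// scalarized at constant index 2 yields commonAlignment(16, 2 * 4) = 8, while
/// an unknown index conservatively falls back to commonAlignment(16, 4) = 4.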
1765static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1766 Type *ScalarType, Value *Idx,
1767 const DataLayout &DL) {
1768 if (auto *C = dyn_cast<ConstantInt>(Idx))
1769 return commonAlignment(VectorAlignment,
1770 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1771 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1772}
1773
1774// Combine patterns like:
1775// %0 = load <4 x i32>, <4 x i32>* %a
1776// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1777// store <4 x i32> %1, <4 x i32>* %a
1778// to:
1779// %0 = bitcast <4 x i32>* %a to i32*
1780// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1781// store i32 %b, i32* %1
1782bool VectorCombine::foldSingleElementStore(Instruction &I) {
1783 if (!TTI.allowVectorElementIndexingUsingGEP())
1784 return false;
1785 auto *SI = cast<StoreInst>(&I);
1786 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1787 return false;
1788
1789 // TODO: Combine more complicated patterns (multiple insert) by referencing
1790 // TargetTransformInfo.
1791 Instruction *Source;
1792 Value *NewElement;
1793 Value *Idx;
1794 if (!match(SI->getValueOperand(),
1795 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1796 m_Value(Idx))))
1797 return false;
1798
1799 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1800 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1801 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1802 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1803 // modified between them, the element type's size matches its store size, and the index is in bounds.
1804 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1805 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1806 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1807 return false;
1808
1809 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1810 if (ScalarizableIdx.isUnsafe() ||
1811 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1812 MemoryLocation::get(SI), AA))
1813 return false;
1814
1815 // Ensure we add the load back to the worklist BEFORE its users so they can
1816 // be erased in the correct order.
1817 Worklist.push(Load);
1818
1819 if (ScalarizableIdx.isSafeWithFreeze())
1820 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1821 Value *GEP = Builder.CreateInBoundsGEP(
1822 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1823 {ConstantInt::get(Idx->getType(), 0), Idx});
1824 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1825 NSI->copyMetadata(*SI);
1826 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1827 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1828 *DL);
1829 NSI->setAlignment(ScalarOpAlignment);
1830 replaceValue(I, *NSI);
1831 eraseInstruction(I);
1832 return true;
1833 }
1834
1835 return false;
1836}
1837
1838/// Try to scalarize vector loads feeding extractelement instructions.
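/// For example (illustrative sketch, assuming the cost model agrees):
///   %v = load <4 x i32>, ptr %p
///   %e = extractelement <4 x i32> %v, i64 1
/// -->
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 1
///   %e = load i32, ptr %gep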
1839bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1840 if (!TTI.allowVectorElementIndexingUsingGEP())
1841 return false;
1842
1843 Value *Ptr;
1844 if (!match(&I, m_Load(m_Value(Ptr))))
1845 return false;
1846
1847 auto *LI = cast<LoadInst>(&I);
1848 auto *VecTy = cast<VectorType>(LI->getType());
1849 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1850 return false;
1851
1852 InstructionCost OriginalCost =
1853 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1854 LI->getPointerAddressSpace(), CostKind);
1855 InstructionCost ScalarizedCost = 0;
1856
1857 Instruction *LastCheckedInst = LI;
1858 unsigned NumInstChecked = 0;
1859 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1860 auto FailureGuard = make_scope_exit([&]() {
1861 // If the transform is aborted, discard the ScalarizationResults.
1862 for (auto &Pair : NeedFreeze)
1863 Pair.second.discard();
1864 });
1865
1866 // Check if all users of the load are extracts with no memory modifications
1867 // between the load and the extract. Compute the cost of both the original
1868 // code and the scalarized version.
1869 for (User *U : LI->users()) {
1870 auto *UI = dyn_cast<ExtractElementInst>(U);
1871 if (!UI || UI->getParent() != LI->getParent())
1872 return false;
1873
1874 // If any extract is waiting to be erased, then bail out as this will
1875 // distort the cost calculation and possibly lead to infinite loops.
1876 if (UI->use_empty())
1877 return false;
1878
1879 // Check if any instruction between the load and the extract may modify
1880 // memory.
1881 if (LastCheckedInst->comesBefore(UI)) {
1882 for (Instruction &I :
1883 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1884 // Bail out if we reached the check limit or the instruction may write
1885 // to memory.
1886 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1887 return false;
1888 NumInstChecked++;
1889 }
1890 LastCheckedInst = UI;
1891 }
1892
1893 auto ScalarIdx =
1894 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1895 if (ScalarIdx.isUnsafe())
1896 return false;
1897 if (ScalarIdx.isSafeWithFreeze()) {
1898 NeedFreeze.try_emplace(UI, ScalarIdx);
1899 ScalarIdx.discard();
1900 }
1901
1902 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1903 OriginalCost +=
1904 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1905 Index ? Index->getZExtValue() : -1);
1906 ScalarizedCost +=
1907 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1908 Align(1), LI->getPointerAddressSpace(), CostKind);
1909 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1910 nullptr, nullptr, CostKind);
1911 }
1912
1913 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
1914 << "\n LoadExtractCost: " << OriginalCost
1915 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1916
1917 if (ScalarizedCost >= OriginalCost)
1918 return false;
1919
1920 // Ensure we add the load back to the worklist BEFORE its users so they can
1921 // be erased in the correct order.
1922 Worklist.push(LI);
1923
1924 Type *ElemType = VecTy->getElementType();
1925
1926 // Replace extracts with narrow scalar loads.
1927 for (User *U : LI->users()) {
1928 auto *EI = cast<ExtractElementInst>(U);
1929 Value *Idx = EI->getIndexOperand();
1930
1931 // Insert 'freeze' for poison indexes.
1932 auto It = NeedFreeze.find(EI);
1933 if (It != NeedFreeze.end())
1934 It->second.freeze(Builder, *cast<Instruction>(Idx));
1935
1936 Builder.SetInsertPoint(EI);
1937 Value *GEP =
1938 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1939 auto *NewLoad = cast<LoadInst>(
1940 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1941
1942 Align ScalarOpAlignment =
1943 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1944 NewLoad->setAlignment(ScalarOpAlignment);
1945
1946 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1947 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1948 AAMDNodes OldAAMD = LI->getAAMetadata();
1949 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1950 }
1951
1952 replaceValue(*EI, *NewLoad, false);
1953 }
1954
1955 FailureGuard.release();
1956 return true;
1957}
1958
1959bool VectorCombine::scalarizeExtExtract(Instruction &I) {
1960 if (!TTI.allowVectorElementIndexingUsingGEP())
1961 return false;
1962 auto *Ext = dyn_cast<ZExtInst>(&I);
1963 if (!Ext)
1964 return false;
1965
1966 // Try to convert a vector zext feeding only extracts to a set of scalar
1967 // (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
1968 // operations, if profitable.
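  // Illustrative sketch (assuming a little-endian target and that the scalar
  // form is cheaper):
  //   %ext = zext <4 x i8> %src to <4 x i32>
  //   %e1  = extractelement <4 x i32> %ext, i64 1
  // -->
  //   %s   = bitcast <4 x i8> %src to i32  ; %src is frozen first if it may be poison
  //   %sh  = lshr i32 %s, 8
  //   %e1  = and i32 %sh, 255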
1969 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
1970 if (!SrcTy)
1971 return false;
1972 auto *DstTy = cast<FixedVectorType>(Ext->getType());
1973
1974 Type *ScalarDstTy = DstTy->getElementType();
1975 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
1976 return false;
1977
1978 InstructionCost VectorCost =
1979 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
1980 TTI::CastContextHint::None, CostKind);
1981 unsigned ExtCnt = 0;
1982 bool ExtLane0 = false;
1983 for (User *U : Ext->users()) {
1984 uint64_t Idx;
1985 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
1986 return false;
1987 if (cast<Instruction>(U)->use_empty())
1988 continue;
1989 ExtCnt += 1;
1990 ExtLane0 |= !Idx;
1991 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
1992 CostKind, Idx, U);
1993 }
1994
1995 InstructionCost ScalarCost =
1996 ExtCnt * TTI.getArithmeticInstrCost(
1997 Instruction::And, ScalarDstTy, CostKind,
1998 {TTI::OK_AnyValue, TTI::OP_None},
1999 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2000 (ExtCnt - ExtLane0) *
2001 TTI.getArithmeticInstrCost(
2002 Instruction::LShr, ScalarDstTy, CostKind,
2003 {TTI::OK_AnyValue, TTI::OP_None},
2004 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2005 if (ScalarCost > VectorCost)
2006 return false;
2007
2008 Value *ScalarV = Ext->getOperand(0);
2009 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2010 &DT))
2011 ScalarV = Builder.CreateFreeze(ScalarV);
2012 ScalarV = Builder.CreateBitCast(
2013 ScalarV,
2014 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2015 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2016 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2017 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2018 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2019 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2020 for (User *U : Ext->users()) {
2021 auto *Extract = cast<ExtractElementInst>(U);
2022 uint64_t Idx =
2023 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2024 uint64_t ShiftAmt =
2025 DL->isBigEndian()
2026 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2027 : (Idx * SrcEltSizeInBits);
2028 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2029 Value *And = Builder.CreateAnd(LShr, Mask);
2030 U->replaceAllUsesWith(And);
2031 }
2032 return true;
2033}
2034
2035/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2036/// to "(bitcast (concat X, Y))"
2037/// where X/Y are bitcasted from i1 mask vectors.
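/// For example (illustrative sketch): with two <8 x i1> masks bitcast to i8 and
/// zero-extended to i16, "or (zext (bitcast X)), (shl (zext (bitcast Y)), 8)"
/// becomes a single <16 x i1> concat shuffle of X and Y bitcast to i16,
/// provided the concatenation is cheap for the target.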
2038bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2039 Type *Ty = I.getType();
2040 if (!Ty->isIntegerTy())
2041 return false;
2042
2043 // TODO: Add big endian test coverage
2044 if (DL->isBigEndian())
2045 return false;
2046
2047 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2048 Instruction *X, *Y;
2049 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2050 return false;
2051
2052 // Allow both sources to contain shl, to handle more generic pattern:
2053 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2054 Value *SrcX;
2055 uint64_t ShAmtX = 0;
2056 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2057 !match(X, m_OneUse(
2058 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2059 m_ConstantInt(ShAmtX)))))
2060 return false;
2061
2062 Value *SrcY;
2063 uint64_t ShAmtY = 0;
2064 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2065 !match(Y, m_OneUse(
2066 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2067 m_ConstantInt(ShAmtY)))))
2068 return false;
2069
2070 // Canonicalize larger shift to the RHS.
2071 if (ShAmtX > ShAmtY) {
2072 std::swap(X, Y);
2073 std::swap(SrcX, SrcY);
2074 std::swap(ShAmtX, ShAmtY);
2075 }
2076
2077 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2078 // difference is the mask width so they can be easily concatenated together.
2079 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2080 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2081 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2082 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2083 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2084 !MaskTy->getElementType()->isIntegerTy(1) ||
2085 MaskTy->getNumElements() != ShAmtDiff ||
2086 MaskTy->getNumElements() > (BitWidth / 2))
2087 return false;
2088
2089 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2090 auto *ConcatIntTy =
2091 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2092 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2093
2094 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2095 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2096
2097 // TODO: Is it worth supporting multi use cases?
2098 InstructionCost OldCost = 0;
2099 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2100 OldCost +=
2101 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2102 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2103 TTI::CastContextHint::None, CostKind);
2104 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2105 TTI::CastContextHint::None, CostKind);
2106
2107 InstructionCost NewCost = 0;
2108 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2109 MaskTy, ConcatMask, CostKind);
2110 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2111 TTI::CastContextHint::None, CostKind);
2112 if (Ty != ConcatIntTy)
2113 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2114 TTI::CastContextHint::None, CostKind);
2115 if (ShAmtX > 0)
2116 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2117
2118 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2119 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2120 << "\n");
2121
2122 if (NewCost > OldCost)
2123 return false;
2124
2125 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2126 // any residual zero-extension or shifting.
2127 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2128 Worklist.pushValue(Concat);
2129
2130 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2131
2132 if (Ty != ConcatIntTy) {
2133 Worklist.pushValue(Result);
2134 Result = Builder.CreateZExt(Result, Ty);
2135 }
2136
2137 if (ShAmtX > 0) {
2138 Worklist.pushValue(Result);
2139 Result = Builder.CreateShl(Result, ShAmtX);
2140 }
2141
2142 replaceValue(I, *Result);
2143 return true;
2144}
2145
2146/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2147/// --> "binop (shuffle), (shuffle)".
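/// For example (illustrative sketch):
///   %s0 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
///   %s1 = shufflevector <2 x i32> %c, <2 x i32> %d, <2 x i32> <i32 0, i32 2>
///   %b0 = add <2 x i32> %s0, %s1
///   %r  = shufflevector <2 x i32> %b0, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
/// can fold the outer permute into the inner shuffles, leaving one shuffle per
/// operand feeding the add.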
2148bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2149 BinaryOperator *BinOp;
2150 ArrayRef<int> OuterMask;
2151 if (!match(&I,
2152 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2153 return false;
2154
2155 // Don't introduce poison into div/rem.
2156 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2157 return false;
2158
2159 Value *Op00, *Op01, *Op10, *Op11;
2160 ArrayRef<int> Mask0, Mask1;
2161 bool Match0 =
2162 match(BinOp->getOperand(0),
2163 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2164 bool Match1 =
2165 match(BinOp->getOperand(1),
2166 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2167 if (!Match0 && !Match1)
2168 return false;
2169
2170 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2171 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2172 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2173 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2174
2175 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2176 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2177 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2178 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2179 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2180 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2181 return false;
2182
2183 unsigned NumSrcElts = BinOpTy->getNumElements();
2184
2185 // Don't accept shuffles that reference the second operand in
2186 // div/rem or if it's an undef arg.
2187 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2188 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2189 return false;
2190
2191 // Merge outer / inner (or identity if no match) shuffles.
2192 SmallVector<int> NewMask0, NewMask1;
2193 for (int M : OuterMask) {
2194 if (M < 0 || M >= (int)NumSrcElts) {
2195 NewMask0.push_back(PoisonMaskElem);
2196 NewMask1.push_back(PoisonMaskElem);
2197 } else {
2198 NewMask0.push_back(Match0 ? Mask0[M] : M);
2199 NewMask1.push_back(Match1 ? Mask1[M] : M);
2200 }
2201 }
2202
2203 unsigned NumOpElts = Op0Ty->getNumElements();
2204 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2205 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2206 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2207 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2208 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2209 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2210
2211 // Try to merge shuffles across the binop if the new shuffles are not costly.
2212 InstructionCost OldCost =
2213 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2214 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2215 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2216 if (Match0)
2217 OldCost += TTI.getShuffleCost(
2218 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2219 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2220 if (Match1)
2221 OldCost += TTI.getShuffleCost(
2222 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2223 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2224
2225 InstructionCost NewCost =
2226 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2227
2228 if (!IsIdentity0)
2229 NewCost +=
2230 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2231 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2232 if (!IsIdentity1)
2233 NewCost +=
2234 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2235 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2236
2237 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2238 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2239 << "\n");
2240
2241 // If costs are equal, still fold as we reduce instruction count.
2242 if (NewCost > OldCost)
2243 return false;
2244
2245 Value *LHS =
2246 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2247 Value *RHS =
2248 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2249 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2250
2251 // Intersect flags from the old binops.
2252 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2253 NewInst->copyIRFlags(BinOp);
2254
2255 Worklist.pushValue(LHS);
2256 Worklist.pushValue(RHS);
2257 replaceValue(I, *NewBO);
2258 return true;
2259}
2260
2261/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2262/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
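/// For example (illustrative sketch, when the cost model agrees):
///   %b0 = add <2 x i32> %x, %y
///   %b1 = add <2 x i32> %z, %w
///   %r  = shufflevector <2 x i32> %b0, <2 x i32> %b1, <2 x i32> <i32 0, i32 2>
/// -->
///   %s0 = shufflevector <2 x i32> %x, <2 x i32> %z, <2 x i32> <i32 0, i32 2>
///   %s1 = shufflevector <2 x i32> %y, <2 x i32> %w, <2 x i32> <i32 0, i32 2>
///   %r  = add <2 x i32> %s0, %s1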
2263bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2264 ArrayRef<int> OldMask;
2265 Instruction *LHS, *RHS;
2266 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2267 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2268 return false;
2269
2270 // TODO: Add support for addlike etc.
2271 if (LHS->getOpcode() != RHS->getOpcode())
2272 return false;
2273
2274 Value *X, *Y, *Z, *W;
2275 bool IsCommutative = false;
2276 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2277 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2278 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2279 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2280 auto *BO = cast<BinaryOperator>(LHS);
2281 // Don't introduce poison into div/rem.
2282 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2283 return false;
2284 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2285 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2286 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2287 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2288 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2289 } else
2290 return false;
2291
2292 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2293 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2294 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2295 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2296 return false;
2297
2298 unsigned NumSrcElts = BinOpTy->getNumElements();
2299
2300 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2301 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2302 std::swap(X, Y);
2303
2304 auto ConvertToUnary = [NumSrcElts](int &M) {
2305 if (M >= (int)NumSrcElts)
2306 M -= NumSrcElts;
2307 };
2308
2309 SmallVector<int> NewMask0(OldMask);
2310 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2311 if (X == Z) {
2312 llvm::for_each(NewMask0, ConvertToUnary);
2313 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2314 Z = PoisonValue::get(BinOpTy);
2315 }
2316
2317 SmallVector<int> NewMask1(OldMask);
2318 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2319 if (Y == W) {
2320 llvm::for_each(NewMask1, ConvertToUnary);
2321 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2322 W = PoisonValue::get(BinOpTy);
2323 }
2324
2325 // Try to replace a binop with a shuffle if the shuffle is not costly.
2326 InstructionCost OldCost =
2327 TTI.getInstructionCost(LHS, CostKind) +
2328 TTI.getInstructionCost(RHS, CostKind) +
2329 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2330 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2331 &I);
2332
2333 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2334 // where one use shuffles have gotten split across the binop/cmp. These
2335 // often allow a major reduction in total cost that wouldn't happen as
2336 // individual folds.
2337 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2338 TTI::TargetCostKind CostKind) -> bool {
2339 Value *InnerOp;
2340 ArrayRef<int> InnerMask;
2341 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2342 m_Mask(InnerMask)))) &&
2343 InnerOp->getType() == Op->getType() &&
2344 all_of(InnerMask,
2345 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2346 for (int &M : Mask)
2347 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2348 M = InnerMask[M - Offset];
2349 M = 0 <= M ? M + Offset : M;
2350 }
2351 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2352 Op = InnerOp;
2353 return true;
2354 }
2355 return false;
2356 };
2357 bool ReducedInstCount = false;
2358 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2359 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2360 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2361 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2362
2363 auto *ShuffleCmpTy =
2364 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2365 InstructionCost NewCost =
2366 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2367 nullptr, {X, Z}) +
2368 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2369 nullptr, {Y, W});
2370
2371 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2372 NewCost +=
2373 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2374 } else {
2375 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2376 ShuffleDstTy, PredLHS, CostKind);
2377 }
2378
2379 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2380 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2381 << "\n");
2382
2383 // If either shuffle will constant fold away, then fold for the same cost as
2384 // we will reduce the instruction count.
2385 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2386 (isa<Constant>(Y) && isa<Constant>(W));
2387 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2388 return false;
2389
2390 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2391 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2392 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2393 ? Builder.CreateBinOp(
2394 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2395 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2396
2397 // Intersect flags from the old binops.
2398 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2399 NewInst->copyIRFlags(LHS);
2400 NewInst->andIRFlags(RHS);
2401 }
2402
2403 Worklist.pushValue(Shuf0);
2404 Worklist.pushValue(Shuf1);
2405 replaceValue(I, *NewBO);
2406 return true;
2407}
2408
2409/// Try to convert,
2410/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2411/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2412bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2413 ArrayRef<int> Mask;
2414 Value *C1, *T1, *F1, *C2, *T2, *F2;
2415 if (!match(&I, m_Shuffle(
2416 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2417 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2418 m_Mask(Mask))))
2419 return false;
2420
2421 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2422 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2423 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2424 return false;
2425
2426 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2427 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2428 // SelectInsts must have the same FMF.
2429 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2430 ((SI0FOp != nullptr) &&
2431 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2432 return false;
2433
2434 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2435 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2436 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2437 auto SelOp = Instruction::Select;
2438 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2439 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2440 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2441 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2442 OldCost +=
2443 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2444 {I.getOperand(0), I.getOperand(1)}, &I);
2445
2446 InstructionCost NewCost = TTI.getShuffleCost(
2447 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2448 Mask, CostKind, 0, nullptr, {C1, C2});
2449 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2450 nullptr, {T1, T2});
2451 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2452 nullptr, {F1, F2});
2453 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2454 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2455 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2456 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2457
2458 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2459 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2460 << "\n");
2461 if (NewCost > OldCost)
2462 return false;
2463
2464 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2465 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2466 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2467 Value *NewSel;
2468 // We presuppose that the SelectInsts have the same FMF.
2469 if (SI0FOp)
2470 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2471 SI0FOp->getFastMathFlags());
2472 else
2473 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2474
2475 Worklist.pushValue(ShuffleCmp);
2476 Worklist.pushValue(ShuffleTrue);
2477 Worklist.pushValue(ShuffleFalse);
2478 replaceValue(I, *NewSel);
2479 return true;
2480}
2481
2482/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2483/// into "castop (shuffle)".
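/// For example (illustrative sketch):
///   %c0 = sext <2 x i16> %a to <2 x i32>
///   %c1 = sext <2 x i16> %b to <2 x i32>
///   %r  = shufflevector <2 x i32> %c0, <2 x i32> %c1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
/// -->
///   %s  = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
///   %r  = sext <4 x i16> %s to <4 x i32>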
2484bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2485 Value *V0, *V1;
2486 ArrayRef<int> OldMask;
2487 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2488 return false;
2489
2490 auto *C0 = dyn_cast<CastInst>(V0);
2491 auto *C1 = dyn_cast<CastInst>(V1);
2492 if (!C0 || !C1)
2493 return false;
2494
2495 Instruction::CastOps Opcode = C0->getOpcode();
2496 if (C0->getSrcTy() != C1->getSrcTy())
2497 return false;
2498
2499 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2500 if (Opcode != C1->getOpcode()) {
2501 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2502 Opcode = Instruction::SExt;
2503 else
2504 return false;
2505 }
2506
2507 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2508 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2509 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2510 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2511 return false;
2512
2513 unsigned NumSrcElts = CastSrcTy->getNumElements();
2514 unsigned NumDstElts = CastDstTy->getNumElements();
2515 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2516 "Only bitcasts expected to alter src/dst element counts");
2517
2518 // Check for bitcasts where neither element count evenly divides the other,
2519 // e.g. <32 x i40> -> <40 x i32>
2520 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2521 (NumDstElts % NumSrcElts) != 0)
2522 return false;
2523
2524 SmallVector<int, 16> NewMask;
2525 if (NumSrcElts >= NumDstElts) {
2526 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2527 // always be expanded to the equivalent form choosing narrower elements.
2528 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2529 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2530 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2531 } else {
2532 // The bitcast is from narrow elements to wide elements. The shuffle mask
2533 // must choose consecutive elements to allow casting first.
2534 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2535 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2536 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2537 return false;
2538 }
2539
2540 auto *NewShuffleDstTy =
2541 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2542
2543 // Try to replace a castop with a shuffle if the shuffle is not costly.
2544 InstructionCost CostC0 =
2545 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2546 TTI::CastContextHint::None, CostKind);
2547 InstructionCost CostC1 =
2548 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2549 TTI::CastContextHint::None, CostKind);
2550 InstructionCost OldCost = CostC0 + CostC1;
2551 OldCost +=
2552 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy,
2553 CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
2554
2555 InstructionCost NewCost =
2556 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
2557 CastSrcTy, NewMask, CostKind);
2558 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2559 TTI::CastContextHint::None, CostKind);
2560 if (!C0->hasOneUse())
2561 NewCost += CostC0;
2562 if (!C1->hasOneUse())
2563 NewCost += CostC1;
2564
2565 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2566 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2567 << "\n");
2568 if (NewCost > OldCost)
2569 return false;
2570
2571 Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
2572 C1->getOperand(0), NewMask);
2573 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2574
2575 // Intersect flags from the old casts.
2576 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2577 NewInst->copyIRFlags(C0);
2578 NewInst->andIRFlags(C1);
2579 }
2580
2581 Worklist.pushValue(Shuf);
2582 replaceValue(I, *Cast);
2583 return true;
2584}
2585
2586/// Try to convert any of:
2587/// "shuffle (shuffle x, y), (shuffle y, x)"
2588/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2589/// "shuffle (shuffle x, undef), y"
2590/// "shuffle x, (shuffle y, undef)"
2591/// into "shuffle x, y".
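/// For example (illustrative sketch):
///   %s0 = shufflevector <2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 0, i32 2>
///   %s1 = shufflevector <2 x i32> %y, <2 x i32> %x, <2 x i32> <i32 1, i32 3>
///   %r  = shufflevector <2 x i32> %s0, <2 x i32> %s1, <2 x i32> <i32 0, i32 2>
/// folds to "shufflevector <2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 0, i32 3>".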
2592bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2593 ArrayRef<int> OuterMask;
2594 Value *OuterV0, *OuterV1;
2595 if (!match(&I,
2596 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2597 return false;
2598
2599 ArrayRef<int> InnerMask0, InnerMask1;
2600 Value *X0, *X1, *Y0, *Y1;
2601 bool Match0 =
2602 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2603 bool Match1 =
2604 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2605 if (!Match0 && !Match1)
2606 return false;
2607
2608 // If the outer shuffle is a permute, then create a fake inner all-poison
2609 // shuffle. This is easier than accounting for length-changing shuffles below.
2610 SmallVector<int, 16> PoisonMask1;
2611 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2612 X1 = X0;
2613 Y1 = Y0;
2614 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2615 InnerMask1 = PoisonMask1;
2616 Match1 = true; // fake match
2617 }
2618
2619 X0 = Match0 ? X0 : OuterV0;
2620 Y0 = Match0 ? Y0 : OuterV0;
2621 X1 = Match1 ? X1 : OuterV1;
2622 Y1 = Match1 ? Y1 : OuterV1;
2623 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2624 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2625 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2626 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2627 X0->getType() != X1->getType())
2628 return false;
2629
2630 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2631 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2632
2633 // Attempt to merge the shuffles, matching up to 2 source operands.
2634 // Replace an index into a poison arg with PoisonMaskElem.
2635 // Bail if either inner mask references an undef arg.
2636 SmallVector<int, 16> NewMask(OuterMask);
2637 Value *NewX = nullptr, *NewY = nullptr;
2638 for (int &M : NewMask) {
2639 Value *Src = nullptr;
2640 if (0 <= M && M < (int)NumImmElts) {
2641 Src = OuterV0;
2642 if (Match0) {
2643 M = InnerMask0[M];
2644 Src = M >= (int)NumSrcElts ? Y0 : X0;
2645 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2646 }
2647 } else if (M >= (int)NumImmElts) {
2648 Src = OuterV1;
2649 M -= NumImmElts;
2650 if (Match1) {
2651 M = InnerMask1[M];
2652 Src = M >= (int)NumSrcElts ? Y1 : X1;
2653 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2654 }
2655 }
2656 if (Src && M != PoisonMaskElem) {
2657 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2658 if (isa<UndefValue>(Src)) {
2659 // We've referenced an undef element - if it's poison, update the shuffle
2660 // mask, else bail.
2661 if (!isa<PoisonValue>(Src))
2662 return false;
2663 M = PoisonMaskElem;
2664 continue;
2665 }
2666 if (!NewX || NewX == Src) {
2667 NewX = Src;
2668 continue;
2669 }
2670 if (!NewY || NewY == Src) {
2671 M += NumSrcElts;
2672 NewY = Src;
2673 continue;
2674 }
2675 return false;
2676 }
2677 }
2678
2679 if (!NewX)
2680 return PoisonValue::get(ShuffleDstTy);
2681 if (!NewY)
2682 NewY = PoisonValue::get(ShuffleSrcTy);
2683
2684 // Have we folded to an Identity shuffle?
2685 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2686 replaceValue(I, *NewX);
2687 return true;
2688 }
2689
2690 // Try to merge the shuffles if the new shuffle is not costly.
2691 InstructionCost InnerCost0 = 0;
2692 if (Match0)
2693 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2694
2695 InstructionCost InnerCost1 = 0;
2696 if (Match1)
2697 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2698
2698 InstructionCost OuterCost =
2699 TTI.getInstructionCost(cast<User>(&I), CostKind);
2700
2701 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2702
2703 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2704 TargetTransformInfo::ShuffleKind SK =
2705 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2706 : TargetTransformInfo::SK_PermuteTwoSrc;
2707 InstructionCost NewCost =
2708 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2709 nullptr, {NewX, NewY});
2710 if (!OuterV0->hasOneUse())
2711 NewCost += InnerCost0;
2712 if (!OuterV1->hasOneUse())
2713 NewCost += InnerCost1;
2714
2715 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2716 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2717 << "\n");
2718 if (NewCost > OldCost)
2719 return false;
2720
2721 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2722 replaceValue(I, *Shuf);
2723 return true;
2724}
2725
2726/// Try to convert
2727/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
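/// For example (illustrative sketch, using llvm.smax):
///   %i0 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %a, <2 x i32> %b)
///   %i1 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %c, <2 x i32> %d)
///   %r  = shufflevector <2 x i32> %i0, <2 x i32> %i1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
/// -->
///   %s0 = shufflevector <2 x i32> %a, <2 x i32> %c, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
///   %s1 = shufflevector <2 x i32> %b, <2 x i32> %d, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
///   %r  = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %s0, <4 x i32> %s1)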
2728bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2729 Value *V0, *V1;
2730 ArrayRef<int> OldMask;
2731 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2732 m_Mask(OldMask))))
2733 return false;
2734
2735 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2736 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2737 if (!II0 || !II1)
2738 return false;
2739
2740 Intrinsic::ID IID = II0->getIntrinsicID();
2741 if (IID != II1->getIntrinsicID())
2742 return false;
2743
2744 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2745 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2746 if (!ShuffleDstTy || !II0Ty)
2747 return false;
2748
2749 if (!isTriviallyVectorizable(IID))
2750 return false;
2751
2752 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2753 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
2754 II0->getArgOperand(I) != II1->getArgOperand(I))
2755 return false;
2756
2757 InstructionCost OldCost =
2758 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2759 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2760 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2761 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2762
2763 SmallVector<Type *> NewArgsTy;
2764 InstructionCost NewCost = 0;
2765 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2766 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2767 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2768 } else {
2769 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2770 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2771 ShuffleDstTy->getNumElements());
2772 NewArgsTy.push_back(ArgTy);
2773 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2774 ArgTy, VecTy, OldMask, CostKind);
2775 }
2776 }
2777 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2778 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2779
2780 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2781 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2782 << "\n");
2783
2784 if (NewCost > OldCost)
2785 return false;
2786
2787 SmallVector<Value *> NewArgs;
2788 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2789 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2790 NewArgs.push_back(II0->getArgOperand(I));
2791 } else {
2792 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2793 II1->getArgOperand(I), OldMask);
2794 NewArgs.push_back(Shuf);
2795 Worklist.pushValue(Shuf);
2796 }
2797 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2798
2799 // Intersect flags from the old intrinsics.
2800 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2801 NewInst->copyIRFlags(II0);
2802 NewInst->andIRFlags(II1);
2803 }
2804
2805 replaceValue(I, *NewIntrinsic);
2806 return true;
2807}
2808
2809using InstLane = std::pair<Use *, int>;
2810
2811static InstLane lookThroughShuffles(Use *U, int Lane) {
2812 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2813 unsigned NumElts =
2814 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2815 int M = SV->getMaskValue(Lane);
2816 if (M < 0)
2817 return {nullptr, PoisonMaskElem};
2818 if (static_cast<unsigned>(M) < NumElts) {
2819 U = &SV->getOperandUse(0);
2820 Lane = M;
2821 } else {
2822 U = &SV->getOperandUse(1);
2823 Lane = M - NumElts;
2824 }
2825 }
2826 return InstLane{U, Lane};
2827}
2828
2829static SmallVector<InstLane>
2830generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
2831 SmallVector<InstLane> NItem;
2832 for (InstLane IL : Item) {
2833 auto [U, Lane] = IL;
2834 InstLane OpLane =
2835 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2836 Lane)
2837 : InstLane{nullptr, PoisonMaskElem};
2838 NItem.emplace_back(OpLane);
2839 }
2840 return NItem;
2841}
2842
2843/// Detect concat of multiple values into a vector
2844static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
2845 const TargetTransformInfo &TTI) {
2846 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
2847 unsigned NumElts = Ty->getNumElements();
2848 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
2849 return false;
2850
2851 // Check that the concat is free, usually meaning that the type will be split
2852 // during legalization.
2853 SmallVector<int, 16> ConcatMask(NumElts * 2);
2854 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2855 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
2856 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
2857 Ty, ConcatMask, CostKind) != 0)
2858 return false;
2859
2860 unsigned NumSlices = Item.size() / NumElts;
2861 // Currently we generate a tree of shuffles for the concats, which limits us
2862 // to a power2.
2863 if (!isPowerOf2_32(NumSlices))
2864 return false;
2865 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
2866 Use *SliceV = Item[Slice * NumElts].first;
2867 if (!SliceV || SliceV->get()->getType() != Ty)
2868 return false;
2869 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
2870 auto [V, Lane] = Item[Slice * NumElts + Elt];
2871 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
2872 return false;
2873 }
2874 }
2875 return true;
2876}
2877
2878static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
2879 const SmallPtrSet<Use *, 4> &IdentityLeafs,
2880 const SmallPtrSet<Use *, 4> &SplatLeafs,
2881 const SmallPtrSet<Use *, 4> &ConcatLeafs,
2882 IRBuilderBase &Builder,
2883 const TargetTransformInfo *TTI) {
2884 auto [FrontU, FrontLane] = Item.front();
2885
2886 if (IdentityLeafs.contains(FrontU)) {
2887 return FrontU->get();
2888 }
2889 if (SplatLeafs.contains(FrontU)) {
2890 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
2891 return Builder.CreateShuffleVector(FrontU->get(), Mask);
2892 }
2893 if (ConcatLeafs.contains(FrontU)) {
2894 unsigned NumElts =
2895 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
2896 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
2897 for (unsigned S = 0; S < Values.size(); ++S)
2898 Values[S] = Item[S * NumElts].first->get();
2899
2900 while (Values.size() > 1) {
2901 NumElts *= 2;
2902 SmallVector<int, 16> Mask(NumElts, 0);
2903 std::iota(Mask.begin(), Mask.end(), 0);
2904 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
2905 for (unsigned S = 0; S < NewValues.size(); ++S)
2906 NewValues[S] =
2907 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
2908 Values = NewValues;
2909 }
2910 return Values[0];
2911 }
2912
2913 auto *I = cast<Instruction>(FrontU->get());
2914 auto *II = dyn_cast<IntrinsicInst>(I);
2915 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
2916 SmallVector<Value *> Ops(NumOps);
2917 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
2918 if (II &&
2919 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
2920 Ops[Idx] = II->getOperand(Idx);
2921 continue;
2922 }
2923 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
2924 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
2925 Builder, TTI);
2926 }
2927
2928 SmallVector<Value *, 8> ValueList;
2929 for (const auto &Lane : Item)
2930 if (Lane.first)
2931 ValueList.push_back(Lane.first->get());
2932
2933 Type *DstTy =
2934 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
2935 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
2936 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
2937 Ops[0], Ops[1]);
2938 propagateIRFlags(Value, ValueList);
2939 return Value;
2940 }
2941 if (auto *CI = dyn_cast<CmpInst>(I)) {
2942 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
2943 propagateIRFlags(Value, ValueList);
2944 return Value;
2945 }
2946 if (auto *SI = dyn_cast<SelectInst>(I)) {
2947 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
2948 propagateIRFlags(Value, ValueList);
2949 return Value;
2950 }
2951 if (auto *CI = dyn_cast<CastInst>(I)) {
2952 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
2953 propagateIRFlags(Value, ValueList);
2954 return Value;
2955 }
2956 if (II) {
2957 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
2958 propagateIRFlags(Value, ValueList);
2959 return Value;
2960 }
2961 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
2962 auto *Value =
2963 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
2964 propagateIRFlags(Value, ValueList);
2965 return Value;
2966}
2967
2968// Starting from a shuffle, look up through operands tracking the shuffled index
2969// of each lane. If we can simplify away the shuffles to identities then
2970// do so.
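// For example (illustrative): if both operands of an add are lane-reversing
// shuffles of %x and %y, and the add is only used by another lane-reversing
// shuffle, every lane traces back to its original position, so the three
// shuffles are dropped and the tree is regenerated as a plain "add %x, %y".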
2971bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
2972 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
2973 if (!Ty || I.use_empty())
2974 return false;
2975
2976 SmallVector<InstLane> Start(Ty->getNumElements());
2977 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
2978 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
2979
2981 Worklist.push_back(Start);
2982 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
2983 unsigned NumVisited = 0;
2984
2985 while (!Worklist.empty()) {
2986 if (++NumVisited > MaxInstrsToScan)
2987 return false;
2988
2989 SmallVector<InstLane> Item = Worklist.pop_back_val();
2990 auto [FrontU, FrontLane] = Item.front();
2991
2992 // If we found an undef first lane then bail out to keep things simple.
2993 if (!FrontU)
2994 return false;
2995
2996 // Helper to peek through bitcasts to the same value.
2997 auto IsEquiv = [&](Value *X, Value *Y) {
2998 return X->getType() == Y->getType() &&
3000 };
3001
3002 // Look for an identity value.
3003 if (FrontLane == 0 &&
3004 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3005 Ty->getNumElements() &&
3006 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3007 Value *FrontV = Item.front().first->get();
3008 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3009 E.value().second == (int)E.index());
3010 })) {
3011 IdentityLeafs.insert(FrontU);
3012 continue;
3013 }
3014 // Look for constants, for the moment only supporting constant splats.
3015 if (auto *C = dyn_cast<Constant>(FrontU);
3016 C && C->getSplatValue() &&
3017 all_of(drop_begin(Item), [Item](InstLane &IL) {
3018 Value *FrontV = Item.front().first->get();
3019 Use *U = IL.first;
3020 return !U || (isa<Constant>(U->get()) &&
3021 cast<Constant>(U->get())->getSplatValue() ==
3022 cast<Constant>(FrontV)->getSplatValue());
3023 })) {
3024 SplatLeafs.insert(FrontU);
3025 continue;
3026 }
3027 // Look for a splat value.
3028 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3029 auto [FrontU, FrontLane] = Item.front();
3030 auto [U, Lane] = IL;
3031 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3032 })) {
3033 SplatLeafs.insert(FrontU);
3034 continue;
3035 }
3036
3037 // We need each element to be the same type of value, and check that each
3038 // element has a single use.
3039 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3040 Value *FrontV = Item.front().first->get();
3041 if (!IL.first)
3042 return true;
3043 Value *V = IL.first->get();
3044 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3045 return false;
3046 if (V->getValueID() != FrontV->getValueID())
3047 return false;
3048 if (auto *CI = dyn_cast<CmpInst>(V))
3049 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3050 return false;
3051 if (auto *CI = dyn_cast<CastInst>(V))
3052 if (CI->getSrcTy()->getScalarType() !=
3053 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3054 return false;
3055 if (auto *SI = dyn_cast<SelectInst>(V))
3056 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3057 SI->getOperand(0)->getType() !=
3058 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3059 return false;
3060 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3061 return false;
3062 auto *II = dyn_cast<IntrinsicInst>(V);
3063 return !II || (isa<IntrinsicInst>(FrontV) &&
3064 II->getIntrinsicID() ==
3065 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3066 !II->hasOperandBundles());
3067 };
3068 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3069 // Check the operator is one that we support.
3070 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3071 // We exclude div/rem in case they hit UB from poison lanes.
3072 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3073 BO && BO->isIntDivRem())
3074 return false;
3075 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3076 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3077 continue;
3078 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3079 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3080 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3081 continue;
3082 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3083 // TODO: Handle vector widening/narrowing bitcasts.
3084 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3085 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3086 if (DstTy && SrcTy &&
3087 SrcTy->getNumElements() == DstTy->getNumElements()) {
3088 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3089 continue;
3090 }
3091 } else if (isa<SelectInst>(FrontU)) {
3092 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3093 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3094 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3095 continue;
3096 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3097 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3098 !II->hasOperandBundles()) {
3099 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3100 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3101 &TTI)) {
3102 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3103 Value *FrontV = Item.front().first->get();
3104 Use *U = IL.first;
3105 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3106 cast<Instruction>(FrontV)->getOperand(Op));
3107 }))
3108 return false;
3109 continue;
3110 }
3111 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3112 }
3113 continue;
3114 }
3115 }
3116
3117 if (isFreeConcat(Item, CostKind, TTI)) {
3118 ConcatLeafs.insert(FrontU);
3119 continue;
3120 }
3121
3122 return false;
3123 }
3124
3125 if (NumVisited <= 1)
3126 return false;
3127
3128 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3129
3130 // If we got this far, we know the shuffles are superfluous and can be
3131 // removed. Scan through again and generate the new tree of instructions.
3132 Builder.SetInsertPoint(&I);
3133 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3134 ConcatLeafs, Builder, &TTI);
3135 replaceValue(I, *V);
3136 return true;
3137}
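// Illustrative IR for foldShuffleToIdentity (an assumed example, not from the
// original source): every lane of %r traces back to the corresponding lane of
// %a and %b, so the half-width shuffles and adds are superfluous and the whole
// tree can be rebuilt as a single full-width add.
//
//   %a.lo = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %a.hi = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %b.lo = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %s.lo = add <2 x i32> %a.lo, %b.lo
//   %s.hi = add <2 x i32> %a.hi, %b.hi
//   %r    = shufflevector <2 x i32> %s.lo, <2 x i32> %s.hi,
//                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   -->
//   %r    = add <4 x i32> %a, %b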
3138
3139/// Given a commutative reduction, the order of the input lanes does not alter
3140/// the results. We can use this to remove certain shuffles feeding the
3141/// reduction, removing the need to shuffle at all.
3142bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3143 auto *II = dyn_cast<IntrinsicInst>(&I);
3144 if (!II)
3145 return false;
3146 switch (II->getIntrinsicID()) {
3147 case Intrinsic::vector_reduce_add:
3148 case Intrinsic::vector_reduce_mul:
3149 case Intrinsic::vector_reduce_and:
3150 case Intrinsic::vector_reduce_or:
3151 case Intrinsic::vector_reduce_xor:
3152 case Intrinsic::vector_reduce_smin:
3153 case Intrinsic::vector_reduce_smax:
3154 case Intrinsic::vector_reduce_umin:
3155 case Intrinsic::vector_reduce_umax:
3156 break;
3157 default:
3158 return false;
3159 }
3160
3161 // Find all the inputs when looking through operations that do not alter the
3162 // lane order (binops, for example). Currently we look for a single shuffle,
3163 // and can ignore splat values.
3164 std::queue<Value *> Worklist;
3165 SmallPtrSet<Value *, 4> Visited;
3166 ShuffleVectorInst *Shuffle = nullptr;
3167 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3168 Worklist.push(Op);
3169
3170 while (!Worklist.empty()) {
3171 Value *CV = Worklist.front();
3172 Worklist.pop();
3173 if (Visited.contains(CV))
3174 continue;
3175
3176 // Splats don't change the order, so can be safely ignored.
3177 if (isSplatValue(CV))
3178 continue;
3179
3180 Visited.insert(CV);
3181
3182 if (auto *CI = dyn_cast<Instruction>(CV)) {
3183 if (CI->isBinaryOp()) {
3184 for (auto *Op : CI->operand_values())
3185 Worklist.push(Op);
3186 continue;
3187 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3188 if (Shuffle && Shuffle != SV)
3189 return false;
3190 Shuffle = SV;
3191 continue;
3192 }
3193 }
3194
3195 // Anything else is currently an unknown node.
3196 return false;
3197 }
3198
3199 if (!Shuffle)
3200 return false;
3201
3202 // Check all uses of the binary ops and shuffles are also included in the
3203 // lane-invariant operations (Visited should be the list of lanewise
3204 // instructions, including the shuffle that we found).
3205 for (auto *V : Visited)
3206 for (auto *U : V->users())
3207 if (!Visited.contains(U) && U != &I)
3208 return false;
3209
3210 FixedVectorType *VecType =
3211 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3212 if (!VecType)
3213 return false;
3214 FixedVectorType *ShuffleInputType =
3215 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3216 if (!ShuffleInputType)
3217 return false;
3218 unsigned NumInputElts = ShuffleInputType->getNumElements();
3219
3220 // Find the mask from sorting the lanes into order. This is most likely to
3221 // become an identity or concat mask. Undef elements are pushed to the end.
3222 SmallVector<int> ConcatMask;
3223 Shuffle->getShuffleMask(ConcatMask);
3224 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3225 bool UsesSecondVec =
3226 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3227
3228 InstructionCost OldCost = TTI.getShuffleCost(
3229 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3230 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3231 InstructionCost NewCost = TTI.getShuffleCost(
3232 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3233 ShuffleInputType, ConcatMask, CostKind);
3234
3235 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3236 << "\n");
3237 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3238 << "\n");
3239 bool MadeChanges = false;
3240 if (NewCost < OldCost) {
3241 Builder.SetInsertPoint(Shuffle);
3242 Value *NewShuffle = Builder.CreateShuffleVector(
3243 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3244 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3245 replaceValue(*Shuffle, *NewShuffle);
3246 return true;
3247 }
3248
3249 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3250 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3251 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3252 return MadeChanges;
3253}
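// Assumed illustration for foldShuffleFromReductions: the reduction is
// commutative across lanes, so the permuting shuffle can be rewritten with a
// sorted (here identity) mask and then folded away.
//
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
//                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
//   -->
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)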
3254
3255/// For a given chain of patterns of the following form:
3256///
3257/// ```
3258/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3259///
3260/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3261/// ty1> %1)
3262/// OR
3263/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3264///
3265/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3266/// ...
3267/// ...
3268/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3269/// 3), <n x ty1> %(i - 2)
3270/// OR
3271/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3272///
3273/// %(i) = extractelement <n x ty1> %(i - 1), 0
3274/// ```
3275///
3276/// Where:
3277/// `mask` follows a partition pattern:
3278///
3279/// Ex:
3280/// [n = 8, p = poison]
3281///
3282/// 4 5 6 7 | p p p p
3283/// 2 3 | p p p p p p
3284/// 1 | p p p p p p p
3285///
3286/// For powers of 2, there's a consistent pattern, but for other cases
3287/// the parity of the current half value at each step decides the
3288/// next partition half (see `ExpectedParityMask` for more logical details
3289/// in generalising this).
3290///
3291/// Ex:
3292/// [n = 6]
3293///
3294/// 3 4 5 | p p p
3295/// 1 2 | p p p p
3296/// 1 | p p p p p
3297bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3298 // Going bottom-up for the pattern.
3299 std::queue<Value *> InstWorklist;
3300 InstructionCost OrigCost = 0;
3301
3302 // Common instruction operation after each shuffle op.
3303 std::optional<unsigned int> CommonCallOp = std::nullopt;
3304 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3305
3306 bool IsFirstCallOrBinInst = true;
3307 bool ShouldBeCallOrBinInst = true;
3308
3309 // This stores the last used instructions for shuffle/common op.
3310 //
3311 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3312 // instructions from either shuffle/common op.
3313 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3314
3315 Value *VecOpEE;
3316 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3317 return false;
3318
3319 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3320 if (!FVT)
3321 return false;
3322
3323 int64_t VecSize = FVT->getNumElements();
3324 if (VecSize < 2)
3325 return false;
3326
3327 // Number of levels would be ~log2(n), considering we always partition
3328 // by half for this fold pattern.
3329 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3330 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3331
3332 // This is how we generalise for all element sizes.
3333 // At each step, if vector size is odd, we need non-poison
3334 // values to cover the dominant half so we don't miss out on any element.
3335 //
3336 // This mask will help us retrieve this as we go from bottom to top:
3337 //
3338 // Mask Set -> N = N * 2 - 1
3339 // Mask Unset -> N = N * 2
3340 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3341 Cur = (Cur + 1) / 2, --Mask) {
3342 if (Cur & 1)
3343 ExpectedParityMask |= (1ll << Mask);
3344 }
3345
3346 InstWorklist.push(VecOpEE);
3347
3348 while (!InstWorklist.empty()) {
3349 Value *CI = InstWorklist.front();
3350 InstWorklist.pop();
3351
3352 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3353 if (!ShouldBeCallOrBinInst)
3354 return false;
3355
3356 if (!IsFirstCallOrBinInst &&
3357 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3358 return false;
3359
3360 // For the first found call/bin op, the vector has to come from the
3361 // extract element op.
3362 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3363 return false;
3364 IsFirstCallOrBinInst = false;
3365
3366 if (!CommonCallOp)
3367 CommonCallOp = II->getIntrinsicID();
3368 if (II->getIntrinsicID() != *CommonCallOp)
3369 return false;
3370
3371 switch (II->getIntrinsicID()) {
3372 case Intrinsic::umin:
3373 case Intrinsic::umax:
3374 case Intrinsic::smin:
3375 case Intrinsic::smax: {
3376 auto *Op0 = II->getOperand(0);
3377 auto *Op1 = II->getOperand(1);
3378 PrevVecV[0] = Op0;
3379 PrevVecV[1] = Op1;
3380 break;
3381 }
3382 default:
3383 return false;
3384 }
3385 ShouldBeCallOrBinInst ^= 1;
3386
3387 IntrinsicCostAttributes ICA(
3388 *CommonCallOp, II->getType(),
3389 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3390 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3391
3392 // We may need a swap here since it can be (a, b) or (b, a)
3393 // and accordingly change as we go up.
3394 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3395 std::swap(PrevVecV[0], PrevVecV[1]);
3396 InstWorklist.push(PrevVecV[1]);
3397 InstWorklist.push(PrevVecV[0]);
3398 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3399 // Similar logic for bin ops.
3400
3401 if (!ShouldBeCallOrBinInst)
3402 return false;
3403
3404 if (!IsFirstCallOrBinInst &&
3405 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3406 return false;
3407
3408 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3409 return false;
3410 IsFirstCallOrBinInst = false;
3411
3412 if (!CommonBinOp)
3413 CommonBinOp = BinOp->getOpcode();
3414
3415 if (BinOp->getOpcode() != *CommonBinOp)
3416 return false;
3417
3418 switch (*CommonBinOp) {
3419 case BinaryOperator::Add:
3420 case BinaryOperator::Mul:
3421 case BinaryOperator::Or:
3422 case BinaryOperator::And:
3423 case BinaryOperator::Xor: {
3424 auto *Op0 = BinOp->getOperand(0);
3425 auto *Op1 = BinOp->getOperand(1);
3426 PrevVecV[0] = Op0;
3427 PrevVecV[1] = Op1;
3428 break;
3429 }
3430 default:
3431 return false;
3432 }
3433 ShouldBeCallOrBinInst ^= 1;
3434
3435 OrigCost +=
3436 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3437
3438 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3439 std::swap(PrevVecV[0], PrevVecV[1]);
3440 InstWorklist.push(PrevVecV[1]);
3441 InstWorklist.push(PrevVecV[0]);
3442 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3443 // We shouldn't have any null values in the previous vectors,
3444 // if so, there was a mismatch in the pattern.
3445 if (ShouldBeCallOrBinInst ||
3446 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3447 return false;
3448
3449 if (SVInst != PrevVecV[1])
3450 return false;
3451
3452 ArrayRef<int> CurMask;
3453 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3454 m_Mask(CurMask))))
3455 return false;
3456
3457 // Subtract the parity mask when checking the condition.
3458 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3459 if (Mask < ShuffleMaskHalf &&
3460 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3461 return false;
3462 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3463 return false;
3464 }
3465
3466 // Update mask values.
3467 ShuffleMaskHalf *= 2;
3468 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3469 ExpectedParityMask >>= 1;
3470
3471 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3472 SVInst->getType(), SVInst->getType(),
3473 CurMask, CostKind);
3474
3475 VisitedCnt += 1;
3476 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3477 break;
3478
3479 ShouldBeCallOrBinInst ^= 1;
3480 } else {
3481 return false;
3482 }
3483 }
3484
3485 // Pattern should end with a shuffle op.
3486 if (ShouldBeCallOrBinInst)
3487 return false;
3488
3489 assert(VecSize != -1 && "Expected Match for Vector Size");
3490
3491 Value *FinalVecV = PrevVecV[0];
3492 if (!FinalVecV)
3493 return false;
3494
3495 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3496
3497 Intrinsic::ID ReducedOp =
3498 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3499 : getReductionForBinop(*CommonBinOp));
3500 if (!ReducedOp)
3501 return false;
3502
3503 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3504 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3505
3506 if (NewCost >= OrigCost)
3507 return false;
3508
3509 auto *ReducedResult =
3510 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3511 replaceValue(I, *ReducedResult);
3512
3513 return true;
3514}
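// A minimal n = 4 instance of the chain handled above (assumed example):
//
//   %s0 = shufflevector <4 x i32> %v, <4 x i32> poison,
//                       <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
//   %m0 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %v, <4 x i32> %s0)
//   %s1 = shufflevector <4 x i32> %m0, <4 x i32> poison,
//                       <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
//   %m1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %m0, <4 x i32> %s1)
//   %r  = extractelement <4 x i32> %m1, i64 0
//   -->
//   %r  = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)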
3515
3516/// Determine if it's more efficient to fold:
3517/// reduce(trunc(x)) -> trunc(reduce(x)).
3518/// reduce(sext(x)) -> sext(reduce(x)).
3519/// reduce(zext(x)) -> zext(reduce(x)).
3520bool VectorCombine::foldCastFromReductions(Instruction &I) {
3521 auto *II = dyn_cast<IntrinsicInst>(&I);
3522 if (!II)
3523 return false;
3524
3525 bool TruncOnly = false;
3526 Intrinsic::ID IID = II->getIntrinsicID();
3527 switch (IID) {
3528 case Intrinsic::vector_reduce_add:
3529 case Intrinsic::vector_reduce_mul:
3530 TruncOnly = true;
3531 break;
3532 case Intrinsic::vector_reduce_and:
3533 case Intrinsic::vector_reduce_or:
3534 case Intrinsic::vector_reduce_xor:
3535 break;
3536 default:
3537 return false;
3538 }
3539
3540 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3541 Value *ReductionSrc = I.getOperand(0);
3542
3543 Value *Src;
3544 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3545 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3546 return false;
3547
3548 auto CastOpc =
3549 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3550
3551 auto *SrcTy = cast<VectorType>(Src->getType());
3552 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3553 Type *ResultTy = I.getType();
3554
3555 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3556 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3557 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3558 TTI::CastContextHint::None, CostKind,
3559 cast<CastInst>(ReductionSrc));
3560 InstructionCost NewCost =
3561 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3562 CostKind) +
3563 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3564 TTI::CastContextHint::None, CostKind);
3565
3566 if (OldCost <= NewCost || !NewCost.isValid())
3567 return false;
3568
3569 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3570 II->getIntrinsicID(), {Src});
3571 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3572 replaceValue(I, *NewCast);
3573 return true;
3574}
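// Assumed example of the cast-through-reduction fold above: the zero-extended
// high bits do not affect an AND reduction, so the reduction can be done at
// the narrow type and extended afterwards.
//
//   %e = zext <8 x i8> %x to <8 x i16>
//   %r = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %e)
//   -->
//   %n = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %x)
//   %r = zext i8 %n to i16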
3575
3576/// Returns true if this ShuffleVectorInst eventually feeds into a
3577/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3578/// chains of shuffles and binary operators (in any combination/order).
3579/// The search does not go deeper than the given Depth.
3580static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3581 constexpr unsigned MaxVisited = 32;
3582 SmallPtrSet<Instruction *, 8> Visited;
3583 SmallVector<Instruction *, 4> WorkList;
3584 bool FoundReduction = false;
3585
3586 WorkList.push_back(SVI);
3587 while (!WorkList.empty()) {
3588 Instruction *I = WorkList.pop_back_val();
3589 for (User *U : I->users()) {
3590 auto *UI = cast<Instruction>(U);
3591 if (!UI || !Visited.insert(UI).second)
3592 continue;
3593 if (Visited.size() > MaxVisited)
3594 return false;
3595 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3596 // More than one reduction reached
3597 if (FoundReduction)
3598 return false;
3599 switch (II->getIntrinsicID()) {
3600 case Intrinsic::vector_reduce_add:
3601 case Intrinsic::vector_reduce_mul:
3602 case Intrinsic::vector_reduce_and:
3603 case Intrinsic::vector_reduce_or:
3604 case Intrinsic::vector_reduce_xor:
3605 case Intrinsic::vector_reduce_smin:
3606 case Intrinsic::vector_reduce_smax:
3607 case Intrinsic::vector_reduce_umin:
3608 case Intrinsic::vector_reduce_umax:
3609 FoundReduction = true;
3610 continue;
3611 default:
3612 return false;
3613 }
3614 }
3615
3616 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
3617 return false;
3618
3619 WorkList.emplace_back(UI);
3620 }
3621 }
3622 return FoundReduction;
3623}
3624
3625/// This method looks for groups of shuffles acting on binops, of the form:
3626/// %x = shuffle ...
3627/// %y = shuffle ...
3628/// %a = binop %x, %y
3629/// %b = binop %x, %y
3630/// shuffle %a, %b, selectmask
3631/// We may, especially if the shuffle is wider than legal, be able to convert
3632/// the shuffle to a form where only parts of a and b need to be computed. On
3633/// architectures with no obvious "select" shuffle, this can reduce the total
3634/// number of operations if the target reports them as cheaper.
3635bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3636 auto *SVI = cast<ShuffleVectorInst>(&I);
3637 auto *VT = cast<FixedVectorType>(I.getType());
3638 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3639 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3640 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3641 VT != Op0->getType())
3642 return false;
3643
3644 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3645 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3646 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3647 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3648 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3649 auto checkSVNonOpUses = [&](Instruction *I) {
3650 if (!I || I->getOperand(0)->getType() != VT)
3651 return true;
3652 return any_of(I->users(), [&](User *U) {
3653 return U != Op0 && U != Op1 &&
3654 !(isa<ShuffleVectorInst>(U) &&
3655 (InputShuffles.contains(cast<Instruction>(U)) ||
3656 isInstructionTriviallyDead(cast<Instruction>(U))));
3657 });
3658 };
3659 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3660 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3661 return false;
3662
3663 // Collect all the uses that are shuffles that we can transform together. We
3664 // may not have a single shuffle, but a group that can all be transformed
3665 // together profitably.
3666 SmallVector<ShuffleVectorInst *> Shuffles;
3667 auto collectShuffles = [&](Instruction *I) {
3668 for (auto *U : I->users()) {
3669 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3670 if (!SV || SV->getType() != VT)
3671 return false;
3672 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3673 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3674 return false;
3675 if (!llvm::is_contained(Shuffles, SV))
3676 Shuffles.push_back(SV);
3677 }
3678 return true;
3679 };
3680 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3681 return false;
3682 // From a reduction, we need to be processing a single shuffle, otherwise the
3683 // other uses will not be lane-invariant.
3684 if (FromReduction && Shuffles.size() > 1)
3685 return false;
3686
3687 // Add any shuffle uses for the shuffles we have found, to include them in our
3688 // cost calculations.
3689 if (!FromReduction) {
3690 for (ShuffleVectorInst *SV : Shuffles) {
3691 for (auto *U : SV->users()) {
3692 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3693 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3694 Shuffles.push_back(SSV);
3695 }
3696 }
3697 }
3698
3699 // For each of the output shuffles, we try to sort all the first vector
3700 // elements to the beginning, followed by the second vector's elements at the
3701 // end. If the binops are legalized to smaller vectors, this may reduce the
3702 // total number of binops. We compute the ReconstructMask needed to convert
3703 // back to the original lane order.
3704 SmallVector<std::pair<int, int>> V1, V2;
3705 SmallVector<SmallVector<int>> OrigReconstructMasks;
3706 int MaxV1Elt = 0, MaxV2Elt = 0;
3707 unsigned NumElts = VT->getNumElements();
3708 for (ShuffleVectorInst *SVN : Shuffles) {
3709 SmallVector<int> Mask;
3710 SVN->getShuffleMask(Mask);
3711
3712 // Check the operands are the same as the original, or reversed (in which
3713 // case we need to commute the mask).
3714 Value *SVOp0 = SVN->getOperand(0);
3715 Value *SVOp1 = SVN->getOperand(1);
3716 if (isa<UndefValue>(SVOp1)) {
3717 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3718 SVOp0 = SSV->getOperand(0);
3719 SVOp1 = SSV->getOperand(1);
3720 for (int &Elem : Mask) {
3721 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3722 return false;
3723 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3724 }
3725 }
3726 if (SVOp0 == Op1 && SVOp1 == Op0) {
3727 std::swap(SVOp0, SVOp1);
3728 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3729 }
3730 if (SVOp0 != Op0 || SVOp1 != Op1)
3731 return false;
3732
3733 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3734 // take the packed values from Op0/Op1 and reconstructing to the original
3735 // order.
3736 SmallVector<int> ReconstructMask;
3737 for (unsigned I = 0; I < Mask.size(); I++) {
3738 if (Mask[I] < 0) {
3739 ReconstructMask.push_back(-1);
3740 } else if (Mask[I] < static_cast<int>(NumElts)) {
3741 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3742 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3743 return Mask[I] == A.first;
3744 });
3745 if (It != V1.end())
3746 ReconstructMask.push_back(It - V1.begin());
3747 else {
3748 ReconstructMask.push_back(V1.size());
3749 V1.emplace_back(Mask[I], V1.size());
3750 }
3751 } else {
3752 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3753 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3754 return Mask[I] - static_cast<int>(NumElts) == A.first;
3755 });
3756 if (It != V2.end())
3757 ReconstructMask.push_back(NumElts + It - V2.begin());
3758 else {
3759 ReconstructMask.push_back(NumElts + V2.size());
3760 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3761 }
3762 }
3763 }
3764
3765 // For reductions, we know that the output lane ordering doesn't alter the
3766 // result. An in-order mask can help simplify the shuffle away.
3767 if (FromReduction)
3768 sort(ReconstructMask);
3769 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3770 }
3771
3772 // If the maximum elements used from V1 and V2 are not larger than the new
3773 // vectors, the vectors are already packed and performing the optimization
3774 // again will likely not help any further. This also prevents us from getting
3775 // stuck in a cycle in case the costs do not also rule it out.
3776 if (V1.empty() || V2.empty() ||
3777 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3778 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3779 return false;
3780
3781 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3782 // shuffle of another shuffle, or not a shuffle (that is treated like an
3783 // identity shuffle).
3784 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3785 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3786 if (!SV)
3787 return M;
3788 if (isa<UndefValue>(SV->getOperand(1)))
3789 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3790 if (InputShuffles.contains(SSV))
3791 return SSV->getMaskValue(SV->getMaskValue(M));
3792 return SV->getMaskValue(M);
3793 };
3794
3795 // Attempt to sort the inputs by ascending mask values to make simpler input
3796 // shuffles and push complex shuffles down to the uses. We sort on the first
3797 // of the two input shuffle orders, to try and get at least one input into a
3798 // nice order.
3799 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3800 std::pair<int, int> Y) {
3801 int MXA = GetBaseMaskValue(A, X.first);
3802 int MYA = GetBaseMaskValue(A, Y.first);
3803 return MXA < MYA;
3804 };
3805 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3806 return SortBase(SVI0A, A, B);
3807 });
3808 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3809 return SortBase(SVI1A, A, B);
3810 });
3811 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3812 // modified order of the input shuffles.
3813 SmallVector<SmallVector<int>> ReconstructMasks;
3814 for (const auto &Mask : OrigReconstructMasks) {
3815 SmallVector<int> ReconstructMask;
3816 for (int M : Mask) {
3817 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3818 auto It = find_if(V, [M](auto A) { return A.second == M; });
3819 assert(It != V.end() && "Expected all entries in Mask");
3820 return std::distance(V.begin(), It);
3821 };
3822 if (M < 0)
3823 ReconstructMask.push_back(-1);
3824 else if (M < static_cast<int>(NumElts)) {
3825 ReconstructMask.push_back(FindIndex(V1, M));
3826 } else {
3827 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3828 }
3829 }
3830 ReconstructMasks.push_back(std::move(ReconstructMask));
3831 }
3832
3833 // Calculate the masks needed for the new input shuffles, which get padded
3834 // with undef
3835 SmallVector<int> V1A, V1B, V2A, V2B;
3836 for (unsigned I = 0; I < V1.size(); I++) {
3837 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3838 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3839 }
3840 for (unsigned I = 0; I < V2.size(); I++) {
3841 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3842 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3843 }
3844 while (V1A.size() < NumElts) {
3845 V1A.push_back(PoisonMaskElem);
3846 V1B.push_back(PoisonMaskElem);
3847 }
3848 while (V2A.size() < NumElts) {
3849 V2A.push_back(PoisonMaskElem);
3850 V2B.push_back(PoisonMaskElem);
3851 }
3852
3853 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
3854 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3855 if (!SV)
3856 return C;
3857 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
3858 ? TTI::SK_PermuteSingleSrc
3859 : TTI::SK_PermuteTwoSrc,
3860 VT, VT, SV->getShuffleMask(), CostKind);
3861 };
3862 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3863 return C +
3864 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3865 };
3866
3867 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
3868 unsigned MaxVectorSize =
3869 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3870 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
3871 if (MaxElementsInVector == 0)
3872 return false;
3873 // When there are multiple shufflevector operations on the same input,
3874 // especially when the vector length is larger than the register size,
3875 // identical shuffle patterns may occur across different groups of elements.
3876 // To avoid overestimating the cost by counting these repeated shuffles more
3877 // than once, we only account for unique shuffle patterns. This adjustment
3878 // prevents inflated costs in the cost model for wide vectors split into
3879 // several register-sized groups.
3880 std::set<SmallVector<int, 4>> UniqueShuffles;
3881 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3882 // Compute the cost for performing the shuffle over the full vector.
3883 auto ShuffleCost =
3884 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3885 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
3886 if (NumFullVectors < 2)
3887 return C + ShuffleCost;
3888 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
3889 unsigned NumUniqueGroups = 0;
3890 unsigned NumGroups = Mask.size() / MaxElementsInVector;
3891 // For each group of MaxElementsInVector contiguous elements,
3892 // collect their shuffle pattern and insert into the set of unique patterns.
3893 for (unsigned I = 0; I < NumFullVectors; ++I) {
3894 for (unsigned J = 0; J < MaxElementsInVector; ++J)
3895 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
3896 if (UniqueShuffles.insert(SubShuffle).second)
3897 NumUniqueGroups += 1;
3898 }
3899 return C + ShuffleCost * NumUniqueGroups / NumGroups;
3900 };
3901 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
3902 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3903 if (!SV)
3904 return C;
3905 SmallVector<int, 16> Mask;
3906 SV->getShuffleMask(Mask);
3907 return AddShuffleMaskAdjustedCost(C, Mask);
3908 };
3909 // Check that input consists of ShuffleVectors applied to the same input
3910 auto AllShufflesHaveSameOperands =
3911 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
3912 if (InputShuffles.size() < 2)
3913 return false;
3914 ShuffleVectorInst *FirstSV =
3915 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
3916 if (!FirstSV)
3917 return false;
3918
3919 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
3920 return std::all_of(
3921 std::next(InputShuffles.begin()), InputShuffles.end(),
3922 [&](Instruction *I) {
3923 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
3924 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
3925 });
3926 };
3927
3928 // Get the costs of the shuffles + binops before and after with the new
3929 // shuffle masks.
3930 InstructionCost CostBefore =
3931 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
3932 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
3933 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
3934 InstructionCost(0), AddShuffleCost);
3935 if (AllShufflesHaveSameOperands(InputShuffles)) {
3936 UniqueShuffles.clear();
3937 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3938 InstructionCost(0), AddShuffleAdjustedCost);
3939 } else {
3940 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3941 InstructionCost(0), AddShuffleCost);
3942 }
3943
3944 // The new binops will be unused for lanes past the used shuffle lengths.
3945 // These types attempt to get the correct cost for that from the target.
3946 FixedVectorType *Op0SmallVT =
3947 FixedVectorType::get(VT->getScalarType(), V1.size());
3948 FixedVectorType *Op1SmallVT =
3949 FixedVectorType::get(VT->getScalarType(), V2.size());
3950 InstructionCost CostAfter =
3951 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
3952 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
3953 UniqueShuffles.clear();
3954 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
3955 InstructionCost(0), AddShuffleMaskAdjustedCost);
3956 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
3957 CostAfter +=
3958 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
3959 InstructionCost(0), AddShuffleMaskCost);
3960
3961 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
3962 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
3963 << " vs CostAfter: " << CostAfter << "\n");
3964 if (CostBefore < CostAfter ||
3965 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
3966 return false;
3967
3968 // The cost model has passed, create the new instructions.
3969 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
3970 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3971 if (!SV)
3972 return I;
3973 if (isa<UndefValue>(SV->getOperand(1)))
3974 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3975 if (InputShuffles.contains(SSV))
3976 return SSV->getOperand(Op);
3977 return SV->getOperand(Op);
3978 };
3979 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
3980 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
3981 GetShuffleOperand(SVI0A, 1), V1A);
3982 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
3983 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
3984 GetShuffleOperand(SVI0B, 1), V1B);
3985 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
3986 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
3987 GetShuffleOperand(SVI1A, 1), V2A);
3988 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
3989 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
3990 GetShuffleOperand(SVI1B, 1), V2B);
3991 Builder.SetInsertPoint(Op0);
3992 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
3993 NSV0A, NSV0B);
3994 if (auto *I = dyn_cast<Instruction>(NOp0))
3995 I->copyIRFlags(Op0, true);
3996 Builder.SetInsertPoint(Op1);
3997 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
3998 NSV1A, NSV1B);
3999 if (auto *I = dyn_cast<Instruction>(NOp1))
4000 I->copyIRFlags(Op1, true);
4001
4002 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4003 Builder.SetInsertPoint(Shuffles[S]);
4004 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4005 replaceValue(*Shuffles[S], *NSV, false);
4006 }
4007
4008 Worklist.pushValue(NSV0A);
4009 Worklist.pushValue(NSV0B);
4010 Worklist.pushValue(NSV1A);
4011 Worklist.pushValue(NSV1B);
4012 return true;
4013}
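// Assumed shape of the select-shuffle pattern handled above: the final mask
// selects lanes 0-3 from %a and lanes 4-7 from %b, so only half of each binop
// result is live and the binops can be narrowed when the target's costs agree.
//
//   %x = shufflevector ...            ; input shuffle
//   %y = shufflevector ...            ; input shuffle
//   %a = add <8 x i32> %x, %y
//   %b = mul <8 x i32> %x, %y
//   %r = shufflevector <8 x i32> %a, <8 x i32> %b,
//        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>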
4014
4015/// Check if instruction depends on ZExt and this ZExt can be moved after the
4016/// instruction. Move ZExt if it is profitable. For example:
4017/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4018/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4019/// Cost model calculations take into account whether zext(x) has other users and
4020/// whether it can be propagated through them too.
4021bool VectorCombine::shrinkType(Instruction &I) {
4022 Value *ZExted, *OtherOperand;
4023 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4024 m_Value(OtherOperand))) &&
4025 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4026 return false;
4027
4028 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4029
4030 auto *BigTy = cast<FixedVectorType>(I.getType());
4031 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4032 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4033
4034 if (I.getOpcode() == Instruction::LShr) {
4035 // Check that the shift amount is less than the number of bits in the
4036 // smaller type. Otherwise, the smaller lshr will return a poison value.
4037 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4038 if (ShAmtKB.getMaxValue().uge(BW))
4039 return false;
4040 } else {
4041 // Check that the expression overall uses at most the same number of bits as
4042 // ZExted
4043 KnownBits KB = computeKnownBits(&I, *DL);
4044 if (KB.countMaxActiveBits() > BW)
4045 return false;
4046 }
4047
4048 // Calculate costs of leaving current IR as it is and moving ZExt operation
4049 // later, along with adding truncates if needed
4050 InstructionCost ZExtCost = TTI.getCastInstrCost(
4051 Instruction::ZExt, BigTy, SmallTy,
4052 TargetTransformInfo::CastContextHint::None, CostKind);
4053 InstructionCost CurrentCost = ZExtCost;
4054 InstructionCost ShrinkCost = 0;
4055
4056 // Calculate total cost and check that we can propagate through all ZExt users
4057 for (User *U : ZExtOperand->users()) {
4058 auto *UI = cast<Instruction>(U);
4059 if (UI == &I) {
4060 CurrentCost +=
4061 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4062 ShrinkCost +=
4063 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4064 ShrinkCost += ZExtCost;
4065 continue;
4066 }
4067
4068 if (!Instruction::isBinaryOp(UI->getOpcode()))
4069 return false;
4070
4071 // Check if we can propagate ZExt through its other users
4072 KnownBits KB = computeKnownBits(UI, *DL);
4073 if (KB.countMaxActiveBits() > BW)
4074 return false;
4075
4076 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4077 ShrinkCost +=
4078 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4079 ShrinkCost += ZExtCost;
4080 }
4081
4082 // If the other instruction operand is not a constant, we'll need to
4083 // generate a truncate instruction, so we have to adjust the cost.
4084 if (!isa<Constant>(OtherOperand))
4085 ShrinkCost += TTI.getCastInstrCost(
4086 Instruction::Trunc, SmallTy, BigTy,
4087 TargetTransformInfo::CastContextHint::None, CostKind);
4088
4089 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4090 // towards modifying the IR because shrinking opens opportunities for other
4091 // shrinking optimisations.
4092 if (ShrinkCost > CurrentCost)
4093 return false;
4094
4095 Builder.SetInsertPoint(&I);
4096 Value *Op0 = ZExted;
4097 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4098 // Keep the order of operands the same
4099 if (I.getOperand(0) == OtherOperand)
4100 std::swap(Op0, Op1);
4101 Value *NewBinOp =
4102 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4103 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4104 cast<Instruction>(NewBinOp)->copyMetadata(I);
4105 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4106 replaceValue(I, *NewZExtr);
4107 return true;
4108}
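// Assumed concrete instance of the shrinkType fold described above:
//
//   %zx = zext <8 x i16> %x to <8 x i32>
//   %r  = and <8 x i32> %zx, %y
//   -->
//   %ty = trunc <8 x i32> %y to <8 x i16>
//   %na = and <8 x i16> %x, %ty
//   %r  = zext <8 x i16> %na to <8 x i32>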
4109
4110/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4111/// shuffle (DstVec, SrcVec, Mask)
4112bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4113 Value *DstVec, *SrcVec;
4114 uint64_t ExtIdx, InsIdx;
4115 if (!match(&I,
4116 m_InsertElt(m_Value(DstVec),
4117 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4118 m_ConstantInt(InsIdx))))
4119 return false;
4120
4121 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4122 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4123 // We can try combining vectors with different element sizes.
4124 if (!DstVecTy || !SrcVecTy ||
4125 SrcVecTy->getElementType() != DstVecTy->getElementType())
4126 return false;
4127
4128 unsigned NumDstElts = DstVecTy->getNumElements();
4129 unsigned NumSrcElts = SrcVecTy->getNumElements();
4130 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4131 return false;
4132
4133 // Insertion into poison is a cheaper single operand shuffle.
4134 TargetTransformInfo::ShuffleKind SK;
4135 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4136
4137 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4138 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4139 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4140 if (NeedDstSrcSwap) {
4141 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4142 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4143 Mask[InsIdx] = 0;
4144 else
4145 Mask[InsIdx] = ExtIdx;
4146 std::swap(DstVec, SrcVec);
4147 } else {
4148 SK = TargetTransformInfo::SK_PermuteTwoSrc;
4149 std::iota(Mask.begin(), Mask.end(), 0);
4150 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4151 Mask[InsIdx] = NumDstElts;
4152 else
4153 Mask[InsIdx] = ExtIdx + NumDstElts;
4154 }
4155
4156 // Cost
4157 auto *Ins = cast<InsertElementInst>(&I);
4158 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4159 InstructionCost InsCost =
4160 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4161 InstructionCost ExtCost =
4162 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4163 InstructionCost OldCost = ExtCost + InsCost;
4164
4165 InstructionCost NewCost = 0;
4166 SmallVector<int> ExtToVecMask;
4167 if (!NeedExpOrNarrow) {
4168 // Ignore 'free' identity insertion shuffle.
4169 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4170 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4171 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4172 nullptr, {DstVec, SrcVec});
4173 } else {
4174 // When creating a length-changing vector, always create it with a mask whose
4175 // first element is ExtIdx, so that the first element of the vector
4176 // being created is always the element to be extracted.
4177 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4178 if (IsExtIdxInBounds)
4179 ExtToVecMask[ExtIdx] = ExtIdx;
4180 else
4181 ExtToVecMask[0] = ExtIdx;
4182 // Add cost for expanding or narrowing
4183 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4184 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4185 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4186 }
4187
4188 if (!Ext->hasOneUse())
4189 NewCost += ExtCost;
4190
4191 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
4192 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4193 << "\n");
4194
4195 if (OldCost < NewCost)
4196 return false;
4197
4198 if (NeedExpOrNarrow) {
4199 if (!NeedDstSrcSwap)
4200 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4201 else
4202 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4203 }
4204
4205 // Canonicalize undef param to RHS to help further folds.
4206 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4207 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4208 std::swap(DstVec, SrcVec);
4209 }
4210
4211 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4212 replaceValue(I, *Shuf);
4213
4214 return true;
4215}
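// Assumed example for the insert/extract fold above, with matching source and
// destination widths:
//
//   %e = extractelement <4 x float> %src, i64 2
//   %r = insertelement <4 x float> %dst, float %e, i64 0
//   -->
//   %r = shufflevector <4 x float> %dst, <4 x float> %src,
//                      <4 x i32> <i32 6, i32 1, i32 2, i32 3>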
4216
4217/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4218/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4219/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4220/// before casting it back into `<vscale x 16 x i32>`.
4221bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4222 const APInt *SplatVal0, *SplatVal1;
4223 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4224 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4225 return false;
4226
4227 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4228 << "\n");
4229
4230 auto *VTy =
4231 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4232 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4233 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4234
4235 // We use <= rather than < here so that we bail out when the costs of the
4236 // interleave2 intrinsic and the bitcast are both invalid. Even if they both
4237 // have valid and equal costs, it's probably not a good idea to emit a
4238 // high-cost constant splat.
4239 if (TTI.getInstructionCost(&I, CostKind) <=
4240 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4241 TTI::CastContextHint::None, CostKind)) {
4242 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4243 << *I.getType() << " is too high.\n");
4244 return false;
4245 }
4246
4247 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4248 NewSplatVal <<= Width;
4249 NewSplatVal |= SplatVal0->zext(Width * 2);
4250 auto *NewSplat = ConstantVector::getSplat(
4251 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4252
4253 IRBuilder<> Builder(&I);
4254 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4255 return true;
4256}
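// Worked example of the splat math above (assumed values): with Width = 32,
// SplatVal0 = 666 (0x29A) and SplatVal1 = 777 (0x309), NewSplatVal becomes
// (777 << 32) | 666 = 0x000003090000029A, which is splatted at the widened
// element type and bitcast back to the interleaved result type.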
4257
4258// Attempt to shrink loads that are only used by shufflevector instructions.
4259bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4260 auto *OldLoad = dyn_cast<LoadInst>(&I);
4261 if (!OldLoad || !OldLoad->isSimple())
4262 return false;
4263
4264 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4265 if (!OldLoadTy)
4266 return false;
4267
4268 unsigned const OldNumElements = OldLoadTy->getNumElements();
4269
4270 // Search all uses of load. If all uses are shufflevector instructions, and
4271 // the second operands are all poison values, find the minimum and maximum
4272 // indices of the vector elements referenced by all shuffle masks.
4273 // Otherwise return `std::nullopt`.
4274 using IndexRange = std::pair<int, int>;
4275 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4276 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4277 for (llvm::Use &Use : I.uses()) {
4278 // Ensure all uses match the required pattern.
4279 User *Shuffle = Use.getUser();
4280 ArrayRef<int> Mask;
4281
4282 if (!match(Shuffle,
4283 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4284 return std::nullopt;
4285
4286 // Ignore shufflevector instructions that have no uses.
4287 if (Shuffle->use_empty())
4288 continue;
4289
4290 // Find the min and max indices used by the shufflevector instruction.
4291 for (int Index : Mask) {
4292 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4293 OutputRange.first = std::min(Index, OutputRange.first);
4294 OutputRange.second = std::max(Index, OutputRange.second);
4295 }
4296 }
4297 }
4298
4299 if (OutputRange.second < OutputRange.first)
4300 return std::nullopt;
4301
4302 return OutputRange;
4303 };
4304
4305 // Get the range of vector elements used by shufflevector instructions.
4306 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4307 unsigned const NewNumElements = Indices->second + 1u;
4308
4309 // If the range of vector elements is smaller than the full load, attempt
4310 // to create a smaller load.
4311 if (NewNumElements < OldNumElements) {
4312 IRBuilder Builder(&I);
4313 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4314
4315 // Calculate costs of old and new ops.
4316 Type *ElemTy = OldLoadTy->getElementType();
4317 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4318 Value *PtrOp = OldLoad->getPointerOperand();
4319
4320 InstructionCost OldCost = TTI.getMemoryOpCost(
4321 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4322 OldLoad->getPointerAddressSpace(), CostKind);
4323 InstructionCost NewCost =
4324 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4325 OldLoad->getPointerAddressSpace(), CostKind);
4326
4327 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4328 SmallVector<UseEntry, 4u> NewUses;
4329 unsigned const MaxIndex = NewNumElements * 2u;
4330
4331 for (llvm::Use &Use : I.uses()) {
4332 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4333 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4334
4335 // Create entry for new use.
4336 NewUses.push_back({Shuffle, OldMask});
4337
4338 // Validate mask indices.
4339 for (int Index : OldMask) {
4340 if (Index >= static_cast<int>(MaxIndex))
4341 return false;
4342 }
4343
4344 // Update costs.
4345 OldCost +=
4346 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4347 OldLoadTy, OldMask, CostKind);
4348 NewCost +=
4349 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4350 NewLoadTy, OldMask, CostKind);
4351 }
4352
4353 LLVM_DEBUG(
4354 dbgs() << "Found a load used only by shufflevector instructions: "
4355 << I << "\n OldCost: " << OldCost
4356 << " vs NewCost: " << NewCost << "\n");
4357
4358 if (OldCost < NewCost || !NewCost.isValid())
4359 return false;
4360
4361 // Create new load of smaller vector.
4362 auto *NewLoad = cast<LoadInst>(
4363 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4364 NewLoad->copyMetadata(I);
4365
4366 // Replace all uses.
4367 for (UseEntry &Use : NewUses) {
4368 ShuffleVectorInst *Shuffle = Use.first;
4369 std::vector<int> &NewMask = Use.second;
4370
4371 Builder.SetInsertPoint(Shuffle);
4372 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4373 Value *NewShuffle = Builder.CreateShuffleVector(
4374 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4375
4376 replaceValue(*Shuffle, *NewShuffle, false);
4377 }
4378
4379 return true;
4380 }
4381 }
4382 return false;
4383}
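// Assumed example for shrinkLoadForShuffles: only elements 0..3 of the wide
// load are referenced by the shuffle masks, so a narrower load suffices.
//
//   %v = load <8 x i32>, ptr %p
//   %s = shufflevector <8 x i32> %v, <8 x i32> poison,
//                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   -->
//   %v4 = load <4 x i32>, ptr %p
//   %s  = shufflevector <4 x i32> %v4, <4 x i32> poison,
//                       <4 x i32> <i32 3, i32 2, i32 1, i32 0>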
4384
4385// Attempt to narrow a phi of shufflevector instructions where the two incoming
4386// values have the same operands but different masks. If the two shuffle masks
4387// are offsets of one another we can use one branch to rotate the incoming
4388// vector and perform one larger shuffle after the phi.
4389bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4390 auto *Phi = dyn_cast<PHINode>(&I);
4391 if (!Phi || Phi->getNumIncomingValues() != 2u)
4392 return false;
4393
4394 Value *Op = nullptr;
4395 ArrayRef<int> Mask0;
4396 ArrayRef<int> Mask1;
4397
4398 if (!match(Phi->getOperand(0u),
4399 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4400 !match(Phi->getOperand(1u),
4401 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4402 return false;
4403
4404 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4405
4406 // Ensure result vectors are wider than the argument vector.
4407 auto *InputVT = cast<FixedVectorType>(Op->getType());
4408 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4409 auto const InputNumElements = InputVT->getNumElements();
4410
4411 if (InputNumElements >= ResultVT->getNumElements())
4412 return false;
4413
4414 // Take the difference of the two shuffle masks at each index. Ignore poison
4415 // values at the same index in both masks.
4416 SmallVector<int, 16> NewMask;
4417 NewMask.reserve(Mask0.size());
4418
4419 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4420 if (M0 >= 0 && M1 >= 0)
4421 NewMask.push_back(M0 - M1);
4422 else if (M0 == -1 && M1 == -1)
4423 continue;
4424 else
4425 return false;
4426 }
4427
4428 // Ensure all elements of the new mask are equal. If the difference between
4429 // the incoming mask elements is the same, the two must be constant offsets
4430 // of one another.
4431 if (NewMask.empty() || !all_equal(NewMask))
4432 return false;
4433
4434 // Create new mask using difference of the two incoming masks.
4435 int MaskOffset = NewMask[0u];
4436 unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
4437 NewMask.clear();
4438
4439 for (unsigned I = 0u; I < InputNumElements; ++I) {
4440 NewMask.push_back(Index);
4441 Index = (Index + 1u) % InputNumElements;
4442 }
4443
4444 // Calculate costs for worst cases and compare.
4445 auto const Kind = TTI::SK_PermuteSingleSrc;
4446 auto OldCost =
4447 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4448 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4449 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4450 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4451
4452 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4453 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4454 << "\n");
4455
4456 if (NewCost > OldCost)
4457 return false;
4458
4459 // Create new shuffles and narrowed phi.
4460 auto Builder = IRBuilder(Shuf);
4461 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4462 auto *PoisonVal = PoisonValue::get(InputVT);
4463 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4464 Worklist.push(cast<Instruction>(NewShuf0));
4465
4466 Builder.SetInsertPoint(Phi);
4467 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4468 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4469 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4470 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4471
4472 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4473 PoisonVal = PoisonValue::get(NewPhi->getType());
4474 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4475
4476 replaceValue(*Phi, *NewShuf1);
4477 return true;
4478}
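// Assumed illustration of the mask relationship required above: with a
// <4 x i32> input and <8 x i32> results, Mask0 = <2, 3, p, p, p, p, p, p> and
// Mask1 = <0, 1, p, p, p, p, p, p> differ by a constant offset of 2 at every
// non-poison position, so the incoming value on the first edge is rotated with
// the mask <2, 3, 0, 1> and a single wide shuffle with Mask1 is emitted after
// the narrowed phi.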
4479
4480/// This is the entry point for all transforms. Pass manager differences are
4481/// handled in the callers of this function.
4482bool VectorCombine::run() {
4483 if (DisableVectorCombine)
4484 return false;
4485
4486 // Don't attempt vectorization if the target does not support vectors.
4487 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4488 return false;
4489
4490 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4491
4492 auto FoldInst = [this](Instruction &I) {
4493 Builder.SetInsertPoint(&I);
4494 bool IsVectorType = isa<VectorType>(I.getType());
4495 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4496 auto Opcode = I.getOpcode();
4497
4498 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4499
4500 // These folds should be beneficial regardless of when this pass is run
4501 // in the optimization pipeline.
4502 // The type checking is for run-time efficiency. We can avoid wasting time
4503 // dispatching to folding functions if there's no chance of matching.
4504 if (IsFixedVectorType) {
4505 switch (Opcode) {
4506 case Instruction::InsertElement:
4507 if (vectorizeLoadInsert(I))
4508 return true;
4509 break;
4510 case Instruction::ShuffleVector:
4511 if (widenSubvectorLoad(I))
4512 return true;
4513 break;
4514 default:
4515 break;
4516 }
4517 }
4518
4519 // This transform works with scalable and fixed vectors
4520 // TODO: Identify and allow other scalable transforms
4521 if (IsVectorType) {
4522 if (scalarizeOpOrCmp(I))
4523 return true;
4524 if (scalarizeLoadExtract(I))
4525 return true;
4526 if (scalarizeExtExtract(I))
4527 return true;
4528 if (scalarizeVPIntrinsic(I))
4529 return true;
4530 if (foldInterleaveIntrinsics(I))
4531 return true;
4532 }
4533
4534 if (Opcode == Instruction::Store)
4535 if (foldSingleElementStore(I))
4536 return true;
4537
4538 // If this is an early pipeline invocation of this pass, we are done.
4539 if (TryEarlyFoldsOnly)
4540 return false;
4541
4542 // Otherwise, try folds that improve codegen but may interfere with
4543 // early IR canonicalizations.
4544 // The type checking is for run-time efficiency. We can avoid wasting time
4545 // dispatching to folding functions if there's no chance of matching.
4546 if (IsFixedVectorType) {
4547 switch (Opcode) {
4548 case Instruction::InsertElement:
4549 if (foldInsExtFNeg(I))
4550 return true;
4551 if (foldInsExtBinop(I))
4552 return true;
4553 if (foldInsExtVectorToShuffle(I))
4554 return true;
4555 break;
4556 case Instruction::ShuffleVector:
4557 if (foldPermuteOfBinops(I))
4558 return true;
4559 if (foldShuffleOfBinops(I))
4560 return true;
4561 if (foldShuffleOfSelects(I))
4562 return true;
4563 if (foldShuffleOfCastops(I))
4564 return true;
4565 if (foldShuffleOfShuffles(I))
4566 return true;
4567 if (foldShuffleOfIntrinsics(I))
4568 return true;
4569 if (foldSelectShuffle(I))
4570 return true;
4571 if (foldShuffleToIdentity(I))
4572 return true;
4573 break;
4574 case Instruction::Load:
4575 if (shrinkLoadForShuffles(I))
4576 return true;
4577 break;
4578 case Instruction::BitCast:
4579 if (foldBitcastShuffle(I))
4580 return true;
4581 break;
4582 case Instruction::And:
4583 case Instruction::Or:
4584 case Instruction::Xor:
4585 if (foldBitOpOfCastops(I))
4586 return true;
4587 if (foldBitOpOfCastConstant(I))
4588 return true;
4589 break;
4590 case Instruction::PHI:
4591 if (shrinkPhiOfShuffles(I))
4592 return true;
4593 break;
4594 default:
4595 if (shrinkType(I))
4596 return true;
4597 break;
4598 }
4599 } else {
4600 switch (Opcode) {
4601 case Instruction::Call:
4602 if (foldShuffleFromReductions(I))
4603 return true;
4604 if (foldCastFromReductions(I))
4605 return true;
4606 break;
4607 case Instruction::ExtractElement:
4608 if (foldShuffleChainsToReduce(I))
4609 return true;
4610 break;
4611 case Instruction::ICmp:
4612 case Instruction::FCmp:
4613 if (foldExtractExtract(I))
4614 return true;
4615 break;
4616 case Instruction::Or:
4617 if (foldConcatOfBoolMasks(I))
4618 return true;
4619 [[fallthrough]];
4620 default:
4621 if (Instruction::isBinaryOp(Opcode)) {
4622 if (foldExtractExtract(I))
4623 return true;
4624 if (foldExtractedCmps(I))
4625 return true;
4626 if (foldBinopOfReductions(I))
4627 return true;
4628 }
4629 break;
4630 }
4631 }
4632 return false;
4633 };
4634
4635 bool MadeChange = false;
4636 for (BasicBlock &BB : F) {
4637 // Ignore unreachable basic blocks.
4638 if (!DT.isReachableFromEntry(&BB))
4639 continue;
4640 // Use early increment range so that we can erase instructions in loop.
4641 // make_early_inc_range is not applicable here, as the next iterator may
4642 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4643 // We manually maintain the next instruction and update it when it is about
4644 // to be deleted.
4645 Instruction *I = &BB.front();
4646 while (I) {
4647 NextInst = I->getNextNode();
4648 if (!I->isDebugOrPseudoInst())
4649 MadeChange |= FoldInst(*I);
4650 I = NextInst;
4651 }
4652 }
4653
4654 NextInst = nullptr;
4655
4656 while (!Worklist.isEmpty()) {
4657 Instruction *I = Worklist.removeOne();
4658 if (!I)
4659 continue;
4660
4661 if (isInstructionTriviallyDead(I)) {
4662 eraseInstruction(*I);
4663 continue;
4664 }
4665
4666 MadeChange |= FoldInst(*I);
4667 }
4668
4669 return MadeChange;
4670}
4671
4672PreservedAnalyses VectorCombinePass::run(Function &F,
4673 FunctionAnalysisManager &FAM) {
4674 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4675 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4676 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4677 AAResults &AA = FAM.getResult<AAManager>(F);
4678 const DataLayout *DL = &F.getDataLayout();
4679 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4680 TryEarlyFoldsOnly);
4681 if (!Combiner.run())
4682 return PreservedAnalyses::all();
4683 PreservedAnalyses PA;
4684 PA.preserveSet<CFGAnalyses>();
4685 return PA;
4686}
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static InstLane lookThroughShuffles(Use *U, int Lane)
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
std::pair< Use *, int > InstLane
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
bool isFPPredicate() const
Definition InstrTypes.h:784
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:34
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2571
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2559
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1864
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2637
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1513
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2238
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1931
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2263
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2494
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2204
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1847
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1492
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2593
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1551
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1860
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1795
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1573
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ None
The cast is not used with a load/store of any kind.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:292
Value * getOperand(unsigned i) const
Definition User.h:232
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:759
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:359
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:311
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:824
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2047
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2461
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:627
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:416
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1721
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1633
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:431
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
constexpr unsigned BitWidth
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1747
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1886
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2097
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:289
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:138