1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
43static cl::opt<bool> EnableWideActiveLaneMask(
44    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
48 VPlanPtr &Plan,
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
54 Plan->getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
95 Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 VPIRMetadata(*Store), Ingredient.getDebugLoc());
101      } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
102        NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
103 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
104 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
105 if (VectorID == Intrinsic::not_intrinsic)
106 return false;
107 NewRecipe = new VPWidenIntrinsicRecipe(
108 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
109 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
110 CI->getDebugLoc());
111 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
112 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
113 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
114 NewRecipe = new VPWidenCastRecipe(
115 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
116 } else {
117 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
118 }
119 }
120
121 NewRecipe->insertBefore(&Ingredient);
122 if (NewRecipe->getNumDefinedValues() == 1)
123 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
124 else
125 assert(NewRecipe->getNumDefinedValues() == 0 &&
126             "Only recipes with zero or one defined values expected");
127 Ingredient.eraseFromParent();
128 }
129 }
130 return true;
131}
132
133static bool sinkScalarOperands(VPlan &Plan) {
134 auto Iter = vp_depth_first_deep(Plan.getEntry());
135 bool Changed = false;
136 // First, collect the operands of all recipes in replicate blocks as seeds for
137 // sinking.
138  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
139  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
140    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
141 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
142 continue;
143 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
144 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
145 continue;
146 for (auto &Recipe : *VPBB) {
147 for (VPValue *Op : Recipe.operands())
148 if (auto *Def =
149 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
150 WorkList.insert({VPBB, Def});
151 }
152 }
153
154 bool ScalarVFOnly = Plan.hasScalarVFOnly();
155 // Try to sink each replicate or scalar IV steps recipe in the worklist.
156 for (unsigned I = 0; I != WorkList.size(); ++I) {
157 VPBasicBlock *SinkTo;
158 VPSingleDefRecipe *SinkCandidate;
159 std::tie(SinkTo, SinkCandidate) = WorkList[I];
160 if (SinkCandidate->getParent() == SinkTo ||
161 SinkCandidate->mayHaveSideEffects() ||
162 SinkCandidate->mayReadOrWriteMemory())
163 continue;
164 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
165 if (!ScalarVFOnly && RepR->isSingleScalar())
166 continue;
167 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
168 continue;
169
170 bool NeedsDuplicating = false;
171 // All recipe users of the sink candidate must be in the same block SinkTo
172    // or all users outside of SinkTo must be uniform-after-vectorization
173    // (i.e., only the first lane is used). In the latter case, we need to
174    // duplicate SinkCandidate.
175 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
176 SinkCandidate](VPUser *U) {
177 auto *UI = cast<VPRecipeBase>(U);
178 if (UI->getParent() == SinkTo)
179 return true;
180 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
181 // We only know how to duplicate VPReplicateRecipes and
182 // VPScalarIVStepsRecipes for now.
183 return NeedsDuplicating &&
184             isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
185    };
186 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
187 continue;
188
189 if (NeedsDuplicating) {
190 if (ScalarVFOnly)
191 continue;
192 VPSingleDefRecipe *Clone;
193 if (auto *SinkCandidateRepR =
194 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
195 // TODO: Handle converting to uniform recipes as separate transform,
196 // then cloning should be sufficient here.
197 Instruction *I = SinkCandidate->getUnderlyingInstr();
198 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
199 nullptr /*Mask*/, *SinkCandidateRepR);
200 // TODO: add ".cloned" suffix to name of Clone's VPValue.
201 } else {
202 Clone = SinkCandidate->clone();
203 }
204
205 Clone->insertBefore(SinkCandidate);
206 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
207 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
208 });
209 }
210 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
211 for (VPValue *Op : SinkCandidate->operands())
212 if (auto *Def =
213 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
214 WorkList.insert({SinkTo, Def});
215 Changed = true;
216 }
217 return Changed;
218}
219
220/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
221/// the mask.
222static VPValue *getPredicatedMask(VPRegionBlock *R) {
223  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
224 if (!EntryBB || EntryBB->size() != 1 ||
225 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
226 return nullptr;
227
228 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
229}
230
231/// If \p R is a triangle region, return the 'then' block of the triangle.
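/// A rough sketch of such a region (block names are illustrative only):
///
///        entry
///        |    \
///        |     then
///        |    /
///       merge   (the region's exiting block)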
232static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
233  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
234 if (EntryBB->getNumSuccessors() != 2)
235 return nullptr;
236
237 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
238 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
239 if (!Succ0 || !Succ1)
240 return nullptr;
241
242 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
243 return nullptr;
244 if (Succ0->getSingleSuccessor() == Succ1)
245 return Succ0;
246 if (Succ1->getSingleSuccessor() == Succ0)
247 return Succ1;
248 return nullptr;
249}
250
251// Merge replicate regions in their successor region, if a replicate region
252// is connected to a successor replicate region with the same predicate by a
253// single, empty VPBasicBlock.
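//
// A rough sketch of the effect (region and block names are illustrative):
// given
//   pred.load (mask %m) -> empty block -> pred.store (mask %m)
// the recipes of pred.load's "then" block are moved into pred.store's "then"
// block, its VPPredInstPHIRecipes are moved to the second merge block, and the
// first region is disconnected.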
254static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
255  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
256
257 // Collect replicate regions followed by an empty block, followed by another
258  // replicate region with matching masks, to process up front. This avoids
259 // iterator invalidation issues while merging regions.
260  SmallVector<VPRegionBlock *, 8> WorkList;
261  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
262           vp_depth_first_deep(Plan.getEntry()))) {
263 if (!Region1->isReplicator())
264 continue;
265 auto *MiddleBasicBlock =
266 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
267 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
268 continue;
269
270 auto *Region2 =
271 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
272 if (!Region2 || !Region2->isReplicator())
273 continue;
274
275 VPValue *Mask1 = getPredicatedMask(Region1);
276 VPValue *Mask2 = getPredicatedMask(Region2);
277 if (!Mask1 || Mask1 != Mask2)
278 continue;
279
280    assert(Mask1 && Mask2 && "both regions must have conditions");
281 WorkList.push_back(Region1);
282 }
283
284 // Move recipes from Region1 to its successor region, if both are triangles.
285 for (VPRegionBlock *Region1 : WorkList) {
286 if (TransformedRegions.contains(Region1))
287 continue;
288 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
289 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
290
291 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
292 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
293 if (!Then1 || !Then2)
294 continue;
295
296 // Note: No fusion-preventing memory dependencies are expected in either
297 // region. Such dependencies should be rejected during earlier dependence
298 // checks, which guarantee accesses can be re-ordered for vectorization.
299 //
300 // Move recipes to the successor region.
301 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
302 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
303
304 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
305 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
306
307 // Move VPPredInstPHIRecipes from the merge block to the successor region's
308 // merge block. Update all users inside the successor region to use the
309 // original values.
310 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
311 VPValue *PredInst1 =
312 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
313 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
314 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
315 return cast<VPRecipeBase>(&U)->getParent() == Then2;
316 });
317
318 // Remove phi recipes that are unused after merging the regions.
319 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
320 Phi1ToMove.eraseFromParent();
321 continue;
322 }
323 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
324 }
325
326 // Remove the dead recipes in Region1's entry block.
327 for (VPRecipeBase &R :
328 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
329 R.eraseFromParent();
330
331 // Finally, remove the first region.
332 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
333 VPBlockUtils::disconnectBlocks(Pred, Region1);
334 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
335 }
336 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
337 TransformedRegions.insert(Region1);
338 }
339
340 return !TransformedRegions.empty();
341}
342
343static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
344                                            VPlan &Plan) {
345 Instruction *Instr = PredRecipe->getUnderlyingInstr();
346 // Build the triangular if-then region.
347 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
348 assert(Instr->getParent() && "Predicated instruction not in any basic block");
349 auto *BlockInMask = PredRecipe->getMask();
350 auto *MaskDef = BlockInMask->getDefiningRecipe();
351 auto *BOMRecipe = new VPBranchOnMaskRecipe(
352 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
353 auto *Entry =
354 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
355
356 // Replace predicated replicate recipe with a replicate recipe without a
357 // mask but in the replicate region.
358 auto *RecipeWithoutMask = new VPReplicateRecipe(
359 PredRecipe->getUnderlyingInstr(),
360 make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
361 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
362 auto *Pred =
363 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
364
365 VPPredInstPHIRecipe *PHIRecipe = nullptr;
366 if (PredRecipe->getNumUsers() != 0) {
367 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
368 RecipeWithoutMask->getDebugLoc());
369 PredRecipe->replaceAllUsesWith(PHIRecipe);
370 PHIRecipe->setOperand(0, RecipeWithoutMask);
371 }
372 PredRecipe->eraseFromParent();
373 auto *Exiting =
374 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
375  VPRegionBlock *Region =
376      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
377
378 // Note: first set Entry as region entry and then connect successors starting
379 // from it in order, to propagate the "parent" of each VPBasicBlock.
380 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
381 VPBlockUtils::connectBlocks(Pred, Exiting);
382
383 return Region;
384}
385
386static void addReplicateRegions(VPlan &Plan) {
387  SmallVector<VPReplicateRecipe *> WorkList;
388  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
389           vp_depth_first_deep(Plan.getEntry()))) {
390 for (VPRecipeBase &R : *VPBB)
391 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
392 if (RepR->isPredicated())
393 WorkList.push_back(RepR);
394 }
395 }
396
397 unsigned BBNum = 0;
398 for (VPReplicateRecipe *RepR : WorkList) {
399 VPBasicBlock *CurrentBlock = RepR->getParent();
400 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
401
402 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
403 SplitBlock->setName(
404 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
405 // Record predicated instructions for above packing optimizations.
406    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
407    Region->setParent(CurrentBlock->getParent());
409
410 VPRegionBlock *ParentRegion = Region->getParent();
411 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
412 ParentRegion->setExiting(SplitBlock);
413 }
414}
415
416/// Remove redundant VPBasicBlocks by merging them into their predecessor if
417/// the predecessor has a single successor.
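/// E.g., if block A has the single successor B and B has the single
/// predecessor A, B's recipes are appended to A and B is disconnected (block
/// names are illustrative).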
418static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
419  SmallVector<VPBasicBlock *> WorkList;
420  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
421           vp_depth_first_deep(Plan.getEntry()))) {
422 // Don't fold the blocks in the skeleton of the Plan into their single
423 // predecessors for now.
424 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
425 if (!VPBB->getParent())
426 continue;
427 auto *PredVPBB =
428 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
429 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
430 isa<VPIRBasicBlock>(PredVPBB))
431 continue;
432 WorkList.push_back(VPBB);
433 }
434
435 for (VPBasicBlock *VPBB : WorkList) {
436 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
437 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
438 R.moveBefore(*PredVPBB, PredVPBB->end());
439 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
440 auto *ParentRegion = VPBB->getParent();
441 if (ParentRegion && ParentRegion->getExiting() == VPBB)
442 ParentRegion->setExiting(PredVPBB);
443 for (auto *Succ : to_vector(VPBB->successors())) {
444      VPBlockUtils::disconnectBlocks(VPBB, Succ);
445      VPBlockUtils::connectBlocks(PredVPBB, Succ);
446 }
447 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
448 }
449 return !WorkList.empty();
450}
451
453 // Convert masked VPReplicateRecipes to if-then region blocks.
454  addReplicateRegions(Plan);
455
456 bool ShouldSimplify = true;
457 while (ShouldSimplify) {
458 ShouldSimplify = sinkScalarOperands(Plan);
459 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
460 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
461 }
462}
463
464/// Remove redundant casts of inductions.
465///
466/// Such redundant casts are casts of induction variables that can be ignored,
467/// because we already proved that the casted phi is equal to the uncasted phi
468/// in the vectorized loop. There is no need to vectorize the cast - the same
469/// value can be used for both the phi and casts in the vector loop.
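///
/// A rough IR-level sketch (names illustrative): given
///   %iv     = phi i32 [ 0, %ph ], [ %iv.next, %loop ]
///   %iv.ext = sext i32 %iv to i64
/// where induction analysis recorded the sext as a redundant cast of %iv, the
/// users of the recipe for %iv.ext are redirected to the widened IV recipe and
/// the now-dead cast recipe is cleaned up later.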
471 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
472    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
473    if (!IV || IV->getTruncInst())
474 continue;
475
476 // A sequence of IR Casts has potentially been recorded for IV, which
477 // *must be bypassed* when the IV is vectorized, because the vectorized IV
478 // will produce the desired casted value. This sequence forms a def-use
479 // chain and is provided in reverse order, ending with the cast that uses
480 // the IV phi. Search for the recipe of the last cast in the chain and
481 // replace it with the original IV. Note that only the final cast is
482 // expected to have users outside the cast-chain and the dead casts left
483 // over will be cleaned up later.
484 auto &Casts = IV->getInductionDescriptor().getCastInsts();
485 VPValue *FindMyCast = IV;
486 for (Instruction *IRCast : reverse(Casts)) {
487 VPSingleDefRecipe *FoundUserCast = nullptr;
488 for (auto *U : FindMyCast->users()) {
489 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
490 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
491 FoundUserCast = UserCast;
492 break;
493 }
494 }
495 FindMyCast = FoundUserCast;
496 }
497 FindMyCast->replaceAllUsesWith(IV);
498 }
499}
500
501/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
502/// recipe, if it exists.
504 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
505 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
506 for (VPUser *U : CanonicalIV->users()) {
507    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
508    if (WidenNewIV)
509 break;
510 }
511
512 if (!WidenNewIV)
513 return;
514
515  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
516  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
517 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
518
519 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
520 continue;
521
522 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
523 // everything WidenNewIV's users need. That is, WidenOriginalIV will
524 // generate a vector phi or all users of WidenNewIV demand the first lane
525 // only.
526 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
527 vputils::onlyFirstLaneUsed(WidenNewIV)) {
528 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
529 WidenNewIV->eraseFromParent();
530 return;
531 }
532 }
533}
534
535/// Returns true if \p R is dead and can be removed.
536static bool isDeadRecipe(VPRecipeBase &R) {
537 // Do remove conditional assume instructions as their conditions may be
538 // flattened.
539 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
540 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
542 if (IsConditionalAssume)
543 return true;
544
545 if (R.mayHaveSideEffects())
546 return false;
547
548 // Recipe is dead if no user keeps the recipe alive.
549 return all_of(R.definedValues(),
550 [](VPValue *V) { return V->getNumUsers() == 0; });
551}
552
555 vp_post_order_deep(Plan.getEntry()))) {
556 // The recipes in the block are processed in reverse order, to catch chains
557 // of dead recipes.
558 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
559 if (isDeadRecipe(R)) {
560 R.eraseFromParent();
561 continue;
562 }
563
564 // Check if R is a dead VPPhi <-> update cycle and remove it.
565 auto *PhiR = dyn_cast<VPPhi>(&R);
566 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
567 continue;
568 VPValue *Incoming = PhiR->getOperand(1);
569 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
570 Incoming->getNumUsers() != 1)
571 continue;
572 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
573 PhiR->eraseFromParent();
574 Incoming->getDefiningRecipe()->eraseFromParent();
575 }
576 }
577}
578
581 Instruction::BinaryOps InductionOpcode,
582 FPMathOperator *FPBinOp, Instruction *TruncI,
583 VPValue *StartV, VPValue *Step, DebugLoc DL,
584 VPBuilder &Builder) {
586 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
587 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
588 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
589
590 // Truncate base induction if needed.
591 VPTypeAnalysis TypeInfo(Plan);
592 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
593 if (TruncI) {
594 Type *TruncTy = TruncI->getType();
595 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
596 "Not truncating.");
597 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
598 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
599 ResultTy = TruncTy;
600 }
601
602 // Truncate step if needed.
603 Type *StepTy = TypeInfo.inferScalarType(Step);
604 if (ResultTy != StepTy) {
605 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
606 "Not truncating.");
607 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
608 auto *VecPreheader =
610 VPBuilder::InsertPointGuard Guard(Builder);
611 Builder.setInsertPoint(VecPreheader);
612 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
613 }
614 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
615 &Plan.getVF(), DL);
616}
617
618static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
619  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
620  for (unsigned I = 0; I != Users.size(); ++I) {
621    VPUser *Cur = Users[I];
622    if (isa<VPHeaderPHIRecipe>(Cur))
623 continue;
624 for (VPValue *V : Cur->definedValues())
625 Users.insert_range(V->users());
626 }
627 return Users.takeVector();
628}
629
630/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
631/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
632/// VPWidenPointerInductionRecipe will generate vectors only. If some users
633/// require vectors while others require scalars, the scalar uses need to extract
634/// the scalars from the generated vectors (Note that this is different to how
635/// int/fp inductions are handled). Legalize extract-from-ends using uniform
636/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
637/// the correct end value is available. Also optimize
638/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
639/// providing them scalar steps built on the canonical scalar IV and updating
640/// the original IV's users. This is an optional optimization to reduce the
641/// need for vector extracts.
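///
/// A rough sketch of the pointer-induction case (recipe notation is
/// approximate, not exact VPlan output): a wide pointer IV whose users only
/// need scalar addresses,
///   WIDEN-POINTER-INDUCTION %ptr.iv = %start, %stride
/// is replaced by scalar steps plus a pointer add,
///   %steps    = SCALAR-STEPS 0, %stride
///   %next.gep = ptradd %start, %steps
/// so no wide vector of pointers has to be materialized.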
644 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
645 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
646 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
647 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
648 if (!PhiR)
649 continue;
650
651 // Try to narrow wide and replicating recipes to uniform recipes, based on
652 // VPlan analysis.
653 // TODO: Apply to all recipes in the future, to replace legacy uniformity
654 // analysis.
655 auto Users = collectUsersRecursively(PhiR);
656 for (VPUser *U : reverse(Users)) {
657 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
658 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
659 // Skip recipes that shouldn't be narrowed.
660 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
661 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
662 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
663 continue;
664
665 // Skip recipes that may have other lanes than their first used.
667 continue;
668
669 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
670 Def->operands(), /*IsUniform*/ true);
671 Clone->insertAfter(Def);
672 Def->replaceAllUsesWith(Clone);
673 }
674
675 // Replace wide pointer inductions which have only their scalars used by
676 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
677 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
678 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
679 continue;
680
681 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
682 VPValue *StartV =
683 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
684 VPValue *StepV = PtrIV->getOperand(1);
686 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
687 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
688
689 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
690 PtrIV->getDebugLoc(), "next.gep");
691
692 PtrIV->replaceAllUsesWith(PtrAdd);
693 continue;
694 }
695
696 // Replace widened induction with scalar steps for users that only use
697 // scalars.
698 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
699 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
700 return U->usesScalars(WideIV);
701 }))
702 continue;
703
704 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
706 Plan, ID.getKind(), ID.getInductionOpcode(),
707 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
708 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
709 WideIV->getDebugLoc(), Builder);
710
711 // Update scalar users of IV to use Step instead.
712 if (!HasOnlyVectorVFs)
713 WideIV->replaceAllUsesWith(Steps);
714 else
715 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
716 return U.usesScalars(WideIV);
717 });
718 }
719}
720
721/// Check if \p VPV is an untruncated wide induction, either before or after the
722/// increment. If so return the header IV (before the increment), otherwise
723/// return null.
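/// E.g. (scalar-loop notation, names illustrative): for
///   %iv      = phi [ %start, %preheader ], [ %iv.next, %latch ]
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are optimizable and the header IV %iv is returned,
/// while a truncated wide IV returns null.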
724static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
725                                                  ScalarEvolution &SE) {
726 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
727 if (WideIV) {
728 // VPV itself is a wide induction, separately compute the end value for exit
729 // users if it is not a truncated IV.
730 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
731 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
732 }
733
734 // Check if VPV is an optimizable induction increment.
735 VPRecipeBase *Def = VPV->getDefiningRecipe();
736 if (!Def || Def->getNumOperands() != 2)
737 return nullptr;
738 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
739 if (!WideIV)
740 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
741 if (!WideIV)
742 return nullptr;
743
744 auto IsWideIVInc = [&]() {
745 auto &ID = WideIV->getInductionDescriptor();
746
747 // Check if VPV increments the induction by the induction step.
748 VPValue *IVStep = WideIV->getStepValue();
749 switch (ID.getInductionOpcode()) {
750 case Instruction::Add:
751 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
752 case Instruction::FAdd:
754 m_Specific(IVStep)));
755 case Instruction::FSub:
756 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
757 m_Specific(IVStep)));
758 case Instruction::Sub: {
759 // IVStep will be the negated step of the subtraction. Check if Step == -1
760 // * IVStep.
761 VPValue *Step;
762 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
763 return false;
764 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
765 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
766 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
767 !isa<SCEVCouldNotCompute>(StepSCEV) &&
768 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
769 }
770 default:
771 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
772 match(VPV, m_GetElementPtr(m_Specific(WideIV),
773 m_Specific(WideIV->getStepValue())));
774 }
775 llvm_unreachable("should have been covered by switch above");
776 };
777 return IsWideIVInc() ? WideIV : nullptr;
778}
779
780/// Attempts to optimize the induction variable exit values for users in the
781/// early exit block.
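/// Roughly, for an early exit taken under mask %M, the exit value becomes
///   CanonicalIV + FirstActiveLane(%M)   (plus 1 if the user reads the
/// incremented IV), mapped through a derived IV when the induction is not the
/// canonical one (a sketch of the computation below, not the exact output).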
782static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
783                                               VPTypeAnalysis &TypeInfo,
784 VPBlockBase *PredVPBB,
785 VPValue *Op,
786 ScalarEvolution &SE) {
787 VPValue *Incoming, *Mask;
790 m_VPValue(Mask)),
792 return nullptr;
793
794 auto *WideIV = getOptimizableIVOf(Incoming, SE);
795 if (!WideIV)
796 return nullptr;
797
798 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
799 if (WideIntOrFp && WideIntOrFp->getTruncInst())
800 return nullptr;
801
802 // Calculate the final index.
803 VPValue *EndValue = Plan.getCanonicalIV();
804 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
805 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
806
807 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
808 VPValue *FirstActiveLane =
809 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
810 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
811 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
812 FirstActiveLaneType, DL);
813 EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
814
815  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
816  // Incoming differs from it, the exit is using the incremented value and we
817  // need to add the step.
818 if (Incoming != WideIV) {
819 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
820 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
821 }
822
823 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
824 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
825 VPValue *Start = WideIV->getStartValue();
826 VPValue *Step = WideIV->getStepValue();
827 EndValue = B.createDerivedIV(
828 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
829 Start, EndValue, Step);
830 }
831
832 return EndValue;
833}
834
835/// Attempts to optimize the induction variable exit values for users in the
836/// exit block coming from the latch in the original scalar loop.
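/// Roughly: the pre-computed end value of the induction is reused directly
/// when the exit reads the incremented IV; otherwise one step is peeled off
/// again via sub, ptradd with a negated step, or fsub/fadd, depending on the
/// induction type (a sketch of the logic below).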
837static VPValue *optimizeLatchExitInductionUser(
838    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
842 return nullptr;
843
844 auto *WideIV = getOptimizableIVOf(Incoming, SE);
845 if (!WideIV)
846 return nullptr;
847
848 VPValue *EndValue = EndValues.lookup(WideIV);
849 assert(EndValue && "end value must have been pre-computed");
850
851  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
852  // Incoming differs from it, the exit is using the incremented value and we
853  // don't need to subtract the step.
854 if (Incoming != WideIV)
855 return EndValue;
856
857 // Otherwise, subtract the step from the EndValue.
858 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
859 VPValue *Step = WideIV->getStepValue();
860 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
861 if (ScalarTy->isIntegerTy())
862 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
863 if (ScalarTy->isPointerTy()) {
864 Type *StepTy = TypeInfo.inferScalarType(Step);
865 auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
866 return B.createPtrAdd(EndValue,
867 B.createNaryOp(Instruction::Sub, {Zero, Step}),
868 DebugLoc::getUnknown(), "ind.escape");
869 }
870 if (ScalarTy->isFloatingPointTy()) {
871 const auto &ID = WideIV->getInductionDescriptor();
872 return B.createNaryOp(
873 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
874 ? Instruction::FSub
875 : Instruction::FAdd,
876 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
877 }
878 llvm_unreachable("all possible induction types must be handled");
879 return nullptr;
880}
881
883 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
884 ScalarEvolution &SE) {
885 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
886 VPTypeAnalysis TypeInfo(Plan);
887 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
888 for (VPRecipeBase &R : ExitVPBB->phis()) {
889 auto *ExitIRI = cast<VPIRPhi>(&R);
890
891 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
892 VPValue *Escape = nullptr;
893 if (PredVPBB == MiddleVPBB)
894 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
895 ExitIRI->getOperand(Idx),
896 EndValues, SE);
897 else
898 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
899 ExitIRI->getOperand(Idx), SE);
900 if (Escape)
901 ExitIRI->setOperand(Idx, Escape);
902 }
903 }
904 }
905}
906
907/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
908/// them with already existing recipes expanding the same SCEV expression.
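/// E.g., if two expansion recipes in the entry block materialize the same
/// expression such as (%n /u 4), the second is replaced by the first and
/// erased (expression shown for illustration only).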
911
912 for (VPRecipeBase &R :
914 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
915 if (!ExpR)
916 continue;
917
918 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
919 if (Inserted)
920 continue;
921 ExpR->replaceAllUsesWith(V->second);
922 ExpR->eraseFromParent();
923 }
924}
925
927 SmallVector<VPValue *> WorkList;
929 WorkList.push_back(V);
930
931 while (!WorkList.empty()) {
932 VPValue *Cur = WorkList.pop_back_val();
933 if (!Seen.insert(Cur).second)
934 continue;
936 if (!R)
937 continue;
938 if (!isDeadRecipe(*R))
939 continue;
940 WorkList.append(R->op_begin(), R->op_end());
941 R->eraseFromParent();
942 }
943}
944
945/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
946/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
947/// foldable live-ins.
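/// For example, an integer add whose operands are the live-in constants 2 and
/// 3 folds to the IR constant 5, which the caller can then register as a
/// live-in VPValue (illustrative; the folding itself is delegated to
/// InstSimplifyFolder).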
948static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
950 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
952 for (VPValue *Op : Operands) {
953 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
954 return nullptr;
955 Ops.push_back(Op->getLiveInIRValue());
956 }
957
958 InstSimplifyFolder Folder(DL);
959 if (Instruction::isBinaryOp(Opcode))
960 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
961 Ops[1]);
962 if (Instruction::isCast(Opcode))
963 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
964 TypeInfo.inferScalarType(R.getVPSingleValue()));
965 switch (Opcode) {
967 return Folder.FoldSelect(Ops[0], Ops[1],
970 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
972 case Instruction::Select:
973 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
974 case Instruction::ICmp:
975 case Instruction::FCmp:
976 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
977 Ops[1]);
978 case Instruction::GetElementPtr: {
979 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
980 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
981 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
982 RFlags.getGEPNoWrapFlags());
983 }
986 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
987 Ops[1],
988 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
989 // An extract of a live-in is an extract of a broadcast, so return the
990 // broadcasted element.
991 case Instruction::ExtractElement:
992 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
993 return Ops[0];
994 }
995 return nullptr;
996}
997
998/// Try to simplify recipe \p R.
999static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
1000 VPlan *Plan = R.getParent()->getPlan();
1001
1002 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1003 if (!Def)
1004 return;
1005
1006 // Simplification of live-in IR values for SingleDef recipes using
1007 // InstSimplifyFolder.
1011 const DataLayout &DL =
1013 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1014 TypeInfo);
1015 if (V)
1016 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1017 return V;
1018 })
1019 .Default([](auto *) { return false; }))
1020 return;
1021
1022 // Fold PredPHI LiveIn -> LiveIn.
1023 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1024 VPValue *Op = PredPHI->getOperand(0);
1025 if (Op->isLiveIn())
1026 PredPHI->replaceAllUsesWith(Op);
1027 }
1028
1029 VPValue *A;
1030 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1031 Type *TruncTy = TypeInfo.inferScalarType(Def);
1032 Type *ATy = TypeInfo.inferScalarType(A);
1033 if (TruncTy == ATy) {
1034 Def->replaceAllUsesWith(A);
1035 } else {
1036 // Don't replace a scalarizing recipe with a widened cast.
1037 if (isa<VPReplicateRecipe>(Def))
1038 return;
1039 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1040
1041 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1042 ? Instruction::SExt
1043 : Instruction::ZExt;
1044 auto *VPC =
1045 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1046 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1047 // UnderlyingExt has distinct return type, used to retain legacy cost.
1048 VPC->setUnderlyingValue(UnderlyingExt);
1049 }
1050 VPC->insertBefore(&R);
1051 Def->replaceAllUsesWith(VPC);
1052 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1053 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1054 VPC->insertBefore(&R);
1055 Def->replaceAllUsesWith(VPC);
1056 }
1057 }
1058#ifndef NDEBUG
1059    // Verify that the cached type info for both A and its users is still
1060    // accurate by comparing it to freshly computed types.
1061 VPTypeAnalysis TypeInfo2(*Plan);
1062 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1063 for (VPUser *U : A->users()) {
1064 auto *R = cast<VPRecipeBase>(U);
1065 for (VPValue *VPV : R->definedValues())
1066 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1067 }
1068#endif
1069 }
1070
1071 // Simplify (X && Y) || (X && !Y) -> X.
1072 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1073 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1074 // recipes to be visited during simplification.
1075 VPValue *X, *Y, *Z;
1076 if (match(Def,
1079 Def->replaceAllUsesWith(X);
1080 Def->eraseFromParent();
1081 return;
1082 }
1083
1084 // x | 1 -> 1
1085 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1086 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1087
1088 // x | 0 -> x
1089 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1090 return Def->replaceAllUsesWith(X);
1091
1092 // x & 0 -> 0
1093 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1094 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1095
1096 // x && false -> false
1097 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1098 return Def->replaceAllUsesWith(Def->getOperand(1));
1099
1100 // (x && y) || (x && z) -> x && (y || z)
1101 VPBuilder Builder(Def);
1104 // Simplify only if one of the operands has one use to avoid creating an
1105 // extra recipe.
1106 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1107 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1108 return Def->replaceAllUsesWith(
1109 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1110
1111 // x && !x -> 0
1113 return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
1115
1116 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1117 return Def->replaceAllUsesWith(X);
1118
1119 // select !c, x, y -> select c, y, x
1120 VPValue *C;
1121 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1122 Def->setOperand(0, C);
1123 Def->setOperand(1, Y);
1124 Def->setOperand(2, X);
1125 return;
1126 }
1127
1128 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1129 // tail folding it is likely that x is a header mask and can be simplified
1130 // further.
1132 m_VPValue(Z))) &&
1133 X->hasMoreThanOneUniqueUser())
1134 return Def->replaceAllUsesWith(
1135 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1136
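  // x * 1 -> x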
1137 if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
1138 return Def->replaceAllUsesWith(A);
1139
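  // x * 0 -> 0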
1140 if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0))))
1141 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1142 : R.getOperand(0));
1143
1144 if (match(Def, m_Not(m_VPValue(A)))) {
1145 if (match(A, m_Not(m_VPValue(A))))
1146 return Def->replaceAllUsesWith(A);
1147
1148 // Try to fold Not into compares by adjusting the predicate in-place.
1149 CmpPredicate Pred;
1150 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1151 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1152 if (all_of(Cmp->users(), [&Cmp](VPUser *U) {
1153 return match(U, m_CombineOr(m_Not(m_Specific(Cmp)),
1154 m_Select(m_Specific(Cmp), m_VPValue(),
1155 m_VPValue())));
1156 })) {
1157 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1158 for (VPUser *U : to_vector(Cmp->users())) {
1159 auto *R = cast<VPSingleDefRecipe>(U);
1160 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1161 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1162 R->setOperand(1, Y);
1163 R->setOperand(2, X);
1164 } else {
1165 // not (cmp pred) -> cmp inv_pred
1166 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1167 R->replaceAllUsesWith(Cmp);
1168 }
1169 }
1170 // If Cmp doesn't have a debug location, use the one from the negation,
1171 // to preserve the location.
1172 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1173 Cmp->setDebugLoc(R.getDebugLoc());
1174 }
1175 }
1176 }
1177
1178  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1179 if ((match(Def,
1181 match(Def,
1183 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1184 TypeInfo.inferScalarType(Def))
1185 return Def->replaceAllUsesWith(Def->getOperand(1));
1186
1188 m_VPValue(X), m_SpecificInt(1)))) {
1189 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1190 if (TypeInfo.inferScalarType(X) != WideStepTy)
1191 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1192 Def->replaceAllUsesWith(X);
1193 return;
1194 }
1195
1196 // For i1 vp.merges produced by AnyOf reductions:
1197 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1199 m_VPValue(X), m_VPValue())) &&
1201 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1202 Def->setOperand(1, Def->getOperand(0));
1203 Def->setOperand(0, Y);
1204 return;
1205 }
1206
1207 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1208 if (Phi->getOperand(0) == Phi->getOperand(1))
1209 Def->replaceAllUsesWith(Phi->getOperand(0));
1210 return;
1211 }
1212
1213 // Look through ExtractLastElement (BuildVector ....).
1215 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1216 Def->replaceAllUsesWith(
1217 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1218 return;
1219 }
1220
1221 // Look through ExtractPenultimateElement (BuildVector ....).
1223 m_BuildVector()))) {
1224 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1225 Def->replaceAllUsesWith(
1226 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1227 return;
1228 }
1229
1230 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1231 if (Phi->getNumOperands() == 1)
1232 Phi->replaceAllUsesWith(Phi->getOperand(0));
1233 return;
1234 }
1235
1236 // Some simplifications can only be applied after unrolling. Perform them
1237 // below.
1238 if (!Plan->isUnrolled())
1239 return;
1240
1241 // VPVectorPointer for part 0 can be replaced by their start pointer.
1242 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1243 if (VecPtr->isFirstPart()) {
1244 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1245 return;
1246 }
1247 }
1248
1249 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1250 // the first lane is demanded.
1251 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1252 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1253 Steps->replaceAllUsesWith(Steps->getOperand(0));
1254 return;
1255 }
1256 }
1257 // Simplify redundant ReductionStartVector recipes after unrolling.
1258 VPValue *StartV;
1260 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1261 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1262 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1263 return PhiR && PhiR->isInLoop();
1264 });
1265 return;
1266 }
1267
1269 Def->replaceAllUsesWith(A);
1270 return;
1271 }
1272
1273 if (match(Def,
1275 vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) {
1276 return U->usesScalars(A) || Def == U;
1277 })) {
1278 return Def->replaceAllUsesWith(A);
1279 }
1280}
1281
1284 Plan.getEntry());
1285 VPTypeAnalysis TypeInfo(Plan);
1287 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1288 simplifyRecipe(R, TypeInfo);
1289 }
1290 }
1291}
1292
1294 if (Plan.hasScalarVFOnly())
1295 return;
1296
1297 // Try to narrow wide and replicating recipes to single scalar recipes,
1298 // based on VPlan analysis. Only process blocks in the loop region for now,
1299 // without traversing into nested regions, as recipes in replicate regions
1300 // cannot be converted yet.
1303 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1305 continue;
1306 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1307 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1308 continue;
1309
1310 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1311 // Skip recipes that aren't single scalars or don't have only their
1312 // scalar results used. In the latter case, we would introduce extra
1313 // broadcasts.
1314 if (!vputils::isSingleScalar(RepOrWidenR) ||
1315 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1316 return U->usesScalars(RepOrWidenR) ||
1317 match(cast<VPRecipeBase>(U),
1318 m_ExtractLastElement(m_VPValue()));
1319 }))
1320 continue;
1321
1322 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1323 RepOrWidenR->operands(),
1324 true /*IsSingleScalar*/);
1325 Clone->insertBefore(RepOrWidenR);
1326 RepOrWidenR->replaceAllUsesWith(Clone);
1327 }
1328 }
1329}
1330
1331/// Try to see if all of \p Blend's masks share a common value logically and'ed
1332/// and remove it from the masks.
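/// E.g., incoming masks (logical-and %c, %m0) and (logical-and %c, %m1) are
/// rewritten to use %m0 and %m1 directly (names illustrative).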
1334 if (Blend->isNormalized())
1335 return;
1336 VPValue *CommonEdgeMask;
1337 if (!match(Blend->getMask(0),
1338 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1339 return;
1340 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1341 if (!match(Blend->getMask(I),
1342 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1343 return;
1344 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1345 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1346}
1347
1348/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1349/// to make sure the masks are simplified.
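/// E.g., a blend whose incoming values all collapse to one unique value is
/// replaced by that value; otherwise the blend is normalized so that one
/// incoming value (whose mask can then be dead-coded) becomes the initial
/// value the rest are blended into (a sketch of the effect, not exact output).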
1350static void simplifyBlends(VPlan &Plan) {
1353 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1354 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1355 if (!Blend)
1356 continue;
1357
1358 removeCommonBlendMask(Blend);
1359
1360 // Try to remove redundant blend recipes.
1361 SmallPtrSet<VPValue *, 4> UniqueValues;
1362 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1363 UniqueValues.insert(Blend->getIncomingValue(0));
1364 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1365 if (!match(Blend->getMask(I), m_False()))
1366 UniqueValues.insert(Blend->getIncomingValue(I));
1367
1368 if (UniqueValues.size() == 1) {
1369 Blend->replaceAllUsesWith(*UniqueValues.begin());
1370 Blend->eraseFromParent();
1371 continue;
1372 }
1373
1374 if (Blend->isNormalized())
1375 continue;
1376
1377 // Normalize the blend so its first incoming value is used as the initial
1378 // value with the others blended into it.
1379
1380 unsigned StartIndex = 0;
1381 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1382        // If a value's mask is used only by the blend then it can be deadcoded.
1383 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1384 // that's used by multiple blends where it can be removed from them all.
1385 VPValue *Mask = Blend->getMask(I);
1386 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1387 StartIndex = I;
1388 break;
1389 }
1390 }
1391
1392 SmallVector<VPValue *, 4> OperandsWithMask;
1393 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1394
1395 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1396 if (I == StartIndex)
1397 continue;
1398 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1399 OperandsWithMask.push_back(Blend->getMask(I));
1400 }
1401
1402 auto *NewBlend =
1403 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1404 OperandsWithMask, Blend->getDebugLoc());
1405 NewBlend->insertBefore(&R);
1406
1407 VPValue *DeadMask = Blend->getMask(StartIndex);
1408 Blend->replaceAllUsesWith(NewBlend);
1409 Blend->eraseFromParent();
1411
1412 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1413 VPValue *NewMask;
1414 if (NewBlend->getNumOperands() == 3 &&
1415 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1416 VPValue *Inc0 = NewBlend->getOperand(0);
1417 VPValue *Inc1 = NewBlend->getOperand(1);
1418 VPValue *OldMask = NewBlend->getOperand(2);
1419 NewBlend->setOperand(0, Inc1);
1420 NewBlend->setOperand(1, Inc0);
1421 NewBlend->setOperand(2, NewMask);
1422 if (OldMask->getNumUsers() == 0)
1423 cast<VPInstruction>(OldMask)->eraseFromParent();
1424 }
1425 }
1426 }
1427}
1428
1429/// Optimize the width of vector induction variables in \p Plan based on a known
1430/// constant Trip Count, \p BestVF and \p BestUF.
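/// For example (illustrative numbers): with a trip count of 1000, VF = 4 and
/// UF = 2, the trip count aligned to VF * UF is 1000, so the largest IV value
/// 999 needs 10 active bits; rounded up to a power of two (minimum 8) the
/// canonical wide IV can be narrowed to i16.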
1432 ElementCount BestVF,
1433 unsigned BestUF) {
1434 // Only proceed if we have not completely removed the vector region.
1435 if (!Plan.getVectorLoopRegion())
1436 return false;
1437
1438 if (!Plan.getTripCount()->isLiveIn())
1439 return false;
1442 if (!TC || !BestVF.isFixed())
1443 return false;
1444
1445 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1446 // and UF. Returns at least 8.
1447 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1448 APInt AlignedTC =
1451 APInt MaxVal = AlignedTC - 1;
1452 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1453 };
1454 unsigned NewBitWidth =
1455 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1456
1457 LLVMContext &Ctx = Plan.getContext();
1458 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1459
1460 bool MadeChange = false;
1461
1462 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1463 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1464 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1465
1466    // Currently only handle canonical IVs, as it is trivial to replace the
1467    // start and stop values, and only perform the optimization when the IV
1468    // has a single use.
1469 if (!WideIV || !WideIV->isCanonical() ||
1470 WideIV->hasMoreThanOneUniqueUser() ||
1471 NewIVTy == WideIV->getScalarType())
1472 continue;
1473
1474 // Currently only handle cases where the single user is a header-mask
1475 // comparison with the backedge-taken-count.
1476 if (!match(*WideIV->user_begin(),
1477 m_ICmp(m_Specific(WideIV),
1480 continue;
1481
1482 // Update IV operands and comparison bound to use new narrower type.
1483 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1484 WideIV->setStartValue(NewStart);
1485 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1486 WideIV->setStepValue(NewStep);
1487
1488 auto *NewBTC = new VPWidenCastRecipe(
1489 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1490 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1491 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1492 Cmp->setOperand(1, NewBTC);
1493
1494 MadeChange = true;
1495 }
1496
1497 return MadeChange;
1498}
1499
1500/// Return true if \p Cond is known to be true for given \p BestVF and \p
1501/// BestUF.
1503 ElementCount BestVF, unsigned BestUF,
1504 ScalarEvolution &SE) {
1506 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1507 &SE](VPValue *C) {
1508 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1509 });
1510
1511 auto *CanIV = Plan.getCanonicalIV();
1513 m_Specific(CanIV->getBackedgeValue()),
1514 m_Specific(&Plan.getVectorTripCount()))))
1515 return false;
1516
1517 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1518 // count is not conveniently available as SCEV so far, so we compare directly
1519 // against the original trip count. This is stricter than necessary, as we
1520 // will only return true if the trip count == vector trip count.
1521 const SCEV *VectorTripCount =
1523 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1524 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1525 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1526 "Trip count SCEV must be computable");
1527 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1528 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1529 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1530}
1531
1532/// Try to replace multiple active lane masks used for control flow with
1533/// a single, wide active lane mask instruction followed by multiple
1534/// extract subvector intrinsics. This applies to the active lane mask
1535/// instructions both in the loop and in the preheader.
1536/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1537/// new extracts from the first active lane mask, which has its last
1538/// operand (multiplier) set to UF.
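/// Sketch for UF = 2 (intrinsic/recipe names approximate): the two per-part
/// get.active.lane.mask calls are replaced by a single call producing VF * 2
/// lanes, followed by two llvm.vector.extract calls at offsets 0 and VF that
/// feed the two active-lane-mask phis and the latch branch.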
1540 unsigned UF) {
1541 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1542 return false;
1543
1544 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1545 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1546 auto *Term = &ExitingVPBB->back();
1547
1548 using namespace llvm::VPlanPatternMatch;
1550 m_VPValue(), m_VPValue(), m_VPValue())))))
1551 return false;
1552
1553 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1554 LLVMContext &Ctx = Plan.getContext();
1555
1556 auto ExtractFromALM = [&](VPInstruction *ALM,
1557 SmallVectorImpl<VPValue *> &Extracts) {
1558 DebugLoc DL = ALM->getDebugLoc();
1559 for (unsigned Part = 0; Part < UF; ++Part) {
1561 Ops.append({ALM, Plan.getOrAddLiveIn(
1562 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1563 VF.getKnownMinValue() * Part))});
1564 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1566 Extracts[Part] = Ext;
1567 Ext->insertAfter(ALM);
1568 }
1569 };
1570
1571 // Create a list of each active lane mask phi, ordered by unroll part.
1573 for (VPRecipeBase &R : Header->phis()) {
1575 if (!Phi)
1576 continue;
1577 VPValue *Index = nullptr;
1578 match(Phi->getBackedgeValue(),
1580 assert(Index && "Expected index from ActiveLaneMask instruction");
1581
1582 auto *II = dyn_cast<VPInstruction>(Index);
1583 if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
1584 auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
1585 Phis[Part->getZExtValue()] = Phi;
1586 } else
1587 // Anything other than a CanonicalIVIncrementForPart is part 0
1588 Phis[0] = Phi;
1589 }
1590
1591 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1592 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1593
1594 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1595 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1596
1597 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1598 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1599 "Expected incoming values of Phi to be ActiveLaneMasks");
1600
1601 // When using wide lane masks, the return type of the get.active.lane.mask
1602 // intrinsic is VF x UF (last operand).
1603 VPValue *ALMMultiplier =
1604 Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
1605 EntryALM->setOperand(2, ALMMultiplier);
1606 LoopALM->setOperand(2, ALMMultiplier);
1607
1608 // Create UF x extract vectors and insert into preheader.
1609 SmallVector<VPValue *> EntryExtracts(UF);
1610 ExtractFromALM(EntryALM, EntryExtracts);
1611
1612 // Create UF x extract vectors and insert before the loop compare & branch,
1613 // updating the compare to use the first extract.
1614 SmallVector<VPValue *> LoopExtracts(UF);
1615 ExtractFromALM(LoopALM, LoopExtracts);
1616 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1617 Not->setOperand(0, LoopExtracts[0]);
1618
1619 // Update the incoming values of active lane mask phis.
1620 for (unsigned Part = 0; Part < UF; ++Part) {
1621 Phis[Part]->setStartValue(EntryExtracts[Part]);
1622 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1623 }
1624
1625 return true;
1626}
1627
1628/// Try to simplify the branch condition of \p Plan. This may restrict the
1629/// resulting plan to \p BestVF and \p BestUF.
1630static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1631 unsigned BestUF,
1632 PredicatedScalarEvolution &PSE) {
1633 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1634 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1635 auto *Term = &ExitingVPBB->back();
1636 VPValue *Cond;
1637 ScalarEvolution &SE = *PSE.getSE();
1638 if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1639 match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1640 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1641 // Try to simplify the branch condition if TC <= VF * UF when the latch
1642 // terminator is BranchOnCount or BranchOnCond where the input is
1643 // Not(ActiveLaneMask).
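    // For example (illustrative): with TC = 10, VF = 8 and UF = 2, the vector
    // loop covers 16 >= 10 elements, so the vector body executes exactly once
    // and the latch branch can be simplified away.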
1644 const SCEV *TripCount =
1646 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1647 "Trip count SCEV must be computable");
1648 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1649 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1650 if (TripCount->isZero() ||
1651 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1652 return false;
1653 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1654 // For BranchOnCond, check if we can prove the condition to be true using VF
1655 // and UF.
1656 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1657 return false;
1658 } else {
1659 return false;
1660 }
1661
1662 // The vector loop region only executes once. If possible, completely remove
1663 // the region; otherwise replace the terminator controlling the latch with
1664 // (BranchOnCond true).
1665 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1666 // support for other non-canonical widen induction recipes (e.g.,
1667 // VPWidenPointerInductionRecipe).
1668 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1669 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1670 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1671 return R->isCanonical();
1672 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1673 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1674 })) {
1675 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1676 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1677 VPBuilder Builder(Plan.getVectorPreheader());
1678 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1679 R->getScalarType());
1680 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1681 HeaderR.eraseFromParent();
1682 continue;
1683 }
1684 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1685 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1686 HeaderR.eraseFromParent();
1687 }
1688
1689 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1690 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1691 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1692 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1693
1694 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1695 B->setParent(nullptr);
1696
1697 VPBlockUtils::connectBlocks(Preheader, Header);
1698 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1700 } else {
1701 // The vector region contains header phis for which we cannot remove the
1702 // loop region yet.
1703 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1704 Term->getDebugLoc());
1705 ExitingVPBB->appendRecipe(BOC);
1706 }
1707
1708 Term->eraseFromParent();
1709
1710 return true;
1711}
1712
1713void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1714 unsigned BestUF,
1715 PredicatedScalarEvolution &PSE) {
1716 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1717 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1718
1719 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1720 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1721 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1722
1723 if (MadeChange) {
1724 Plan.setVF(BestVF);
1725 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1726 }
1727 // TODO: Further simplifications are possible
1728 // 1. Replace inductions with constants.
1729 // 2. Replace vector loop region with VPBasicBlock.
1730}
1731
1732/// Sink users of \p FOR after the recipe defining the previous value \p
1733/// Previous of the recurrence. \returns true if all users of \p FOR could be
1734/// re-arranged as needed or false if it is not possible.
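/// Illustrative sketch: given
///   %for  = first-order-recurrence-phi [ %init, ph ], [ %prev, body ]
///   %use  = add %for, 1   ; user of the recurrence
///   %prev = load %ptr     ; defines the previous value
/// %use must be moved after %prev so the splice introduced later can read
/// both the phi and the previous value; sinking fails if a user has side
/// effects or if %prev depends on one of the users.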
1735static bool
1736sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1737 VPRecipeBase *Previous,
1738 VPDominatorTree &VPDT) {
1739 // Collect recipes that need sinking.
1740 SmallVector<VPRecipeBase *> WorkList;
1741 SmallPtrSet<VPRecipeBase *, 8> Seen;
1742 Seen.insert(Previous);
1743 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1744 // The previous value must not depend on the users of the recurrence phi. In
1745 // that case, FOR is not a fixed order recurrence.
1746 if (SinkCandidate == Previous)
1747 return false;
1748
1749 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1750 !Seen.insert(SinkCandidate).second ||
1751 VPDT.properlyDominates(Previous, SinkCandidate))
1752 return true;
1753
1754 if (SinkCandidate->mayHaveSideEffects())
1755 return false;
1756
1757 WorkList.push_back(SinkCandidate);
1758 return true;
1759 };
1760
1761 // Recursively sink users of FOR after Previous.
1762 WorkList.push_back(FOR);
1763 for (unsigned I = 0; I != WorkList.size(); ++I) {
1764 VPRecipeBase *Current = WorkList[I];
1765 assert(Current->getNumDefinedValues() == 1 &&
1766 "only recipes with a single defined value expected");
1767
1768 for (VPUser *User : Current->getVPSingleValue()->users()) {
1769 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1770 return false;
1771 }
1772 }
1773
1774 // Keep recipes to sink ordered by dominance so earlier instructions are
1775 // processed first.
1776 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1777 return VPDT.properlyDominates(A, B);
1778 });
1779
1780 for (VPRecipeBase *SinkCandidate : WorkList) {
1781 if (SinkCandidate == FOR)
1782 continue;
1783
1784 SinkCandidate->moveAfter(Previous);
1785 Previous = SinkCandidate;
1786 }
1787 return true;
1788}
1789
1790/// Try to hoist \p Previous and its operands before all users of \p FOR.
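/// Illustrative sketch: if a user of the recurrence cannot be sunk, e.g.
///   %use  = store %for, %ptr   ; has side effects, cannot be sunk
///   %prev = add %x, %y         ; previous value, no side effects
/// then %prev (and any of its operands defined inside the loop) may instead
/// be hoisted before %use, provided nothing in that chain reads or writes
/// memory.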
1791static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1792 VPRecipeBase *Previous,
1793 VPDominatorTree &VPDT) {
1794 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1795 return false;
1796
1797 // Collect recipes that need hoisting.
1798 SmallVector<VPRecipeBase *> HoistCandidates;
1800 VPRecipeBase *HoistPoint = nullptr;
1801 // Find the closest hoist point by looking at all users of FOR and selecting
1802 // the recipe dominating all other users.
1803 for (VPUser *U : FOR->users()) {
1804 auto *R = cast<VPRecipeBase>(U);
1805 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1806 HoistPoint = R;
1807 }
1808 assert(all_of(FOR->users(),
1809 [&VPDT, HoistPoint](VPUser *U) {
1810 auto *R = cast<VPRecipeBase>(U);
1811 return HoistPoint == R ||
1812 VPDT.properlyDominates(HoistPoint, R);
1813 }) &&
1814 "HoistPoint must dominate all users of FOR");
1815
1816 auto NeedsHoisting = [HoistPoint, &VPDT,
1817 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1818 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1819 if (!HoistCandidate)
1820 return nullptr;
1821 VPRegionBlock *EnclosingLoopRegion =
1822 HoistCandidate->getParent()->getEnclosingLoopRegion();
1823 assert((!HoistCandidate->getParent()->getParent() ||
1824 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1825 "CFG in VPlan should still be flat, without replicate regions");
1826 // Hoist candidate was already visited, no need to hoist.
1827 if (!Visited.insert(HoistCandidate).second)
1828 return nullptr;
1829
1830 // The candidate is outside the loop region or is a header phi; it dominates
1831 // the users of FOR without hoisting.
1832 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1833 return nullptr;
1834
1835 // If we reached a recipe that dominates HoistPoint, we don't need to
1836 // hoist the recipe.
1837 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1838 return nullptr;
1839 return HoistCandidate;
1840 };
1841 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1842 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1843 // associated dependencies.
1844 return !HoistCandidate->mayHaveSideEffects();
1845 };
1846
1847 if (!NeedsHoisting(Previous->getVPSingleValue()))
1848 return true;
1849
1850 // Recursively try to hoist Previous and its operands before all users of FOR.
1851 HoistCandidates.push_back(Previous);
1852
1853 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1854 VPRecipeBase *Current = HoistCandidates[I];
1855 assert(Current->getNumDefinedValues() == 1 &&
1856 "only recipes with a single defined value expected");
1857 if (!CanHoist(Current))
1858 return false;
1859
1860 for (VPValue *Op : Current->operands()) {
1861 // If we reach FOR, it means the original Previous depends on some other
1862 // recurrence that in turn depends on FOR. If that is the case, we would
1863 // also need to hoist recipes involving the other FOR, which may break
1864 // dependencies.
1865 if (Op == FOR)
1866 return false;
1867
1868 if (auto *R = NeedsHoisting(Op))
1869 HoistCandidates.push_back(R);
1870 }
1871 }
1872
1873 // Order recipes to hoist by dominance so earlier instructions are processed
1874 // first.
1875 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1876 return VPDT.properlyDominates(A, B);
1877 });
1878
1879 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1880 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1881 HoistPoint->getIterator());
1882 }
1883
1884 return true;
1885}
1886
1887bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1888 VPBuilder &LoopBuilder) {
1889 VPDominatorTree VPDT;
1890 VPDT.recalculate(Plan);
1891
1893 for (VPRecipeBase &R :
1896 RecurrencePhis.push_back(FOR);
1897
1898 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1900 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1901 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1902 // to terminate.
1903 while (auto *PrevPhi =
1905 assert(PrevPhi->getParent() == FOR->getParent());
1906 assert(SeenPhis.insert(PrevPhi).second);
1907 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1908 }
1909
1910 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1911 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1912 return false;
1913
1914 // Introduce a recipe to combine the incoming and previous values of a
1915 // fixed-order recurrence.
1916 VPBasicBlock *InsertBlock = Previous->getParent();
1917 if (isa<VPHeaderPHIRecipe>(Previous))
1918 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1919 else
1920 LoopBuilder.setInsertPoint(InsertBlock,
1921 std::next(Previous->getIterator()));
1922
1923 auto *RecurSplice =
1924 LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
1925 {FOR, FOR->getBackedgeValue()});
1926
1927 FOR->replaceAllUsesWith(RecurSplice);
1928 // Set the first operand of RecurSplice to FOR again, after replacing
1929 // all users.
1930 RecurSplice->setOperand(0, FOR);
1931 }
1932 return true;
1933}
1934
1936 for (VPRecipeBase &R :
1938 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
1939 if (!PhiR)
1940 continue;
1941 RecurKind RK = PhiR->getRecurrenceKind();
1942 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
1944 continue;
1945
1946 for (VPUser *U : collectUsersRecursively(PhiR))
1947 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
1948 RecWithFlags->dropPoisonGeneratingFlags();
1949 }
1950 }
1951}
1952
1953namespace {
1954struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1955 static bool isSentinel(const VPSingleDefRecipe *Def) {
1956 return Def == getEmptyKey() || Def == getTombstoneKey();
1957 }
1958
1959 /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1960 /// Returns an optional pair, where the first element indicates whether it is
1961 /// an intrinsic ID.
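  /// For example (illustrative): a widened add yields
  /// {false, Instruction::Add}, a widened llvm.smax call yields
  /// {true, Intrinsic::smax}, and recipes without embedded opcode data yield
  /// std::nullopt.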
1962 static std::optional<std::pair<bool, unsigned>>
1963 getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1964 return TypeSwitch<const VPSingleDefRecipe *,
1965 std::optional<std::pair<bool, unsigned>>>(R)
1968 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1969 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1970 return std::make_pair(true, I->getVectorIntrinsicID());
1971 })
1972 .Default([](auto *) { return std::nullopt; });
1973 }
1974
1975 /// Returns true if recipe \p Def can be safely handled for CSE.
1976 static bool canHandle(const VPSingleDefRecipe *Def) {
1977 // We can extend the list of handled recipes in the future,
1978 // provided we account for the data embedded in them while checking for
1979 // equality or hashing.
1980 auto C = getOpcodeOrIntrinsicID(Def);
1981
1982 // The issue with (Insert|Extract)Value is that the index of the
1983 // insert/extract is not a proper operand in LLVM IR, and hence also not in
1984 // VPlan.
1985 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
1986 C->second == Instruction::ExtractValue)))
1987 return false;
1988
1989 // During CSE, we can only handle recipes that don't read from memory: if
1990 // they read from memory, there could be an intervening write to memory
1991 // before the next instance is CSE'd, leading to an incorrect result.
1992 return !Def->mayReadFromMemory();
1993 }
1994
1995 /// Hash the underlying data of \p Def.
1996 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
1997 const VPlan *Plan = Def->getParent()->getPlan();
1998 VPTypeAnalysis TypeInfo(*Plan);
1999 hash_code Result = hash_combine(
2000 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2001 TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
2002 hash_combine_range(Def->operands()));
2003 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2004 if (RFlags->hasPredicate())
2005 return hash_combine(Result, RFlags->getPredicate());
2006 return Result;
2007 }
2008
2009 /// Check equality of underlying data of \p L and \p R.
2010 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2011 if (isSentinel(L) || isSentinel(R))
2012 return L == R;
2013 if (L->getVPDefID() != R->getVPDefID() ||
2014 getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
2016 !equal(L->operands(), R->operands()))
2017 return false;
2018 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2019 if (LFlags->hasPredicate() &&
2020 LFlags->getPredicate() !=
2021 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2022 return false;
2023 const VPlan *Plan = L->getParent()->getPlan();
2024 VPTypeAnalysis TypeInfo(*Plan);
2025 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2026 }
2027};
2028} // end anonymous namespace
2029
2030/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2031/// Plan.
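/// Illustrative sketch: two recipes computing the same expression, e.g.
///   %a = add nuw %x, %y
///   ...
///   %b = add %x, %y
/// are merged so that users of %b use %a instead, keeping only the flags
/// common to both (here the nuw is dropped), provided %a dominates %b.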
2033 VPDominatorTree VPDT(Plan);
2035
2037 vp_depth_first_deep(Plan.getEntry()))) {
2038 for (VPRecipeBase &R : *VPBB) {
2039 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2040 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2041 continue;
2042 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2043 // V must dominate Def for a valid replacement.
2044 if (!VPDT.dominates(V->getParent(), VPBB))
2045 continue;
2046 // Only keep flags present on both V and Def.
2047 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2048 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2049 Def->replaceAllUsesWith(V);
2050 continue;
2051 }
2052 CSEMap[Def] = Def;
2053 }
2054 }
2055}
2056
2057/// Move loop-invariant recipes out of the vector loop region in \p Plan.
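/// Illustrative sketch: a recipe such as
///   %inv = mul %live.in.a, %live.in.b
/// whose operands are all defined outside the loop region, and which neither
/// reads nor writes memory, is moved from the vector loop body into the
/// vector preheader so it executes only once.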
2058static void licm(VPlan &Plan) {
2059 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2060
2061 // Return true if we do not know how to (mechanically) hoist a given recipe
2062 // out of a loop region. Does not address legality concerns such as aliasing
2063 // or speculation safety.
2064 auto CannotHoistRecipe = [](VPRecipeBase &R) {
2065 // Allocas cannot be hoisted.
2066 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2067 return RepR && RepR->getOpcode() == Instruction::Alloca;
2068 };
2069
2070 // Hoist any loop invariant recipes from the vector loop region to the
2071 // preheader. Perform a shallow traversal of the vector loop region, to
2072 // exclude recipes in replicate regions.
2073 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2075 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2076 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2077 if (CannotHoistRecipe(R))
2078 continue;
2079 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
2080 // their memory location is not modified in the vector loop.
2081 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
2082 any_of(R.operands(), [](VPValue *Op) {
2083 return !Op->isDefinedOutsideLoopRegions();
2084 }))
2085 continue;
2086 R.moveBefore(*Preheader, Preheader->end());
2087 }
2088 }
2089}
2090
2092 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2093 // Keep track of created truncates, so they can be re-used. Note that we
2094 // cannot use RAUW after creating a new truncate, as this could make
2095 // other uses have different types for their operands, making them invalidly
2096 // typed.
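  // For example (illustrative): if %v is used both by a narrowed recipe and
  // by one that must keep the original width, RAUW'ing %v with trunc(%v)
  // would also change the operand type seen by the second user; instead each
  // narrowed operand gets its own (cached) truncate.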
2098 VPTypeAnalysis TypeInfo(Plan);
2099 VPBasicBlock *PH = Plan.getVectorPreheader();
2102 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2105 &R))
2106 continue;
2107
2108 VPValue *ResultVPV = R.getVPSingleValue();
2109 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2110 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2111 if (!NewResSizeInBits)
2112 continue;
2113
2114 // If the value wasn't vectorized, we must maintain the original scalar
2115 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2116 // skip casts which do not need to be handled explicitly here, as
2117 // redundant casts will be removed during recipe simplification.
2119 continue;
2120
2121 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2122 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2123 assert(OldResTy->isIntegerTy() && "only integer types supported");
2124 (void)OldResSizeInBits;
2125
2126 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2127
2128 // Any wrapping introduced by shrinking this operation shouldn't be
2129 // considered undefined behavior. So, we can't unconditionally copy
2130 // arithmetic wrapping flags to VPW.
2131 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2132 VPW->dropPoisonGeneratingFlags();
2133
2134 if (OldResSizeInBits != NewResSizeInBits &&
2135 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2136 // Extend result to original width.
2137 auto *Ext =
2138 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2139 Ext->insertAfter(&R);
2140 ResultVPV->replaceAllUsesWith(Ext);
2141 Ext->setOperand(0, ResultVPV);
2142 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2143 } else {
2144 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2145 "Only ICmps should not need extending the result.");
2146 }
2147
2148 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2150 continue;
2151
2152 // Shrink operands by introducing truncates as needed.
2153 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2154 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2155 auto *Op = R.getOperand(Idx);
2156 unsigned OpSizeInBits =
2158 if (OpSizeInBits == NewResSizeInBits)
2159 continue;
2160 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2161 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2162 VPWidenCastRecipe *NewOp =
2163 IterIsEmpty
2164 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
2165 : ProcessedIter->second;
2166 R.setOperand(Idx, NewOp);
2167 if (!IterIsEmpty)
2168 continue;
2169 ProcessedIter->second = NewOp;
2170 if (!Op->isLiveIn()) {
2171 NewOp->insertBefore(&R);
2172 } else {
2173 PH->appendRecipe(NewOp);
2174 }
2175 }
2176
2177 }
2178 }
2179}
2180
2184 VPValue *Cond;
2185 if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
2186 !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2187 continue;
2188
2189 unsigned RemovedIdx;
2190 if (match(Cond, m_True()))
2191 RemovedIdx = 1;
2192 else if (match(Cond, m_False()))
2193 RemovedIdx = 0;
2194 else
2195 continue;
2196
2197 VPBasicBlock *RemovedSucc =
2198 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2199 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2200 "There must be a single edge between VPBB and its successor");
2201 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2202 // these recipes.
2203 for (VPRecipeBase &R : RemovedSucc->phis())
2204 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2205
2206 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2207 // automatically on VPlan destruction if it becomes unreachable.
2208 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2209 VPBB->back().eraseFromParent();
2210 }
2211}
2212
2231
2232// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2233// the loop terminator with a branch-on-cond recipe with the negated
2234// active-lane-mask as operand. Note that this turns the loop into an
2235 // uncountable one. Only the existing terminator is replaced; all other existing
2236// recipes/users remain unchanged, except for poison-generating flags being
2237// dropped from the canonical IV increment. Return the created
2238// VPActiveLaneMaskPHIRecipe.
2239//
2240// The function uses the following definitions:
2241//
2242// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2243// calculate-trip-count-minus-VF (original TC) : original TC
2244// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2245// CanonicalIVPhi : CanonicalIVIncrement
2246// %StartV is the canonical induction start value.
2247//
2248// The function adds the following recipes:
2249//
2250// vector.ph:
2251// %TripCount = calculate-trip-count-minus-VF (original TC)
2252// [if DataWithControlFlowWithoutRuntimeCheck]
2253// %EntryInc = canonical-iv-increment-for-part %StartV
2254// %EntryALM = active-lane-mask %EntryInc, %TripCount
2255//
2256// vector.body:
2257// ...
2258// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2259// ...
2260// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2261// %ALM = active-lane-mask %InLoopInc, TripCount
2262// %Negated = Not %ALM
2263// branch-on-cond %Negated
2264//
2267 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2268 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2269 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2270 VPValue *StartV = CanonicalIVPHI->getStartValue();
2271
2272 auto *CanonicalIVIncrement =
2273 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2274 // TODO: Check if dropping the flags is needed if
2275 // !DataAndControlFlowWithoutRuntimeCheck.
2276 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2277 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2278 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2279 // we have to take unrolling into account. Each part needs to start at
2280 // Part * VF
2281 auto *VecPreheader = Plan.getVectorPreheader();
2282 VPBuilder Builder(VecPreheader);
2283
2284 // Create the ActiveLaneMask instruction using the correct start values.
2285 VPValue *TC = Plan.getTripCount();
2286
2287 VPValue *TripCount, *IncrementValue;
2289 // When the loop is guarded by a runtime overflow check for the loop
2290 // induction variable increment by VF, we can increment the value before
2291 // the get.active.lane.mask intrinsic and use the unmodified trip count.
2292 IncrementValue = CanonicalIVIncrement;
2293 TripCount = TC;
2294 } else {
2295 // When avoiding a runtime check, the active.lane.mask inside the loop
2296 // uses a modified trip count and the induction variable increment is
2297 // done after the active.lane.mask intrinsic is called.
2298 IncrementValue = CanonicalIVPHI;
2299 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2300 {TC}, DL);
2301 }
2302 auto *EntryIncrement = Builder.createOverflowingOp(
2303 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2304 "index.part.next");
2305
2306 // Create the active lane mask instruction in the VPlan preheader.
2307 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2308 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2309 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2310 {EntryIncrement, TC, ALMMultiplier}, DL,
2311 "active.lane.mask.entry");
2312
2313 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2314 // preheader ActiveLaneMask instruction.
2315 auto *LaneMaskPhi =
2317 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2318
2319 // Create the active lane mask for the next iteration of the loop before the
2320 // original terminator.
2321 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2322 Builder.setInsertPoint(OriginalTerminator);
2323 auto *InLoopIncrement =
2324 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2325 {IncrementValue}, {false, false}, DL);
2326 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2327 {InLoopIncrement, TripCount, ALMMultiplier},
2328 DL, "active.lane.mask.next");
2329 LaneMaskPhi->addOperand(ALM);
2330
2331 // Replace the original terminator with BranchOnCond. We have to invert the
2332 // mask here because a true condition means jumping to the exit block.
2333 auto *NotMask = Builder.createNot(ALM, DL);
2334 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2335 OriginalTerminator->eraseFromParent();
2336 return LaneMaskPhi;
2337}
2338
2339/// Collect the header mask with the pattern:
2340/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2341/// TODO: Introduce explicit recipe for header-mask instead of searching
2342/// for the header-mask pattern manually.
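/// Illustrative sketch of the pattern being matched:
///   %wide.iv     = WIDEN-CANONICAL-INDUCTION %canonical.iv
///   %header.mask = icmp ule %wide.iv, %backedge.taken.count
/// The single compare of this form (if any) is returned as the header mask.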
2343static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2344 SmallVector<VPValue *> WideCanonicalIVs;
2345 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2349 "Must have at most one VPWideCanonicalIVRecipe");
2350 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2351 auto *WideCanonicalIV =
2352 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2353 WideCanonicalIVs.push_back(WideCanonicalIV);
2354 }
2355
2356 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2357 // version of the canonical induction.
2358 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2359 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2360 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2361 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2362 WideCanonicalIVs.push_back(WidenOriginalIV);
2363 }
2364
2365 // Walk users of wide canonical IVs and find the single compare of the form
2366 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2367 VPSingleDefRecipe *HeaderMask = nullptr;
2368 for (auto *Wide : WideCanonicalIVs) {
2369 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2370 auto *VPI = dyn_cast<VPInstruction>(U);
2371 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2372 continue;
2373
2374 assert(VPI->getOperand(0) == Wide &&
2375 "WidenCanonicalIV must be the first operand of the compare");
2376 assert(!HeaderMask && "Multiple header masks found?");
2377 HeaderMask = VPI;
2378 }
2379 }
2380 return HeaderMask;
2381}
2382
2384 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2387 UseActiveLaneMaskForControlFlow) &&
2388 "DataAndControlFlowWithoutRuntimeCheck implies "
2389 "UseActiveLaneMaskForControlFlow");
2390
2391 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2393 assert(FoundWidenCanonicalIVUser &&
2394 "Must have widened canonical IV when tail folding!");
2395 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2396 auto *WideCanonicalIV =
2397 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2398 VPSingleDefRecipe *LaneMask;
2399 if (UseActiveLaneMaskForControlFlow) {
2402 } else {
2403 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2404 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2405 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2406 LaneMask =
2407 B.createNaryOp(VPInstruction::ActiveLaneMask,
2408 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2409 nullptr, "active.lane.mask");
2410 }
2411
2412 // Walk users of WideCanonicalIV and replace the header mask of the form
2413 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2414 // removing the old one to ensure there is always only a single header mask.
2415 HeaderMask->replaceAllUsesWith(LaneMask);
2416 HeaderMask->eraseFromParent();
2417}
2418
2419/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2420/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2421/// recipe could be created.
2422/// \p HeaderMask Header Mask.
2423 /// \p CurRecipe Recipe to be transformed.
2424/// \p TypeInfo VPlan-based type analysis.
2425/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2426/// \p EVL The explicit vector length parameter of vector-predication
2427/// intrinsics.
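/// Illustrative sketch, for a load masked only by the header mask:
///   WIDEN %l = load %addr, mask: %header.mask
/// becomes
///   WIDEN %l = vp.load %addr, mask: all-true, evl: %evl
/// i.e. the header-mask component is dropped and the explicit vector length
/// bounds the active lanes instead; any other mask component is preserved.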
2428static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2429 VPRecipeBase &CurRecipe,
2430 VPTypeAnalysis &TypeInfo,
2431 VPValue &AllOneMask, VPValue &EVL) {
2432 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2433 // header mask.
2434 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2435 assert(OrigMask && "Unmasked recipe when folding tail");
2436 // HeaderMask will be handled using EVL.
2437 VPValue *Mask;
2438 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2439 return Mask;
2440 return HeaderMask == OrigMask ? nullptr : OrigMask;
2441 };
2442
2443 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF lanes.
2444 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2445 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2446 if (!EndPtr)
2447 return Addr;
2448 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2449 "VPVectorEndPointerRecipe with non-VF VF operand?");
2450 assert(
2451 all_of(EndPtr->users(),
2452 [](VPUser *U) {
2453 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2454 }) &&
2455 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2456 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2457 EVLAddr->insertBefore(&CurRecipe);
2458 EVLAddr->setOperand(1, &EVL);
2459 return EVLAddr;
2460 };
2461
2464 VPValue *NewMask = GetNewMask(L->getMask());
2465 VPValue *NewAddr = GetNewAddr(L->getAddr());
2466 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2467 })
2468 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2469 VPValue *NewMask = GetNewMask(S->getMask());
2470 VPValue *NewAddr = GetNewAddr(S->getAddr());
2471 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2472 })
2473 .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
2474 VPValue *NewMask = GetNewMask(IR->getMask());
2475 return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
2476 })
2477 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2478 VPValue *NewMask = GetNewMask(Red->getCondOp());
2479 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2480 })
2481 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2482 VPValue *LHS, *RHS;
2483 // Transform select with a header mask condition
2484 // select(header_mask, LHS, RHS)
2485 // into vector predication merge.
2486 // vp.merge(all-true, LHS, RHS, EVL)
2487 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2488 m_VPValue(RHS))))
2489 return nullptr;
2490 // Use all true as the condition because this transformation is
2491 // limited to selects whose condition is a header mask.
2492 return new VPWidenIntrinsicRecipe(
2493 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2494 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2495 })
2496 .Default([&](VPRecipeBase *R) { return nullptr; });
2497}
2498
2499/// Replace recipes with their EVL variants.
2501 VPTypeAnalysis TypeInfo(Plan);
2502 VPValue *AllOneMask = Plan.getTrue();
2503 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2504 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2505
2506 assert(all_of(Plan.getVF().users(),
2509 "User of VF that we can't transform to EVL.");
2510 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2512 });
2513
2514 assert(all_of(Plan.getVFxUF().users(),
2515 [&Plan](VPUser *U) {
2516 return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()),
2517 m_Specific(&Plan.getVFxUF()))) ||
2518 isa<VPWidenPointerInductionRecipe>(U);
2519 }) &&
2520 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2521 "increment of the canonical induction.");
2522 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2523 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2524 // canonical induction must not be updated.
2526 });
2527
2528 // Defer erasing recipes till the end so that we don't invalidate the
2529 // VPTypeAnalysis cache.
2531
2532 // Create a scalar phi to track the previous EVL if the plan contains a
2533 // fixed-order recurrence.
2534 bool ContainsFORs =
2536 if (ContainsFORs) {
2537 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2538 VPValue *MaxEVL = &Plan.getVF();
2539 // Emit VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2540 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2541 MaxEVL = Builder.createScalarZExtOrTrunc(
2542 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2543 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2544
2545 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2546 VPValue *PrevEVL = Builder.createScalarPhi(
2547 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2548
2551 for (VPRecipeBase &R : *VPBB) {
2552 VPValue *V1, *V2;
2553 if (!match(&R,
2555 m_VPValue(V1), m_VPValue(V2))))
2556 continue;
2557 VPValue *Imm = Plan.getOrAddLiveIn(
2560 Intrinsic::experimental_vp_splice,
2561 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2562 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2563 VPSplice->insertBefore(&R);
2564 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2565 ToErase.push_back(&R);
2566 }
2567 }
2568 }
2569
2570 VPValue *HeaderMask = findHeaderMask(Plan);
2571 if (!HeaderMask)
2572 return;
2573
2574 // Replace header masks with a mask equivalent to predicating by EVL:
2575 //
2576 // icmp ule widen-canonical-iv backedge-taken-count
2577 // ->
2578 // icmp ult step-vector, EVL
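  // For example (illustrative), with VF = 8 and EVL = 5 the new mask is
  //   icmp ult <0,1,2,3,4,5,6,7>, splat(5)  -->  <1,1,1,1,1,0,0,0>
  // which enables exactly the first EVL lanes.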
2579 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2580 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2581 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2582 VPValue *EVLMask = Builder.createICmp(
2584 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2585 HeaderMask->replaceAllUsesWith(EVLMask);
2586 ToErase.push_back(HeaderMask->getDefiningRecipe());
2587
2588 // Try to optimize header mask recipes away to their EVL variants.
2589 // TODO: Split optimizeMaskToEVL out and move into
2590 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2591 // tryToBuildVPlanWithVPRecipes beforehand.
2592 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2593 auto *CurRecipe = cast<VPRecipeBase>(U);
2594 VPRecipeBase *EVLRecipe =
2595 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2596 if (!EVLRecipe)
2597 continue;
2598
2599 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2600 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2601 "New recipe must define the same number of values as the "
2602 "original.");
2603 EVLRecipe->insertBefore(CurRecipe);
2605 EVLRecipe)) {
2606 for (unsigned I = 0; I < NumDefVal; ++I) {
2607 VPValue *CurVPV = CurRecipe->getVPValue(I);
2608 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2609 }
2610 }
2611 ToErase.push_back(CurRecipe);
2612 }
2613 // Remove dead EVL mask.
2614 if (EVLMask->getNumUsers() == 0)
2615 ToErase.push_back(EVLMask->getDefiningRecipe());
2616
2617 for (VPRecipeBase *R : reverse(ToErase)) {
2618 SmallVector<VPValue *> PossiblyDead(R->operands());
2619 R->eraseFromParent();
2620 for (VPValue *Op : PossiblyDead)
2622 }
2623}
2624
2625/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2626/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
2627/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation,
2628/// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
2629///
2630/// The function uses the following definitions:
2631/// %StartV is the canonical induction start value.
2632///
2633/// The function adds the following recipes:
2634///
2635/// vector.ph:
2636/// ...
2637///
2638/// vector.body:
2639/// ...
2640/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2641/// [ %NextEVLIV, %vector.body ]
2642/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2643/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2644/// ...
2645/// %OpEVL = cast i32 %VPEVL to IVSize
2646/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2647/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2648/// ...
2649///
2650/// If MaxSafeElements is provided, the function adds the following recipes:
2651/// vector.ph:
2652/// ...
2653///
2654/// vector.body:
2655/// ...
2656/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2657/// [ %NextEVLIV, %vector.body ]
2658/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2659/// %cmp = cmp ult %AVL, MaxSafeElements
2660/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2661/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2662/// ...
2663/// %OpEVL = cast i32 %VPEVL to IVSize
2664/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2665/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2666/// ...
2667///
2668void VPlanTransforms::addExplicitVectorLength(
2669 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2671
2672 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2673 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2674 VPValue *StartV = CanonicalIVPHI->getStartValue();
2675
2676 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2677 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2678 EVLPhi->insertAfter(CanonicalIVPHI);
2679 VPBuilder Builder(Header, Header->getFirstNonPhi());
2680 // Create the AVL (application vector length), starting from TC -> 0 in steps
2681 // of EVL.
2682 VPPhi *AVLPhi = Builder.createScalarPhi(
2683 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2684 VPValue *AVL = AVLPhi;
2685
2686 if (MaxSafeElements) {
2687 // Support for MaxSafeDist for correct loop emission.
2688 VPValue *AVLSafe =
2689 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2690 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2691 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2692 "safe_avl");
2693 }
2694 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2696
2697 auto *CanonicalIVIncrement =
2698 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2699 Builder.setInsertPoint(CanonicalIVIncrement);
2700 VPValue *OpVPEVL = VPEVL;
2701
2702 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2703 OpVPEVL = Builder.createScalarZExtOrTrunc(
2704 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2705
2706 auto *NextEVLIV = Builder.createOverflowingOp(
2707 Instruction::Add, {OpVPEVL, EVLPhi},
2708 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2709 CanonicalIVIncrement->hasNoSignedWrap()},
2710 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2711 EVLPhi->addOperand(NextEVLIV);
2712
2713 VPValue *NextAVL = Builder.createOverflowingOp(
2714 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2715 DebugLoc::getCompilerGenerated(), "avl.next");
2716 AVLPhi->addOperand(NextAVL);
2717
2718 transformRecipestoEVLRecipes(Plan, *VPEVL);
2719
2720 // Replace all uses of VPCanonicalIVPHIRecipe by
2721 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2722 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2723 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2724 // TODO: support unroll factor > 1.
2725 Plan.setUF(1);
2726}
2727
2729 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2730 // There should be only one EVL PHI in the entire plan.
2731 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2732
2735 for (VPRecipeBase &R : VPBB->phis())
2736 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2737 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2738 EVLPhi = PhiR;
2739 }
2740
2741 // Early return if no EVL PHI is found.
2742 if (!EVLPhi)
2743 return;
2744
2745 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2746 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2747 VPValue *AVL;
2748 [[maybe_unused]] bool FoundAVL =
2749 match(EVLIncrement,
2750 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2751 assert(FoundAVL && "Didn't find AVL?");
2752
2753 // The AVL may be capped to a safe distance.
2754 VPValue *SafeAVL;
2755 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2756 AVL = SafeAVL;
2757
2758 VPValue *AVLNext;
2759 [[maybe_unused]] bool FoundAVLNext =
2761 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2762 assert(FoundAVLNext && "Didn't find AVL backedge?");
2763
2764 // Convert EVLPhi to concrete recipe.
2765 auto *ScalarR =
2766 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2767 EVLPhi->getDebugLoc(), "evl.based.iv");
2768 EVLPhi->replaceAllUsesWith(ScalarR);
2769 EVLPhi->eraseFromParent();
2770
2771 // Replace CanonicalIVInc with EVL-PHI increment.
2772 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2773 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2774 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2775 m_Specific(&Plan.getVFxUF()))) &&
2776 "Unexpected canonical iv");
2777 Backedge->replaceAllUsesWith(EVLIncrement);
2778
2779 // Remove unused phi and increment.
2780 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2781 CanonicalIVIncrement->eraseFromParent();
2782 CanonicalIV->eraseFromParent();
2783
2784 // Replace the use of VectorTripCount in the latch-exiting block.
2785 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2786 // After: (branch-on-cond eq AVLNext, 0)
2787
2788 VPBasicBlock *LatchExiting =
2789 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2790 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2791 // Skip single-iteration loop region
2792 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2793 return;
2794 assert(LatchExitingBr &&
2795 match(LatchExitingBr,
2796 m_BranchOnCount(m_VPValue(EVLIncrement),
2797 m_Specific(&Plan.getVectorTripCount()))) &&
2798 "Unexpected terminator in EVL loop");
2799
2800 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2801 VPBuilder Builder(LatchExitingBr);
2802 VPValue *Cmp =
2803 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2805 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2806 LatchExitingBr->eraseFromParent();
2807}
2808
2810 VPlan &Plan, PredicatedScalarEvolution &PSE,
2811 const DenseMap<Value *, const SCEV *> &StridesMap) {
2812 // Replace VPValues for known constant strides guaranteed by predicated
2813 // scalar evolution.
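  // For example (illustrative): if runtime checks guarantee a stride value of
  // 1, uses of that stride inside the loop (including uses through sext/zext)
  // are replaced by the live-in constant 1, enabling consecutive-access
  // optimizations.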
2814 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2815 auto *R = cast<VPRecipeBase>(&U);
2816 return R->getParent()->getParent() ||
2817 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2818 };
2819 for (const SCEV *Stride : StridesMap.values()) {
2820 using namespace SCEVPatternMatch;
2821 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2822 const APInt *StrideConst;
2823 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2824 // Only handle constant strides for now.
2825 continue;
2826
2827 auto *CI =
2828 Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
2829 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2830 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2831
2832 // The versioned value may not be used in the loop directly but through a
2833 // sext/zext. Add new live-ins in those cases.
2834 for (Value *U : StrideV->users()) {
2836 continue;
2837 VPValue *StrideVPV = Plan.getLiveIn(U);
2838 if (!StrideVPV)
2839 continue;
2840 unsigned BW = U->getType()->getScalarSizeInBits();
2841 APInt C =
2842 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2843 VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
2844 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2845 }
2846 }
2847}
2848
2849void VPlanTransforms::dropPoisonGeneratingRecipes(
2850 VPlan &Plan,
2851 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2852 // Collect recipes in the backward slice of `Root` that may generate a poison
2853 // value that is used after vectorization.
2855 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2857 Worklist.push_back(Root);
2858
2859 // Traverse the backward slice of Root through its use-def chain.
2860 while (!Worklist.empty()) {
2861 VPRecipeBase *CurRec = Worklist.pop_back_val();
2862
2863 if (!Visited.insert(CurRec).second)
2864 continue;
2865
2866 // Prune search if we find another recipe generating a widen memory
2867 // instruction. Widen memory instructions involved in address computation
2868 // will lead to gather/scatter instructions, which don't need to be
2869 // handled.
2871 VPHeaderPHIRecipe>(CurRec))
2872 continue;
2873
2874 // This recipe contributes to the address computation of a widen
2875 // load/store. If the underlying instruction has poison-generating flags,
2876 // drop them directly.
2877 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2878 VPValue *A, *B;
2879 // Dropping disjoint from an OR may yield incorrect results, as some
2880 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2881 // for dependence analysis). Instead, replace it with an equivalent Add.
2882 // This is possible as all users of the disjoint OR only access lanes
2883 // where the operands are disjoint or poison otherwise.
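        // For example (illustrative):
        //   %addr.part = or disjoint %base, %offset
        // becomes
        //   %addr.part = add %base, %offset
        // which computes the same lanes without the poison-generating
        // disjoint flag.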
2884 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
2885 RecWithFlags->isDisjoint()) {
2886 VPBuilder Builder(RecWithFlags);
2887 VPInstruction *New = Builder.createOverflowingOp(
2888 Instruction::Add, {A, B}, {false, false},
2889 RecWithFlags->getDebugLoc());
2890 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2891 RecWithFlags->replaceAllUsesWith(New);
2892 RecWithFlags->eraseFromParent();
2893 CurRec = New;
2894 } else
2895 RecWithFlags->dropPoisonGeneratingFlags();
2896 } else {
2899 (void)Instr;
2900 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2901 "found instruction with poison generating flags not covered by "
2902 "VPRecipeWithIRFlags");
2903 }
2904
2905 // Add new definitions to the worklist.
2906 for (VPValue *Operand : CurRec->operands())
2907 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2908 Worklist.push_back(OpDef);
2909 }
2910 });
2911
2912 // Traverse all the recipes in the VPlan and collect the poison-generating
2913 // recipes in the backward slice starting at the address of a VPWidenRecipe or
2914 // VPInterleaveRecipe.
2915 auto Iter = vp_depth_first_deep(Plan.getEntry());
2917 for (VPRecipeBase &Recipe : *VPBB) {
2918 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
2919 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2920 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2921 if (AddrDef && WidenRec->isConsecutive() &&
2922 BlockNeedsPredication(UnderlyingInstr.getParent()))
2923 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2924 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
2925 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2926 if (AddrDef) {
2927 // Check if any member of the interleave group needs predication.
2928 const InterleaveGroup<Instruction> *InterGroup =
2929 InterleaveRec->getInterleaveGroup();
2930 bool NeedPredication = false;
2931 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2932 I < NumMembers; ++I) {
2933 Instruction *Member = InterGroup->getMember(I);
2934 if (Member)
2935 NeedPredication |= BlockNeedsPredication(Member->getParent());
2936 }
2937
2938 if (NeedPredication)
2939 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2940 }
2941 }
2942 }
2943 }
2944}
2945
2947 VPlan &Plan,
2949 &InterleaveGroups,
2950 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2951 if (InterleaveGroups.empty())
2952 return;
2953
2954 // Interleave memory: for each Interleave Group we marked earlier as relevant
2955 // for this VPlan, replace the Recipes widening its memory instructions with a
2956 // single VPInterleaveRecipe at its insertion point.
2957 VPDominatorTree VPDT;
2958 VPDT.recalculate(Plan);
2959 for (const auto *IG : InterleaveGroups) {
2960 auto *Start =
2961 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
2962 VPIRMetadata InterleaveMD(*Start);
2963 SmallVector<VPValue *, 4> StoredValues;
2964 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
2965 StoredValues.push_back(StoreR->getStoredValue());
2966 for (unsigned I = 1; I < IG->getFactor(); ++I) {
2967 Instruction *MemberI = IG->getMember(I);
2968 if (!MemberI)
2969 continue;
2970 VPWidenMemoryRecipe *MemoryR =
2971 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
2972 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
2973 StoredValues.push_back(StoreR->getStoredValue());
2974 InterleaveMD.intersect(*MemoryR);
2975 }
2976
2977 bool NeedsMaskForGaps =
2978 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
2979 (!StoredValues.empty() && !IG->isFull());
2980
2981 Instruction *IRInsertPos = IG->getInsertPos();
2982 auto *InsertPos =
2983 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
2984
2986 if (auto *Gep = dyn_cast<GetElementPtrInst>(
2987 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
2988 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
2989
2990 // Get or create the start address for the interleave group.
2991 VPValue *Addr = Start->getAddr();
2992 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2993 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
2994 // We cannot re-use the address of member zero because it does not
2995 // dominate the insert position. Instead, use the address of the insert
2996 // position and create a PtrAdd adjusting it to the address of member
2997 // zero.
2998 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2999 // InsertPos or sink loads above zero members to join it.
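      // For example (illustrative): for a group of i32 members whose insert
      // position is the member at index 2, the offset is 4 bytes * 2 = 8, so
      // the group address is (insert position address) - 8.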
3000 assert(IG->getIndex(IRInsertPos) != 0 &&
3001 "index of insert position shouldn't be zero");
3002 auto &DL = IRInsertPos->getDataLayout();
3003 APInt Offset(32,
3004 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3005 IG->getIndex(IRInsertPos),
3006 /*IsSigned=*/true);
3007 VPValue *OffsetVPV =
3008 Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
3009 VPBuilder B(InsertPos);
3010 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3011 }
3012 // If the group is reverse, adjust the index to refer to the last vector
3013 // lane instead of the first. We adjust the index from the first vector
3014 // lane, rather than directly getting the pointer for lane VF - 1, because
3015 // the pointer operand of the interleaved access is supposed to be uniform.
3016 if (IG->isReverse()) {
3017 auto *ReversePtr = new VPVectorEndPointerRecipe(
3018 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3019 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3020 ReversePtr->insertBefore(InsertPos);
3021 Addr = ReversePtr;
3022 }
3023 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3024 InsertPos->getMask(), NeedsMaskForGaps,
3025 InterleaveMD, InsertPos->getDebugLoc());
3026 VPIG->insertBefore(InsertPos);
3027
3028 unsigned J = 0;
3029 for (unsigned i = 0; i < IG->getFactor(); ++i)
3030 if (Instruction *Member = IG->getMember(i)) {
3031 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3032 if (!Member->getType()->isVoidTy()) {
3033 VPValue *OriginalV = MemberR->getVPSingleValue();
3034 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3035 J++;
3036 }
3037 MemberR->eraseFromParent();
3038 }
3039 }
3040}
3041
3042/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3043/// value, phi and backedge value. In the following example:
3044///
3045/// vector.ph:
3046/// Successor(s): vector loop
3047///
3048/// <x1> vector loop: {
3049/// vector.body:
3050/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3051/// ...
3052/// EMIT branch-on-count ...
3053/// No successors
3054/// }
3055///
3056/// WIDEN-INDUCTION will get expanded to:
3057///
3058/// vector.ph:
3059/// ...
3060/// vp<%induction.start> = ...
3061/// vp<%induction.increment> = ...
3062///
3063/// Successor(s): vector loop
3064///
3065/// <x1> vector loop: {
3066/// vector.body:
3067/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3068/// ...
3069/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3070/// EMIT branch-on-count ...
3071/// No successors
3072/// }
3073static void
3074expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3075 VPTypeAnalysis &TypeInfo) {
3076 VPlan *Plan = WidenIVR->getParent()->getPlan();
3077 VPValue *Start = WidenIVR->getStartValue();
3078 VPValue *Step = WidenIVR->getStepValue();
3079 VPValue *VF = WidenIVR->getVFValue();
3080 DebugLoc DL = WidenIVR->getDebugLoc();
3081
3082 // The value from the original loop to which we are mapping the new induction
3083 // variable.
3084 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3085
3086 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3089 // FIXME: The newly created binary instructions should contain nsw/nuw
3090 // flags, which can be found from the original scalar operations.
3091 VPIRFlags Flags;
3092 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3093 AddOp = Instruction::Add;
3094 MulOp = Instruction::Mul;
3095 } else {
3096 AddOp = ID.getInductionOpcode();
3097 MulOp = Instruction::FMul;
3098 Flags = ID.getInductionBinOp()->getFastMathFlags();
3099 }
3100
3101 // If the phi is truncated, truncate the start and step values.
3102 VPBuilder Builder(Plan->getVectorPreheader());
3103 Type *StepTy = TypeInfo.inferScalarType(Step);
3104 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3105 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3106 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3107 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3108 StepTy = Ty;
3109 }
3110
3111 // Construct the initial value of the vector IV in the vector loop preheader.
3112 Type *IVIntTy =
3114 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3115 if (StepTy->isFloatingPointTy())
3116 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3117
3118 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3119 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3120
3121 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3122 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3123 DebugLoc::getUnknown(), "induction");
3124
3125 // Create the widened phi of the vector IV.
3126 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
3127 WidenIVR->getDebugLoc(), "vec.ind");
3128 WidePHI->addOperand(Init);
3129 WidePHI->insertBefore(WidenIVR);
3130
3131 // Create the backedge value for the vector IV.
3132 VPValue *Inc;
3133 VPValue *Prev;
3134 // If unrolled, use the increment and prev value from the operands.
3135 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3136 Inc = SplatVF;
3137 Prev = WidenIVR->getLastUnrolledPartOperand();
3138 } else {
3139 if (VPRecipeBase *R = VF->getDefiningRecipe())
3140 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3141 // Multiply the vectorization factor by the step using integer or
3142 // floating-point arithmetic as appropriate.
3143 if (StepTy->isFloatingPointTy())
3144 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3145 DL);
3146 else
3147 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3148 TypeInfo.inferScalarType(VF), DL);
3149
3150 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3151 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3152 Prev = WidePHI;
3153 }
3154
3156 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3157 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3158 WidenIVR->getDebugLoc(), "vec.ind.next");
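 // Continuing the illustration above (Step = 3, VF = 4, no unrolling): Inc is
 // the broadcast of Step * VF = 12, so every lane of the vector IV advances by
 // VF scalar steps per vector iteration.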
3159
3160 WidePHI->addOperand(Next);
3161
3162 WidenIVR->replaceAllUsesWith(WidePHI);
3163}
3164
3165/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3166/// initial value, phi and backedge value. In the following example:
3167///
3168/// <x1> vector loop: {
3169/// vector.body:
3170/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3171/// ...
3172/// EMIT branch-on-count ...
3173/// }
3174///
3175/// WIDEN-POINTER-INDUCTION will get expanded to:
3176///
3177/// <x1> vector loop: {
3178/// vector.body:
3179/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3180/// EMIT %mul = mul %stepvector, %step
3181/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3182/// ...
3183/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3184/// EMIT branch-on-count ...
3185/// }
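///
/// For illustration, assuming %start = %p, a byte step of %step = 4 and
/// VF = 4: %mul is <0, 4, 8, 12>, %vector.gep is <%p, %p+4, %p+8, %p+12>, and
/// the scalar %ptr.ind advances %pointer.phi by 16 bytes (step * VF) each
/// vector iteration.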
3186static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3187                                           VPTypeAnalysis &TypeInfo) {
3188 VPlan *Plan = R->getParent()->getPlan();
3189 VPValue *Start = R->getStartValue();
3190 VPValue *Step = R->getStepValue();
3191 VPValue *VF = R->getVFValue();
3192
3193 assert(R->getInductionDescriptor().getKind() ==
3194             InductionDescriptor::IK_PtrInduction &&
3195         "Not a pointer induction according to InductionDescriptor!");
3196 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3197 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3198 "Recipe should have been replaced");
3199
3200 VPBuilder Builder(R);
3201 DebugLoc DL = R->getDebugLoc();
3202
3203 // Build a scalar pointer phi.
3204 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3205
3206 // Create actual address geps that use the pointer phi as base and a
3207 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3208 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3209 Type *StepTy = TypeInfo.inferScalarType(Step);
3210 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3211 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3212 VPValue *PtrAdd = Builder.createNaryOp(
3213 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3214 R->replaceAllUsesWith(PtrAdd);
3215
3216 // Create the backedge value for the scalar pointer phi.
3217 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3218 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3219 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3220 DL);
3221 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3222
3223 VPValue *InductionGEP =
3224 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3225 ScalarPtrPhi->addOperand(InductionGEP);
3226}
3227
3228void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3229 // Replace loop regions with an explicit CFG.
3230 SmallVector<VPRegionBlock *> LoopRegions;
3231 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3232          vp_depth_first_deep(Plan.getEntry()))) {
3233 if (!R->isReplicator())
3234 LoopRegions.push_back(R);
3235 }
3236 for (VPRegionBlock *R : LoopRegions)
3237 R->dissolveToCFGLoop();
3238}
3239
3240void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3241 VPTypeAnalysis TypeInfo(Plan);
3242 SmallVector<VPRecipeBase *> ToRemove;
3243 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3244          vp_depth_first_deep(Plan.getEntry()))) {
3245 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3246 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3247 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3248 ToRemove.push_back(WidenIVR);
3249 continue;
3250 }
3251
3252 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3253 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3254 ToRemove.push_back(WidenIVR);
3255 continue;
3256 }
3257
3258 // Expand VPBlendRecipe into VPInstruction::Select.
3259 VPBuilder Builder(&R);
3260 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3261 VPValue *Select = Blend->getIncomingValue(0);
3262 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3263 Select = Builder.createSelect(Blend->getMask(I),
3264 Blend->getIncomingValue(I), Select,
3265 R.getDebugLoc(), "predphi");
3266 Blend->replaceAllUsesWith(Select);
3267 ToRemove.push_back(Blend);
3268 }
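      // E.g. a normalized blend with incoming values (v0, v1, v2) and masks
      // (m1, m2) becomes select(m2, v2, select(m1, v1, v0)).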
3269
3270 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3271 Expr->decompose();
3272 ToRemove.push_back(Expr);
3273 }
3274
3275 VPValue *VectorStep;
3276 VPValue *ScalarStep;
3277      if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3278                         m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3279 continue;
3280
3281 // Expand WideIVStep.
3282 auto *VPI = cast<VPInstruction>(&R);
3283 Type *IVTy = TypeInfo.inferScalarType(VPI);
3284 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3285        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3286                                          ? Instruction::UIToFP
3287 : Instruction::Trunc;
3288 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3289 }
3290
3291 [[maybe_unused]] auto *ConstStep =
3292 ScalarStep->isLiveIn()
3293              ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
3294              : nullptr;
3295 assert(!ConstStep || ConstStep->getValue() != 1);
3296 (void)ConstStep;
3297 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3298 ScalarStep =
3299 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3300 }
3301
3302 VPIRFlags Flags;
3303 if (IVTy->isFloatingPointTy())
3304 Flags = {VPI->getFastMathFlags()};
3305
3306 unsigned MulOpc =
3307 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3308 VPInstruction *Mul = Builder.createNaryOp(
3309 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3310 VectorStep = Mul;
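      // E.g. with VectorStep = <0, 1, 2, 3> and ScalarStep = 2, the expanded
      // WideIVStep yields <0, 2, 4, 6> (after any casts to the IV type).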
3311 VPI->replaceAllUsesWith(VectorStep);
3312 ToRemove.push_back(VPI);
3313 }
3314 }
3315
3316 for (VPRecipeBase *R : ToRemove)
3317 R->eraseFromParent();
3318}
3319
3320void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3321                                                 VPBasicBlock *EarlyExitVPBB,
3322 VPlan &Plan,
3323 VPBasicBlock *HeaderVPBB,
3324 VPBasicBlock *LatchVPBB) {
3325 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3326 if (!EarlyExitVPBB->getSinglePredecessor() &&
3327 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3328 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3329 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3330 "unsupported early exit VPBB");
3331 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3332 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3333 // of the phis.
3334 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3335 cast<VPIRPhi>(&R)->swapOperands();
3336 }
3337
3338 VPBuilder Builder(LatchVPBB->getTerminator());
3339 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3340 assert(
3341 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
3342 "Terminator must be be BranchOnCond");
3343 VPValue *CondOfEarlyExitingVPBB =
3344 EarlyExitingVPBB->getTerminator()->getOperand(0);
3345 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3346 ? CondOfEarlyExitingVPBB
3347 : Builder.createNot(CondOfEarlyExitingVPBB);
3348
3349 // Split the middle block and have it conditionally branch to the early exit
3350 // block if CondToEarlyExit.
3351 VPValue *IsEarlyExitTaken =
3352 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3353 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3354 VPBasicBlock *VectorEarlyExitVPBB =
3355 Plan.createVPBasicBlock("vector.early.exit");
3356 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3357 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3358 NewMiddle->swapSuccessors();
3359
3360 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3361
3362 // Update the exit phis in the early exit block.
3363 VPBuilder MiddleBuilder(NewMiddle);
3364 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3365 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3366 auto *ExitIRI = cast<VPIRPhi>(&R);
3367 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3368 // a single predecessor and 1 if it has two.
3369 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3370 if (ExitIRI->getNumOperands() != 1) {
3371 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3372 // predecessor. Extract its last lane.
3373 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3374 }
3375
3376 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3377 if (!IncomingFromEarlyExit->isLiveIn()) {
3378 // Update the incoming value from the early exit.
3379 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3380 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3381 "first.active.lane");
3382 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3383 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3384 nullptr, "early.exit.value");
3385 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3386 }
3387 }
3388 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
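  // At this point the latch exits to "middle.split", which branches to
  // "vector.early.exit" (and on to the early exit block) when any lane took
  // the early exit, and falls through to the original middle block otherwise.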
3389
3390 // Replace the condition controlling the non-early exit from the vector loop
3391 // with one exiting if either the original condition of the vector latch is
3392 // true or the early exit has been taken.
3393 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3394 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3395 "Unexpected terminator");
3396 auto *IsLatchExitTaken =
3397 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3398 LatchExitingBranch->getOperand(1));
3399 auto *AnyExitTaken = Builder.createNaryOp(
3400 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3401 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3402 LatchExitingBranch->eraseFromParent();
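  // I.e. the latch's BranchOnCount is replaced by
  // BranchOnCond((vector IV == trip count) | any-of(early-exit condition)),
  // so the vector loop is left as soon as either exit would be taken.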
3403}
3404
3405/// This function tries to convert extended in-loop reductions to
3406/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3407/// valid. The created recipe must be decomposed to its constituent
3408/// recipes before execution.
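/// For example, reduce.add(zext(%a) to i32) over i8 elements can be kept as a
/// single abstract extended-reduction expression when the target reports its
/// cost (via TTI::getExtendedReductionCost) as lower than the separate extend
/// plus reduction.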
3409static VPExpressionRecipe *
3410tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3411                                     VFRange &Range) {
3412 Type *RedTy = Ctx.Types.inferScalarType(Red);
3413 VPValue *VecOp = Red->getVecOp();
3414
3415 // Clamp the range if using extended-reduction is profitable.
3416 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
3417                                              Type *SrcTy) -> bool {
3418    return LoopVectorizationPlanner::getDecisionAndClampRange(
3419        [&](ElementCount VF) {
3420          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3421          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3422          InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3423 Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
3424 CostKind);
3425 InstructionCost ExtCost =
3426 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3427 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3428 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3429 },
3430 Range);
3431 };
3432
3433 VPValue *A;
3434 // Match reduce(ext)).
3435 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3436 IsExtendedRedValidAndClampRange(
3437 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3438 cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
3439 Instruction::CastOps::ZExt,
3440 Ctx.Types.inferScalarType(A)))
3441 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3442
3443 return nullptr;
3444}
3445
3446/// This function tries to convert extended in-loop reductions to
3447/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3448/// and valid. The created VPExpressionRecipe must be decomposed to its
3449/// constituent recipes before execution. Patterns of the
3450/// VPExpressionRecipe:
3451/// reduce.add(mul(...)),
3452/// reduce.add(mul(ext(A), ext(B))),
3453/// reduce.add(ext(mul(ext(A), ext(B)))).
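/// For example, reduce.add(sext(mul(sext(i8 A), sext(i8 B)) to i32)) can be
/// kept as one abstract multiply-accumulate reduction (costed via
/// TTI::getMulAccReductionCost) when that is cheaper than the separate
/// extends, multiply and reduction.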
3454static VPExpressionRecipe *
3455tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3456                                          VPCostContext &Ctx, VFRange &Range) {
3457 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3458 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3459 return nullptr;
3460
3461 Type *RedTy = Ctx.Types.inferScalarType(Red);
3462
3463 // Clamp the range if using multiply-accumulate-reduction is profitable.
3464 auto IsMulAccValidAndClampRange =
3465 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
3466 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
3467    return LoopVectorizationPlanner::getDecisionAndClampRange(
3468        [&](ElementCount VF) {
3469          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3470          Type *SrcTy =
3471 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3472 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3473 InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
3474 isZExt, Opcode, RedTy, SrcVecTy, CostKind);
3475 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3476 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3477 InstructionCost ExtCost = 0;
3478 if (Ext0)
3479 ExtCost += Ext0->computeCost(VF, Ctx);
3480 if (Ext1)
3481 ExtCost += Ext1->computeCost(VF, Ctx);
3482 if (OuterExt)
3483 ExtCost += OuterExt->computeCost(VF, Ctx);
3484
3485 return MulAccCost.isValid() &&
3486 MulAccCost < ExtCost + MulCost + RedCost;
3487 },
3488 Range);
3489 };
3490
3491 VPValue *VecOp = Red->getVecOp();
3492 VPValue *A, *B;
3493 // Try to match reduce.add(mul(...)).
3494 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3495 auto *RecipeA =
3496 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3497 auto *RecipeB =
3498 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3499 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3500
3501 // Match reduce.add(mul(ext, ext)).
3502 if (RecipeA && RecipeB &&
3503 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
3504 match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3505 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3506 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
3507 Instruction::CastOps::ZExt,
3508 Mul, RecipeA, RecipeB, nullptr)) {
3509 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3510 }
3511 // Match reduce.add(mul).
3512 if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
3513 return new VPExpressionRecipe(Mul, Red);
3514 }
3515 // Match reduce.add(ext(mul(ext(A), ext(B)))).
3516  // All extend recipes must have the same opcode, or A == B,
3517  // which can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
3518  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
3519                                      m_ZExtOrSExt(m_VPValue()))))) {
3520 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3521 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3522 auto *Ext0 =
3523 cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
3524 auto *Ext1 =
3525 cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
3526 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3527 Ext0->getOpcode() == Ext1->getOpcode() &&
3528 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
3529 Instruction::CastOps::ZExt,
3530 Mul, Ext0, Ext1, Ext)) {
3531 auto *NewExt0 = new VPWidenCastRecipe(
3532 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3533 Ext0->getDebugLoc());
3534 NewExt0->insertBefore(Ext0);
3535
3536 VPWidenCastRecipe *NewExt1 = NewExt0;
3537 if (Ext0 != Ext1) {
3538 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3539 Ext->getResultType(), *Ext1,
3540 Ext1->getDebugLoc());
3541 NewExt1->insertBefore(Ext1);
3542 }
3543 Mul->setOperand(0, NewExt0);
3544 Mul->setOperand(1, NewExt1);
3545 Red->setOperand(1, Mul);
3546 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3547 }
3548 }
3549 return nullptr;
3550}
3551
3552/// This function tries to create abstract recipes from the reduction recipe
3553/// for subsequent optimizations and cost estimation.
3554static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3555                                               VPCostContext &Ctx,
3556 VFRange &Range) {
3557 VPExpressionRecipe *AbstractR = nullptr;
3558 auto IP = std::next(Red->getIterator());
3559 auto *VPBB = Red->getParent();
3560 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3561 AbstractR = MulAcc;
3562 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3563 AbstractR = ExtRed;
3564 // Cannot create abstract inloop reduction recipes.
3565 if (!AbstractR)
3566 return;
3567
3568 AbstractR->insertBefore(*VPBB, IP);
3569 Red->replaceAllUsesWith(AbstractR);
3570}
3571
3582
3583void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3584 if (Plan.hasScalarVFOnly())
3585 return;
3586
3587#ifndef NDEBUG
3588 VPDominatorTree VPDT;
3589 VPDT.recalculate(Plan);
3590#endif
3591
3592 SmallVector<VPValue *> VPValues;
3595 append_range(VPValues, Plan.getLiveIns());
3596 for (VPRecipeBase &R : *Plan.getEntry())
3597 append_range(VPValues, R.definedValues());
3598
3599 auto *VectorPreheader = Plan.getVectorPreheader();
3600 for (VPValue *VPV : VPValues) {
3602 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3603 isa<Constant>(VPV->getLiveInIRValue())))
3604 continue;
3605
3606 // Add explicit broadcast at the insert point that dominates all users.
3607 VPBasicBlock *HoistBlock = VectorPreheader;
3608 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3609 for (VPUser *User : VPV->users()) {
3610 if (User->usesScalars(VPV))
3611 continue;
3612 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3613 HoistPoint = HoistBlock->begin();
3614 else
3615 assert(VPDT.dominates(VectorPreheader,
3616 cast<VPRecipeBase>(User)->getParent()) &&
3617 "All users must be in the vector preheader or dominated by it");
3618 }
3619
3620 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3621 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3622 VPV->replaceUsesWithIf(Broadcast,
3623 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3624 return Broadcast != &U && !U.usesScalars(VPV);
3625 });
3626 }
3627}
3628
3630 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3632 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3633 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3634
3635 VPValue *TC = Plan.getTripCount();
3636 // Skip cases for which the trip count may be non-trivial to materialize.
3637 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3638 // tail is required.
3639 if (!Plan.hasScalarTail() ||
3640      Plan.getMiddleBlock()->getSingleSuccessor() ==
3641          Plan.getScalarPreheader() ||
3642 !TC->isLiveIn())
3643 return;
3644
3645 // Materialize vector trip counts for constants early if it can simply
3646 // be computed as (Original TC / (VF * UF)) * (VF * UF).
3647 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3648 // tail-folded loops.
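  // E.g. a constant trip count of 100 with VF * UF = 8 folds to a vector trip
  // count of (100 / 8) * 8 = 96 below.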
3649 ScalarEvolution &SE = *PSE.getSE();
3650 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3651 if (!isa<SCEVConstant>(TCScev))
3652 return;
3653 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3654 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3655 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3656 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3657}
3658
3659void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
3660                                                    VPBasicBlock *VectorPH) {
3661 VPValue *BTC = Plan.getBackedgeTakenCount();
3662 if (BTC->getNumUsers() == 0)
3663 return;
3664
3665 VPBuilder Builder(VectorPH, VectorPH->begin());
3666 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3667 auto *TCMO = Builder.createNaryOp(
3668 Instruction::Sub,
3669 {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
3670 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3671 BTC->replaceAllUsesWith(TCMO);
3672}
3673
3675 if (Plan.hasScalarVFOnly())
3676 return;
3677
3678 VPTypeAnalysis TypeInfo(Plan);
3679 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3680 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3681     vp_depth_first_shallow(Plan.getEntry()));
3682 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3683 vp_depth_first_shallow(LoopRegion->getEntry()));
3684 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
3685 // excluding ones in replicate regions. Those are not materialized explicitly
3686 // yet. Those vector users are still handled in VPReplicateRecipe::execute(),
3687 // via shouldPack().
3688 // TODO: materialize build vectors for replicating recipes in replicating
3689 // regions.
3690 // TODO: materialize build vectors for VPInstructions.
3691 for (VPBasicBlock *VPBB :
3692 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3693 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3694 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
3695 auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
3696        VPRegionBlock *ParentRegion =
3697            cast<VPRecipeBase>(U)->getParent()->getParent();
3698        return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
3699 };
3700 if (!RepR || RepR->isSingleScalar() ||
3701 none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
3702 continue;
3703
3704 Type *ScalarTy = TypeInfo.inferScalarType(RepR);
3705      unsigned Opcode = ScalarTy->isStructTy()
3706                            ? VPInstruction::BuildStructVector
3707                            : VPInstruction::BuildVector;
3708      auto *BuildVector = new VPInstruction(Opcode, {RepR});
3709 BuildVector->insertAfter(RepR);
3710
3711 RepR->replaceUsesWithIf(
3712 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3713 VPUser &U, unsigned) {
3714 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3715 });
3716 }
3717 }
3718}
3719
3720void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
3721                                                 VPBasicBlock *VectorPHVPBB,
3722 bool TailByMasking,
3723 bool RequiresScalarEpilogue) {
3724 VPValue &VectorTC = Plan.getVectorTripCount();
3725 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3726 // There's nothing to do if there are no users of the vector trip count or its
3727 // IR value has already been set.
3728 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3729 return;
3730
3731 VPValue *TC = Plan.getTripCount();
3732 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3733 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3734 VPValue *Step = &Plan.getVFxUF();
3735
3736 // If the tail is to be folded by masking, round the number of iterations N
3737 // up to a multiple of Step instead of rounding down. This is done by first
3738 // adding Step-1 and then rounding down. Note that it's ok if this addition
3739 // overflows: the vector induction variable will eventually wrap to zero given
3740 // that it starts at zero and its Step is a power of two; the loop will then
3741 // exit, with the last early-exit vector comparison also producing all-true.
3742 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3743 // is accounted for in emitIterationCountCheck that adds an overflow check.
3744 if (TailByMasking) {
3745 TC = Builder.createNaryOp(
3746 Instruction::Add,
3747 {TC, Builder.createNaryOp(
3748 Instruction::Sub,
3749 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
3750 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3751 }
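  // For example, with an original trip count of 10 and Step = 4, tail folding
  // rounds up to 13 here; the remainder logic below then yields n.vec = 12,
  // i.e. three masked vector iterations.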
3752
3753 // Now we need to generate the expression for the part of the loop that the
3754 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3755 // iterations are not required for correctness, or N - Step, otherwise. Step
3756 // is equal to the vectorization factor (number of SIMD elements) times the
3757 // unroll factor (number of SIMD instructions).
3758 VPValue *R =
3759 Builder.createNaryOp(Instruction::URem, {TC, Step},
3760 DebugLoc::getCompilerGenerated(), "n.mod.vf");
3761
3762 // There are cases where we *must* run at least one iteration in the remainder
3763 // loop. See the cost model for when this can happen. If the step evenly
3764 // divides the trip count, we set the remainder to be equal to the step. If
3765 // the step does not evenly divide the trip count, no adjustment is necessary
3766 // since there will already be scalar iterations. Note that the minimum
3767 // iterations check ensures that N >= Step.
3768 if (RequiresScalarEpilogue) {
3769 assert(!TailByMasking &&
3770 "requiring scalar epilogue is not supported with fail folding");
3771 VPValue *IsZero = Builder.createICmp(
3772 CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
3773 R = Builder.createSelect(IsZero, Step, R);
3774 }
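  // For example, without tail folding a trip count of 10 and Step = 4 give
  // n.mod.vf = 2 and n.vec = 8. If a scalar epilogue is required and Step
  // evenly divides the trip count (say 12), the select above forces the
  // remainder to Step, so n.vec = 8 and four iterations remain scalar.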
3775
3776 VPValue *Res = Builder.createNaryOp(
3777 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
3778 VectorTC.replaceAllUsesWith(Res);
3779}
3780
3781void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
3782                                            ElementCount VFEC) {
3783 VPBuilder Builder(VectorPH, VectorPH->begin());
3784 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3785 VPValue &VF = Plan.getVF();
3786 VPValue &VFxUF = Plan.getVFxUF();
3787 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
3788 // used.
3789 // TODO: Assert that they aren't used.
3790
3791 // If there are no users of the runtime VF, compute VFxUF by constant folding
3792 // the multiplication of VF and UF.
3793 if (VF.getNumUsers() == 0) {
3794 VPValue *RuntimeVFxUF =
3795 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
3796 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
3797 return;
3798 }
3799
3800 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
3801 // vscale) * UF.
3802 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
3803 if (any_of(VF.users(), [&VF](VPUser *U) { return !U->usesScalars(&VF); })) {
3804   VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
3805   VF.replaceUsesWithIf(
3806       BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
3807 }
3808 VF.replaceAllUsesWith(RuntimeVF);
3809
3810 VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
3811 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
3812 VFxUF.replaceAllUsesWith(MulByUF);
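  // E.g. for VF = <vscale x 4> and UF = 2 this emits RuntimeVF = 4 * vscale
  // and VFxUF = (4 * vscale) * 2, broadcasting RuntimeVF only for users that
  // need a vector value.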
3813}
3814
3817 const DataLayout &DL = SE.getDataLayout();
3818 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
3819
3820 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
3821 BasicBlock *EntryBB = Entry->getIRBasicBlock();
3822 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
3823 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
3824   if (isa<VPIRInstruction>(&R))
3825     continue;
3826 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3827 if (!ExpSCEV)
3828 break;
3829 const SCEV *Expr = ExpSCEV->getSCEV();
3830 Value *Res =
3831 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
3832 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
3833 VPValue *Exp = Plan.getOrAddLiveIn(Res);
3834 ExpSCEV->replaceAllUsesWith(Exp);
3835 if (Plan.getTripCount() == ExpSCEV)
3836 Plan.resetTripCount(Exp);
3837 ExpSCEV->eraseFromParent();
3838 }
3840 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
3841 "after any VPIRInstructions");
3842 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
3843 // to the VPIRBasicBlock.
3844 auto EI = Entry->begin();
3845 for (Instruction &I : drop_end(*EntryBB)) {
3846 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
3847 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
3848 EI++;
3849 continue;
3850 }
3852 }
3853
3854 return ExpandedSCEVs;
3855}
3856
3857/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
3858/// can be converted to a narrower recipe. \p OpV is used by a wide recipe that
3859/// feeds a store interleave group at index \p Idx, and \p WideMember0 is the
3860/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
3861/// be narrowed to an index-independent load if it feeds all wide ops at all
3862/// indices (\p OpV must be the operand at index \p OpIdx of \p WideMember0, the
3863/// recipe at index 0). A VPInterleaveRecipe can be narrowed to a wide load if
3864/// \p OpV is defined at \p Idx of a load interleave group.
3865static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
3866 VPValue *OpV, unsigned Idx) {
3867 auto *DefR = OpV->getDefiningRecipe();
3868 if (!DefR)
3869 return WideMember0->getOperand(OpIdx) == OpV;
3870 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
3871 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
3872
3873 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
3874 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
3875 return false;
3876}
3877
3878/// Returns true if \p InterleaveR is a full interleave group with factor and number of
3879/// members both equal to \p VF. The interleave group must also access the full
3880/// vector width \p VectorRegWidth.
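/// For example, with VF = 4 and i32 members the group must have factor 4,
/// exactly 4 members, and each member's vector (4 x 32 = 128 bits) must match
/// \p VectorRegWidth.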
3881static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3882                                         unsigned VF, VPTypeAnalysis &TypeInfo,
3883 unsigned VectorRegWidth) {
3884 if (!InterleaveR)
3885 return false;
3886
3887 Type *GroupElementTy = nullptr;
3888 if (InterleaveR->getStoredValues().empty()) {
3889 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
3890 if (!all_of(InterleaveR->definedValues(),
3891 [&TypeInfo, GroupElementTy](VPValue *Op) {
3892 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3893 }))
3894 return false;
3895 } else {
3896 GroupElementTy =
3897 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
3898 if (!all_of(InterleaveR->getStoredValues(),
3899 [&TypeInfo, GroupElementTy](VPValue *Op) {
3900 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3901 }))
3902 return false;
3903 }
3904
3905 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3906 auto IG = InterleaveR->getInterleaveGroup();
3907 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3908 GroupSize == VectorRegWidth;
3909}
3910
3911/// Returns true if \p VPV is a narrow VPValue.
3912static bool isAlreadyNarrow(VPValue *VPV) {
3913 if (VPV->isLiveIn())
3914 return true;
3915 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
3916 return RepR && RepR->isSingleScalar();
3917}
3918
3919void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3920                                             unsigned VectorRegWidth) {
3921 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3922 if (!VectorLoop)
3923 return;
3924
3925 VPTypeAnalysis TypeInfo(Plan);
3926
3927 unsigned VFMinVal = VF.getKnownMinValue();
3928 SmallVector<VPInterleaveRecipe *> StoreGroups;
3929 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3932 continue;
3933
3936 continue;
3937
3938 // Bail out on recipes not supported at the moment:
3939 // * phi recipes other than the canonical induction
3940 // * recipes writing to memory except interleave groups
3941 // Only support plans with a canonical induction phi.
3942 if (R.isPhi())
3943 return;
3944
3945 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
3946 if (R.mayWriteToMemory() && !InterleaveR)
3947 return;
3948
3949 // Do not narrow interleave groups if there are VectorPointer recipes and
3950 // the plan was unrolled. The recipe implicitly uses VF from
3951 // VPTransformState.
3952 // TODO: Remove restriction once the VF for the VectorPointer offset is
3953 // modeled explicitly as operand.
3954 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
3955 return;
3956
3957 // All other ops are allowed, but we reject uses that cannot be converted
3958 // when checking all allowed consumers (store interleave groups) below.
3959 if (!InterleaveR)
3960 continue;
3961
3962 // Bail out on non-consecutive interleave groups.
3963 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
3964 VectorRegWidth))
3965 return;
3966
3967 // Skip read interleave groups.
3968 if (InterleaveR->getStoredValues().empty())
3969 continue;
3970
3971 // Narrow interleave groups, if all operands are already matching narrow
3972 // ops.
3973 auto *Member0 = InterleaveR->getStoredValues()[0];
3974 if (isAlreadyNarrow(Member0) &&
3975 all_of(InterleaveR->getStoredValues(),
3976 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
3977 StoreGroups.push_back(InterleaveR);
3978 continue;
3979 }
3980
3981 // For now, we only support full interleave groups storing load interleave
3982 // groups.
3983 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
3984 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
3985 if (!DefR)
3986 return false;
3987 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
3988 return IR && IR->getInterleaveGroup()->isFull() &&
3989 IR->getVPValue(Op.index()) == Op.value();
3990 })) {
3991 StoreGroups.push_back(InterleaveR);
3992 continue;
3993 }
3994
3995    // Check if all values feeding InterleaveR are matching wide recipes, whose
3996    // operands can be narrowed.
3997 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
3998 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
3999 if (!WideMember0)
4000 return;
4001 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4002 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4003 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4004 R->getNumOperands() > 2)
4005 return;
4006 if (any_of(enumerate(R->operands()),
4007 [WideMember0, Idx = I](const auto &P) {
4008 const auto &[OpIdx, OpV] = P;
4009 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4010 }))
4011 return;
4012 }
4013 StoreGroups.push_back(InterleaveR);
4014 }
4015
4016 if (StoreGroups.empty())
4017 return;
4018
4019 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4020 SmallPtrSet<VPValue *, 4> NarrowedOps;
4021 auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
4022 auto *R = V->getDefiningRecipe();
4023 if (!R || NarrowedOps.contains(V))
4024 return V;
4025 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4026 // Narrow interleave group to wide load, as transformed VPlan will only
4027 // process one original iteration.
4028 auto *L = new VPWidenLoadRecipe(
4029 *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
4030 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4031 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4032 L->insertBefore(LoadGroup);
4033 NarrowedOps.insert(L);
4034 return L;
4035 }
4036
4037 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4038 assert(RepR->isSingleScalar() &&
4039 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4040 "must be a single scalar load");
4041 NarrowedOps.insert(RepR);
4042 return RepR;
4043 }
4044 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4045
4046 VPValue *PtrOp = WideLoad->getAddr();
4047 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4048 PtrOp = VecPtr->getOperand(0);
4049 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4050 // process one original iteration.
4051 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4052 /*IsUniform*/ true,
4053 /*Mask*/ nullptr, *WideLoad);
4054 N->insertBefore(WideLoad);
4055 NarrowedOps.insert(N);
4056 return N;
4057 };
4058
4059 // Narrow operation tree rooted at store groups.
4060 for (auto *StoreGroup : StoreGroups) {
4061 VPValue *Res = nullptr;
4062 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4063 if (isAlreadyNarrow(Member0)) {
4064 Res = Member0;
4065 } else if (auto *WideMember0 =
4066                   dyn_cast_or_null<VPWidenRecipe>(Member0->getDefiningRecipe())) {
4067      for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4068 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4069 Res = WideMember0;
4070 } else {
4071 Res = NarrowOp(Member0);
4072 }
4073
4074 auto *S = new VPWidenStoreRecipe(
4075 *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
4076 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4077 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4078 S->insertBefore(StoreGroup);
4079 StoreGroup->eraseFromParent();
4080 }
4081
4082 // Adjust induction to reflect that the transformed plan only processes one
4083 // original iteration.
4084 auto *CanIV = Plan.getCanonicalIV();
4085 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4086 VPBuilder PHBuilder(Plan.getVectorPreheader());
4087
4088 VPValue *UF = Plan.getOrAddLiveIn(
4089 ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
4090 if (VF.isScalable()) {
4091 VPValue *VScale = PHBuilder.createElementCount(
4092 CanIV->getScalarType(), ElementCount::getScalable(1));
4093 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4094 Inc->setOperand(1, VScaleUF);
4095 Plan.getVF().replaceAllUsesWith(VScale);
4096 } else {
4097 Inc->setOperand(1, UF);
4098    Plan.getVF().replaceAllUsesWith(
4099        Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
4100 }
4101 removeDeadRecipes(Plan);
4102}
4103
4104/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4105/// BranchOnCond recipe.
4106void VPlanTransforms::addBranchWeightToMiddleTerminator(
4107    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4108 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4109 auto *MiddleTerm =
4111 // Only add branch metadata if there is a (conditional) terminator.
4112 if (!MiddleTerm)
4113 return;
4114
4115 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4116 "must have a BranchOnCond");
4117 // Assume that `TripCount % VectorStep ` is equally distributed.
4118 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4119 if (VF.isScalable() && VScaleForTuning.has_value())
4120 VectorStep *= *VScaleForTuning;
4121 assert(VectorStep > 0 && "trip count should not be zero");
4122 MDBuilder MDB(Plan.getContext());
4123 MDNode *BranchWeights =
4124 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
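  // E.g. for VF = 4, UF = 2 (and no vscale tuning) VectorStep is 8, so the
  // weights are {1, 7}, modeling that TripCount % VectorStep hits any
  // particular remainder about 1 time in 8.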
4125 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4126}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:381
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static Value * tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:791
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:187
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
Value * FoldGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, GEPNoWrapFlags NW) const override
Value * FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS) const override
Value * FoldSelect(Value *C, Value *True, Value *False) const override
Value * FoldCast(Instruction::CastOps Op, Value *V, Type *DestTy) const override
Value * FoldCmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1580
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1077
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:99
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:104
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:168
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3467
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3754
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3829
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3781
iterator end()
Definition VPlan.h:3791
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3789
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3842
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:246
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:635
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:607
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:680
const VPRecipeBase & back() const
Definition VPlan.h:3803
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2393
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2427
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2417
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2433
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2413
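As a hedged sketch of the VPBlendRecipe interface listed above (getNumIncomingValues, getMask, isNormalized), a pass could scan a normalized blend's masks as follows; the live-in check is an illustrative predicate only:
#include "VPlan.h"
#include <cassert>
using namespace llvm;

// Sketch: in a normalized blend the first incoming value has no mask, so
// masks are queried starting from incoming value 1.
static bool allMasksAreLiveIns(const VPBlendRecipe *Blend) {
  assert(Blend->isNormalized() && "sketch assumes a normalized blend");
  for (unsigned I = 1, E = Blend->getNumIncomingValues(); I != E; ++I)
    if (!Blend->getMask(I)->isLiveIn())
      return false;
  return true;
}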
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:217
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:238
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:157
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:176
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:195
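The VPBlockUtils helpers above are the CFG-editing primitives used by the transforms in this file. A minimal sketch of splicing a freshly created block onto an existing edge, assuming the in-tree VPlan headers; the block name is illustrative:
#include "VPlan.h"
#include "VPlanUtils.h"
using namespace llvm;

// Sketch: create a block via the plan and rewire it onto the edge From -> To.
static VPBasicBlock *spliceBlockOnEdge(VPlan &Plan, VPBlockBase *From,
                                       VPBlockBase *To) {
  VPBasicBlock *NewBB = Plan.createVPBasicBlock("sketch.bb");
  // insertOnEdge updates predecessor/successor lists on both ends of the edge.
  VPBlockUtils::insertOnEdge(From, To, NewBB);
  return NewBB;
}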
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2924
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
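The VPBuilder members above mirror IRBuilder. A hedged sketch of appending a new VPInstruction to a block, assuming the in-tree header declaring VPBuilder is on the include path; the add opcode and operands are placeholders:
#include "VPlan.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sketch: set the insertion point to the end of VPBB and create A + B there.
static VPValue *emitAdd(VPBasicBlock *VPBB, VPValue *A, VPValue *B) {
  VPBuilder Builder;
  Builder.setInsertPoint(VPBB);
  return Builder.createNaryOp(Instruction::Add, {A, B});
}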
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3410
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:3437
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:422
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:417
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:395
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:407
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3498
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:2969
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1967
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2015
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2004
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3907
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:3931
Class to record and manage LLVM IR flags.
Definition VPlan.h:600
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:979
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1056
@ FirstOrderRecurrenceSplice
Definition VPlan.h:985
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1009
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1006
@ CanonicalIVIncrementForPart
Definition VPlan.h:999
@ CalculateTripCountMinusVF
Definition VPlan.h:997
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2534
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2555
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2607
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2566
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3081
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:394
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition VPlan.h:477
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPBasicBlock * getParent()
Definition VPlan.h:415
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:482
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
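The VPRecipeBase manipulators above (insertBefore, insertAfter, moveBefore, eraseFromParent) are the basic recipe-editing operations. A minimal sketch of hoisting a recipe to the start of a block's non-phi section; legality is assumed to have been checked by the caller:
#include "VPlan.h"
using namespace llvm;

// Sketch: unlink R from its current position and reinsert it right after the
// phi-like recipes of VPBB. Operand availability at the new position is an
// assumption, not something this snippet verifies.
static void hoistToBlockStart(VPRecipeBase *R, VPBasicBlock *VPBB) {
  R->moveBefore(*VPBB, VPBB->getFirstNonPhi());
}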
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2802
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2656
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:3942
const VPBlockBase * getEntry() const
Definition VPlan.h:3978
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:3995
const VPBlockBase * getExiting() const
Definition VPlan.h:3990
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4003
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2846
bool isSingleScalar() const
Definition VPlan.h:2891
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2915
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3644
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:521
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:586
virtual VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
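inferScalarType above is the central type query for VPValues. A hedged sketch that takes an already-constructed VPTypeAnalysis, since its construction details are not shown here:
#include "VPlan.h"
#include "VPlanAnalysis.h"
using namespace llvm;

// Sketch: report whether V's scalar type is a 1-bit integer, i.e. a mask lane.
static bool hasMaskType(VPTypeAnalysis &TypeInfo, const VPValue *V) {
  Type *Ty = TypeInfo.inferScalarType(V);
  return Ty->isIntegerTy() && Ty->getScalarSizeInBits() == 1;
}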
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:197
operand_range operands()
Definition VPlanValue.h:265
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:241
operand_iterator op_end()
Definition VPlanValue.h:263
operand_iterator op_begin()
Definition VPlanValue.h:261
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:236
void addOperand(VPValue *Operand)
Definition VPlanValue.h:230
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:174
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:184
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1416
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:169
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1420
user_range users()
Definition VPlanValue.h:134
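replaceAllUsesWith and replaceUsesWithIf above parallel the IR-level Value API. A hedged sketch of the conditional form; the operand-index predicate is purely illustrative:
#include "VPlan.h"
using namespace llvm;

// Sketch: redirect uses of Old to New, but only where Old appears as the
// first operand of a user. A real transform would use a legality-driven
// predicate instead of this illustrative one.
static void replaceFirstOperandUses(VPValue *Old, VPValue *New) {
  Old->replaceUsesWithIf(New, [](VPUser &, unsigned OpIdx) {
    return OpIdx == 0;
  });
}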
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1833
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1877
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3539
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1483
A recipe for handling GEP instructions.
Definition VPlan.h:1769
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2032
PHINode * getPHINode() const
Definition VPlan.h:2074
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2060
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2077
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2107
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2188
A recipe for widening vector intrinsics.
Definition VPlan.h:1540
A common base class for widening memory operations.
Definition VPlan.h:3123
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3185
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3178
A recipe for widened phis.
Definition VPlan.h:2243
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1440
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4045
bool hasVF(ElementCount VF) const
Definition VPlan.h:4254
LLVMContext & getContext() const
Definition VPlan.h:4242
VPBasicBlock * getEntry()
Definition VPlan.h:4144
VPRegionBlock * createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="", bool IsReplicator=false)
Create a new VPRegionBlock with Entry, Exiting and Name.
Definition VPlan.h:4385
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4234
bool hasScalableVF() const
Definition VPlan.h:4255
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4240
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4237
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4206
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4311
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4227
unsigned getUF() const
Definition VPlan.h:4274
bool hasUF(unsigned UF) const
Definition VPlan.h:4272
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4196
void setVF(ElementCount VF)
Definition VPlan.h:4248
bool isUnrolled() const
Returns true if the VPlan has already been unrolled, i.e.
Definition VPlan.h:4287
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1050
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4220
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4169
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4375
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4296
bool hasScalarVFOnly() const
Definition VPlan.h:4265
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4187
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4326
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition VPlan.h:4350
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4192
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4323
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4149
void setUF(unsigned UF)
Definition VPlan.h:4279
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4427
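The VPlan accessors listed above expose a candidate plan's top-level structure. A small read-only sketch; the summary struct is illustrative and not part of VPlan:
#include "VPlan.h"
using namespace llvm;

// Sketch: gather a few top-level properties of a candidate plan.
struct PlanSummary {
  bool Unrolled;
  bool HasScalableVF;
  unsigned UF;
};

static PlanSummary summarize(const VPlan &Plan) {
  return {Plan.isUnrolled(), Plan.hasScalableVF(), Plan.getUF()};
}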
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsigned-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount, Op0_t, Op1_t > m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::BranchOnCond, Op0_t > m_BranchOnCond(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
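The VPlanPatternMatch helpers above mirror llvm::PatternMatch. A hedged sketch of recognizing a BranchOnCount terminator with them:
#include "VPlan.h"
#include "VPlanPatternMatch.h"
using namespace llvm;

// Sketch: return true if R is a BranchOnCount over two arbitrary VPValues,
// the shape of the canonical counted-loop latch terminator.
static bool isBranchOnCount(VPRecipeBase *R) {
  using namespace VPlanPatternMatch;
  return match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
}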
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
Definition VPlanUtils.h:44
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool isHeaderMask(const VPValue *V, VPlan &Plan)
Return true if V is a header mask in Plan.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:646
auto cast_or_null(const Y &Val)
Definition Casting.h:720
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
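vp_depth_first_shallow and vp_depth_first_deep above are the two standard block traversals (skipping vs. entering nested regions). A hedged sketch combining the deep traversal with blocksOnly and make_early_inc_range to erase recipes while iterating; the erase predicate is an assumption supplied by the caller and is expected to select only recipes whose results are unused:
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Sketch: walk every VPBasicBlock reachable from Entry, including blocks
// nested inside regions, and erase recipes the caller's predicate selects.
static void eraseMatchingRecipes(VPBlockBase *Entry,
                                 function_ref<bool(VPRecipeBase &)> ShouldErase) {
  for (VPBasicBlock *VPBB :
       VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_deep(Entry)))
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      if (ShouldErase(R))
        R.eraseFromParent();
}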
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1180
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:390
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
auto reverse(ContainerTy &&C)
Definition STLExtras.h:420
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1652
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:337
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1956
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:591
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2090
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:77
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:836
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:465
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2286
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3245
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3205
A recipe for widening select instructions.
Definition VPlan.h:1723
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3327
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3285
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
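Finally, the VPlanTransforms entry points listed above are normally invoked in sequence by the loop vectorizer's planning code. A hedged sketch of a minimal, illustrative ordering over an already-built plan; this is not the vectorizer's actual pipeline:
#include "VPlan.h"
#include "VPlanTransforms.h"
using namespace llvm;

// Sketch: run a few of the cleanup transforms declared above on a plan.
static void runSketchCleanup(VPlan &Plan) {
  VPlanTransforms::simplifyRecipes(Plan);
  VPlanTransforms::removeBranchOnConst(Plan);
  VPlanTransforms::removeDeadRecipes(Plan);
  VPlanTransforms::cse(Plan);
}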