1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlanPtr &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan->getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
82 vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep());
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
95 Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 VPIRMetadata(*Store), Ingredient.getDebugLoc());
101 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
102 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
103 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
104 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
105 if (VectorID == Intrinsic::not_intrinsic)
106 return false;
107 NewRecipe = new VPWidenIntrinsicRecipe(
108 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
109 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
110 CI->getDebugLoc());
111 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
112 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
113 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
114 NewRecipe = new VPWidenCastRecipe(
115 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
116 } else {
117 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
118 }
119 }
120
121 NewRecipe->insertBefore(&Ingredient);
122 if (NewRecipe->getNumDefinedValues() == 1)
123 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
124 else
125 assert(NewRecipe->getNumDefinedValues() == 0 &&
126 "Only recpies with zero or one defined values expected");
127 Ingredient.eraseFromParent();
128 }
129 }
130 return true;
131}
132
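/// Sink scalar recipes into the replicate blocks that use them, so their
/// results are only computed where needed. A candidate is cloned if users
/// outside the target block only demand its first lane; see the checks below
/// for the precise conditions.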
133static bool sinkScalarOperands(VPlan &Plan) {
134 auto Iter = vp_depth_first_deep(Plan.getEntry());
135 bool Changed = false;
136 // First, collect the operands of all recipes in replicate blocks as seeds for
137 // sinking.
138 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
139 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
140 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
141 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
142 continue;
143 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
144 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
145 continue;
146 for (auto &Recipe : *VPBB) {
147 for (VPValue *Op : Recipe.operands())
148 if (auto *Def =
149 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
150 WorkList.insert({VPBB, Def});
151 }
152 }
153
154 bool ScalarVFOnly = Plan.hasScalarVFOnly();
155 // Try to sink each replicate or scalar IV steps recipe in the worklist.
156 for (unsigned I = 0; I != WorkList.size(); ++I) {
157 VPBasicBlock *SinkTo;
158 VPSingleDefRecipe *SinkCandidate;
159 std::tie(SinkTo, SinkCandidate) = WorkList[I];
160 if (SinkCandidate->getParent() == SinkTo ||
161 SinkCandidate->mayHaveSideEffects() ||
162 SinkCandidate->mayReadOrWriteMemory())
163 continue;
164 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
165 if (!ScalarVFOnly && RepR->isSingleScalar())
166 continue;
167 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
168 continue;
169
170 bool NeedsDuplicating = false;
171 // All recipe users of the sink candidate must be in the same block SinkTo
172 // or all users outside of SinkTo must be uniform-after-vectorization
173 // (i.e., only the first lane is used). In the latter case, we need to duplicate
174 // SinkCandidate.
175 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
176 SinkCandidate](VPUser *U) {
177 auto *UI = cast<VPRecipeBase>(U);
178 if (UI->getParent() == SinkTo)
179 return true;
180 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
181 // We only know how to duplicate VPReplicateRecipes and
182 // VPScalarIVStepsRecipes for now.
183 return NeedsDuplicating &&
184 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
185 };
186 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
187 continue;
188
189 if (NeedsDuplicating) {
190 if (ScalarVFOnly)
191 continue;
192 VPSingleDefRecipe *Clone;
193 if (auto *SinkCandidateRepR =
194 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
195 // TODO: Handle converting to uniform recipes as separate transform,
196 // then cloning should be sufficient here.
197 Instruction *I = SinkCandidate->getUnderlyingInstr();
198 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
199 nullptr /*Mask*/, *SinkCandidateRepR);
200 // TODO: add ".cloned" suffix to name of Clone's VPValue.
201 } else {
202 Clone = SinkCandidate->clone();
203 }
204
205 Clone->insertBefore(SinkCandidate);
206 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
207 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
208 });
209 }
210 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
211 for (VPValue *Op : SinkCandidate->operands())
212 if (auto *Def =
213 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
214 WorkList.insert({SinkTo, Def});
215 Changed = true;
216 }
217 return Changed;
218}
219
220/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
221/// the mask.
222static VPValue *getPredicatedMask(VPRegionBlock *R) {
223 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
224 if (!EntryBB || EntryBB->size() != 1 ||
225 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
226 return nullptr;
227
228 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
229}
230
231/// If \p R is a triangle region, return the 'then' block of the triangle.
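/// Schematically (illustration only, not actual VPlan dump output), a triangle
/// region looks like:
///
///    entry (BranchOnMask %mask)
///      |        \
///      |        then
///      |        /
///    exiting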
232static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
233 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
234 if (EntryBB->getNumSuccessors() != 2)
235 return nullptr;
236
237 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
238 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
239 if (!Succ0 || !Succ1)
240 return nullptr;
241
242 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
243 return nullptr;
244 if (Succ0->getSingleSuccessor() == Succ1)
245 return Succ0;
246 if (Succ1->getSingleSuccessor() == Succ0)
247 return Succ1;
248 return nullptr;
249}
250
251// Merge replicate regions in their successor region, if a replicate region
252// is connected to a successor replicate region with the same predicate by a
253// single, empty VPBasicBlock.
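// For illustration (schematic only): two replicate regions R1 and R2 guarded
// by the same mask and separated by an empty block,
//   R1 -> empty VPBB -> R2
// are merged by moving R1's recipes into R2 and re-routing R1's predecessors
// to the empty block, leaving R1 disconnected and dead.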
254static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
255 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
256
257 // Collect replicate regions followed by an empty block, followed by another
258 // replicate region with matching masks to process front. This is to avoid
259 // iterator invalidation issues while merging regions.
260 SmallVector<VPRegionBlock *, 8> WorkList;
261 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
262 vp_depth_first_deep(Plan.getEntry()))) {
263 if (!Region1->isReplicator())
264 continue;
265 auto *MiddleBasicBlock =
266 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
267 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
268 continue;
269
270 auto *Region2 =
271 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
272 if (!Region2 || !Region2->isReplicator())
273 continue;
274
275 VPValue *Mask1 = getPredicatedMask(Region1);
276 VPValue *Mask2 = getPredicatedMask(Region2);
277 if (!Mask1 || Mask1 != Mask2)
278 continue;
279
280 assert(Mask1 && Mask2 && "both regions must have conditions");
281 WorkList.push_back(Region1);
282 }
283
284 // Move recipes from Region1 to its successor region, if both are triangles.
285 for (VPRegionBlock *Region1 : WorkList) {
286 if (TransformedRegions.contains(Region1))
287 continue;
288 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
289 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
290
291 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
292 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
293 if (!Then1 || !Then2)
294 continue;
295
296 // Note: No fusion-preventing memory dependencies are expected in either
297 // region. Such dependencies should be rejected during earlier dependence
298 // checks, which guarantee accesses can be re-ordered for vectorization.
299 //
300 // Move recipes to the successor region.
301 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
302 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
303
304 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
305 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
306
307 // Move VPPredInstPHIRecipes from the merge block to the successor region's
308 // merge block. Update all users inside the successor region to use the
309 // original values.
310 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
311 VPValue *PredInst1 =
312 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
313 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
314 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
315 return cast<VPRecipeBase>(&U)->getParent() == Then2;
316 });
317
318 // Remove phi recipes that are unused after merging the regions.
319 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
320 Phi1ToMove.eraseFromParent();
321 continue;
322 }
323 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
324 }
325
326 // Remove the dead recipes in Region1's entry block.
327 for (VPRecipeBase &R :
328 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
329 R.eraseFromParent();
330
331 // Finally, remove the first region.
332 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
333 VPBlockUtils::disconnectBlocks(Pred, Region1);
334 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
335 }
336 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
337 TransformedRegions.insert(Region1);
338 }
339
340 return !TransformedRegions.empty();
341}
342
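/// Build a triangular if-then replicate region for the predicated replicate
/// recipe \p PredRecipe: an entry block with a BranchOnMask, an ".if" block
/// holding the unmasked replicate recipe, and a ".continue" block holding an
/// optional VPPredInstPHIRecipe for users of the predicated value.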
343static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
344 VPlan &Plan) {
345 Instruction *Instr = PredRecipe->getUnderlyingInstr();
346 // Build the triangular if-then region.
347 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
348 assert(Instr->getParent() && "Predicated instruction not in any basic block");
349 auto *BlockInMask = PredRecipe->getMask();
350 auto *MaskDef = BlockInMask->getDefiningRecipe();
351 auto *BOMRecipe = new VPBranchOnMaskRecipe(
352 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
353 auto *Entry =
354 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
355
356 // Replace predicated replicate recipe with a replicate recipe without a
357 // mask but in the replicate region.
358 auto *RecipeWithoutMask = new VPReplicateRecipe(
359 PredRecipe->getUnderlyingInstr(),
360 make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
361 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
362 auto *Pred =
363 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
364
365 VPPredInstPHIRecipe *PHIRecipe = nullptr;
366 if (PredRecipe->getNumUsers() != 0) {
367 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
368 RecipeWithoutMask->getDebugLoc());
369 PredRecipe->replaceAllUsesWith(PHIRecipe);
370 PHIRecipe->setOperand(0, RecipeWithoutMask);
371 }
372 PredRecipe->eraseFromParent();
373 auto *Exiting =
374 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
375 VPRegionBlock *Region =
376 Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
377
378 // Note: first set Entry as region entry and then connect successors starting
379 // from it in order, to propagate the "parent" of each VPBasicBlock.
380 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
381 VPBlockUtils::connectBlocks(Pred, Exiting);
382
383 return Region;
384}
385
386static void addReplicateRegions(VPlan &Plan) {
387 SmallVector<VPReplicateRecipe *> WorkList;
388 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
389 vp_depth_first_deep(Plan.getEntry()))) {
390 for (VPRecipeBase &R : *VPBB)
391 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
392 if (RepR->isPredicated())
393 WorkList.push_back(RepR);
394 }
395 }
396
397 unsigned BBNum = 0;
398 for (VPReplicateRecipe *RepR : WorkList) {
399 VPBasicBlock *CurrentBlock = RepR->getParent();
400 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
401
402 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
403 SplitBlock->setName(
404 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
405 // Record predicated instructions for above packing optimizations.
406 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
407 Region->setParent(CurrentBlock->getParent());
408 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
409
410 VPRegionBlock *ParentRegion = Region->getParent();
411 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
412 ParentRegion->setExiting(SplitBlock);
413 }
414}
415
416/// Remove redundant VPBasicBlocks by merging them into their predecessor if
417/// the predecessor has a single successor.
418static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
419 SmallVector<VPBasicBlock *> WorkList;
420 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
421 vp_depth_first_deep(Plan.getEntry()))) {
422 // Don't fold the blocks in the skeleton of the Plan into their single
423 // predecessors for now.
424 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
425 if (!VPBB->getParent())
426 continue;
427 auto *PredVPBB =
428 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
429 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
430 isa<VPIRBasicBlock>(PredVPBB))
431 continue;
432 WorkList.push_back(VPBB);
433 }
434
435 for (VPBasicBlock *VPBB : WorkList) {
436 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
437 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
438 R.moveBefore(*PredVPBB, PredVPBB->end());
439 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
440 auto *ParentRegion = VPBB->getParent();
441 if (ParentRegion && ParentRegion->getExiting() == VPBB)
442 ParentRegion->setExiting(PredVPBB);
443 for (auto *Succ : to_vector(VPBB->successors())) {
444 VPBlockUtils::disconnectBlocks(VPBB, Succ);
445 VPBlockUtils::connectBlocks(PredVPBB, Succ);
446 }
447 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
448 }
449 return !WorkList.empty();
450}
451
452void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
453 // Convert masked VPReplicateRecipes to if-then region blocks.
454 addReplicateRegions(Plan);
455
456 bool ShouldSimplify = true;
457 while (ShouldSimplify) {
458 ShouldSimplify = sinkScalarOperands(Plan);
459 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
460 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
461 }
462}
463
464/// Remove redundant casts of inductions.
465///
466/// Such redundant casts are casts of induction variables that can be ignored,
467/// because we already proved that the casted phi is equal to the uncasted phi
468/// in the vectorized loop. There is no need to vectorize the cast - the same
469/// value can be used for both the phi and casts in the vector loop.
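///
/// For example (illustrative only), if a chain "%t = trunc %iv; %e = sext %t"
/// was recorded for the induction %iv, users of %e are redirected to %iv and
/// the now-dead casts are cleaned up later.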
470static void removeRedundantInductionCasts(VPlan &Plan) {
471 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
472 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
473 if (!IV || IV->getTruncInst())
474 continue;
475
476 // A sequence of IR Casts has potentially been recorded for IV, which
477 // *must be bypassed* when the IV is vectorized, because the vectorized IV
478 // will produce the desired casted value. This sequence forms a def-use
479 // chain and is provided in reverse order, ending with the cast that uses
480 // the IV phi. Search for the recipe of the last cast in the chain and
481 // replace it with the original IV. Note that only the final cast is
482 // expected to have users outside the cast-chain and the dead casts left
483 // over will be cleaned up later.
484 auto &Casts = IV->getInductionDescriptor().getCastInsts();
485 VPValue *FindMyCast = IV;
486 for (Instruction *IRCast : reverse(Casts)) {
487 VPSingleDefRecipe *FoundUserCast = nullptr;
488 for (auto *U : FindMyCast->users()) {
489 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
490 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
491 FoundUserCast = UserCast;
492 break;
493 }
494 }
495 FindMyCast = FoundUserCast;
496 }
497 FindMyCast->replaceAllUsesWith(IV);
498 }
499}
500
501/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
502/// recipe, if it exists.
503static void removeRedundantCanonicalIVs(VPlan &Plan) {
504 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
505 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
506 for (VPUser *U : CanonicalIV->users()) {
507 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
508 if (WidenNewIV)
509 break;
510 }
511
512 if (!WidenNewIV)
513 return;
514
515 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
516 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
517 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
518
519 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
520 continue;
521
522 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
523 // everything WidenNewIV's users need. That is, WidenOriginalIV will
524 // generate a vector phi or all users of WidenNewIV demand the first lane
525 // only.
526 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
527 vputils::onlyFirstLaneUsed(WidenNewIV)) {
528 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
529 WidenNewIV->eraseFromParent();
530 return;
531 }
532 }
533}
534
535/// Returns true if \p R is dead and can be removed.
536static bool isDeadRecipe(VPRecipeBase &R) {
537 // Do remove conditional assume instructions as their conditions may be
538 // flattened.
539 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
540 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
541 match(RepR->getUnderlyingInstr(), PatternMatch::m_Intrinsic<Intrinsic::assume>());
542 if (IsConditionalAssume)
543 return true;
544
545 if (R.mayHaveSideEffects())
546 return false;
547
548 // Recipe is dead if no user keeps the recipe alive.
549 return all_of(R.definedValues(),
550 [](VPValue *V) { return V->getNumUsers() == 0; });
551}
552
553void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
554 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
555 vp_post_order_deep(Plan.getEntry()))) {
556 // The recipes in the block are processed in reverse order, to catch chains
557 // of dead recipes.
558 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
559 if (isDeadRecipe(R)) {
560 R.eraseFromParent();
561 continue;
562 }
563
564 // Check if R is a dead VPPhi <-> update cycle and remove it.
565 auto *PhiR = dyn_cast<VPPhi>(&R);
566 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
567 continue;
568 VPValue *Incoming = PhiR->getOperand(1);
569 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
570 Incoming->getNumUsers() != 1)
571 continue;
572 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
573 PhiR->eraseFromParent();
574 Incoming->getDefiningRecipe()->eraseFromParent();
575 }
576 }
577}
578
579static VPScalarIVStepsRecipe *
580createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
581 Instruction::BinaryOps InductionOpcode,
582 FPMathOperator *FPBinOp, Instruction *TruncI,
583 VPValue *StartV, VPValue *Step, DebugLoc DL,
584 VPBuilder &Builder) {
585 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
586 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
587 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
588 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
589
590 // Truncate base induction if needed.
591 VPTypeAnalysis TypeInfo(Plan);
592 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
593 if (TruncI) {
594 Type *TruncTy = TruncI->getType();
595 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
596 "Not truncating.");
597 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
598 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
599 ResultTy = TruncTy;
600 }
601
602 // Truncate step if needed.
603 Type *StepTy = TypeInfo.inferScalarType(Step);
604 if (ResultTy != StepTy) {
605 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
606 "Not truncating.");
607 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
608 auto *VecPreheader =
609 cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
610 VPBuilder::InsertPointGuard Guard(Builder);
611 Builder.setInsertPoint(VecPreheader);
612 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
613 }
614 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
615 &Plan.getVF(), DL);
616}
617
618static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
619 SetVector<VPUser *> Users(V->user_begin(), V->user_end());
620 for (unsigned I = 0; I != Users.size(); ++I) {
621 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
622 if (isa<VPHeaderPHIRecipe>(Cur))
623 continue;
624 for (VPValue *V : Cur->definedValues())
625 Users.insert_range(V->users());
626 }
627 return Users.takeVector();
628}
629
630/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
631/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
632/// VPWidenPointerInductionRecipe will generate vectors only. If some users
633/// require vectors while other require scalars, the scalar uses need to extract
634/// the scalars from the generated vectors (Note that this is different to how
635/// int/fp inductions are handled). Legalize extract-from-ends using uniform
636/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
637/// the correct end value is available. Also optimize
638/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
639/// providing them scalar steps built on the canonical scalar IV and update the
640/// original IV's users. This is an optional optimization to reduce the needs of
641/// vector extracts.
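///
/// For example (illustrative only), a wide pointer induction whose users only
/// need scalar addresses is rewritten roughly as:
///   %steps    = scalar-steps 0, %step
///   %next.gep = ptradd %start, %steps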
642static void legalizeAndOptimizeInductions(VPlan &Plan) {
643 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
644 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
645 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
646 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
647 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
648 if (!PhiR)
649 continue;
650
651 // Try to narrow wide and replicating recipes to uniform recipes, based on
652 // VPlan analysis.
653 // TODO: Apply to all recipes in the future, to replace legacy uniformity
654 // analysis.
655 auto Users = collectUsersRecursively(PhiR);
656 for (VPUser *U : reverse(Users)) {
657 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
658 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
659 // Skip recipes that shouldn't be narrowed.
660 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
661 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
662 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
663 continue;
664
665 // Skip recipes that may have other lanes than their first used.
666 if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
667 continue;
668
669 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
670 Def->operands(), /*IsUniform*/ true);
671 Clone->insertAfter(Def);
672 Def->replaceAllUsesWith(Clone);
673 }
674
675 // Replace wide pointer inductions which have only their scalars used by
676 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
677 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
678 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
679 continue;
680
681 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
682 VPValue *StartV =
683 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
684 VPValue *StepV = PtrIV->getOperand(1);
685 VPValue *Steps = createScalarIVSteps(
686 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
687 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
688
689 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
690 PtrIV->getDebugLoc(), "next.gep");
691
692 PtrIV->replaceAllUsesWith(PtrAdd);
693 continue;
694 }
695
696 // Replace widened induction with scalar steps for users that only use
697 // scalars.
698 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
699 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
700 return U->usesScalars(WideIV);
701 }))
702 continue;
703
704 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
705 VPValue *Steps = createScalarIVSteps(
706 Plan, ID.getKind(), ID.getInductionOpcode(),
707 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
708 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
709 WideIV->getDebugLoc(), Builder);
710
711 // Update scalar users of IV to use Step instead.
712 if (!HasOnlyVectorVFs)
713 WideIV->replaceAllUsesWith(Steps);
714 else
715 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
716 return U.usesScalars(WideIV);
717 });
718 }
719}
720
721/// Check if \p VPV is an untruncated wide induction, either before or after the
722/// increment. If so return the header IV (before the increment), otherwise
723/// return null.
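///
/// For example (illustrative only), for a wide induction %iv with step %step,
/// both %iv itself and its increment "%iv.next = add %iv, %step" are accepted,
/// and %iv is returned in either case.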
724static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
725 ScalarEvolution &SE) {
726 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
727 if (WideIV) {
728 // VPV itself is a wide induction, separately compute the end value for exit
729 // users if it is not a truncated IV.
730 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
731 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
732 }
733
734 // Check if VPV is an optimizable induction increment.
735 VPRecipeBase *Def = VPV->getDefiningRecipe();
736 if (!Def || Def->getNumOperands() != 2)
737 return nullptr;
738 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
739 if (!WideIV)
740 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
741 if (!WideIV)
742 return nullptr;
743
744 auto IsWideIVInc = [&]() {
745 auto &ID = WideIV->getInductionDescriptor();
746
747 // Check if VPV increments the induction by the induction step.
748 VPValue *IVStep = WideIV->getStepValue();
749 switch (ID.getInductionOpcode()) {
750 case Instruction::Add:
751 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
752 case Instruction::FAdd:
753 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
754 m_Specific(IVStep)));
755 case Instruction::FSub:
756 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
757 m_Specific(IVStep)));
758 case Instruction::Sub: {
759 // IVStep will be the negated step of the subtraction. Check if Step == -1
760 // * IVStep.
761 VPValue *Step;
762 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
763 return false;
764 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
765 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
766 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
767 !isa<SCEVCouldNotCompute>(StepSCEV) &&
768 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
769 }
770 default:
771 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
772 match(VPV, m_GetElementPtr(m_Specific(WideIV),
773 m_Specific(WideIV->getStepValue())));
774 }
775 llvm_unreachable("should have been covered by switch above");
776 };
777 return IsWideIVInc() ? WideIV : nullptr;
778}
779
780/// Attempts to optimize the induction variable exit values for users in the
781/// early exit block.
782static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
783 VPTypeAnalysis &TypeInfo,
784 VPBlockBase *PredVPBB,
785 VPValue *Op,
786 ScalarEvolution &SE) {
787 VPValue *Incoming, *Mask;
788 if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
789 m_VPInstruction<VPInstruction::FirstActiveLane>(
790 m_VPValue(Mask)),
791 m_VPValue(Incoming))))
792 return nullptr;
793
794 auto *WideIV = getOptimizableIVOf(Incoming, SE);
795 if (!WideIV)
796 return nullptr;
797
798 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
799 if (WideIntOrFp && WideIntOrFp->getTruncInst())
800 return nullptr;
801
802 // Calculate the final index.
803 VPValue *EndValue = Plan.getCanonicalIV();
804 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
805 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
806
807 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
808 VPValue *FirstActiveLane =
809 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
810 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
811 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
812 FirstActiveLaneType, DL);
813 EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
814
815 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
816 // changed it means the exit is using the incremented value, so we need to
817 // add the step.
818 if (Incoming != WideIV) {
819 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
820 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
821 }
822
823 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
824 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
825 VPValue *Start = WideIV->getStartValue();
826 VPValue *Step = WideIV->getStepValue();
827 EndValue = B.createDerivedIV(
828 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
829 Start, EndValue, Step);
830 }
831
832 return EndValue;
833}
834
835/// Attempts to optimize the induction variable exit values for users in the
836/// exit block coming from the latch in the original scalar loop.
837static VPValue *optimizeLatchExitInductionUser(
838 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
839 DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
840 VPValue *Incoming;
841 if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
842 return nullptr;
843
844 auto *WideIV = getOptimizableIVOf(Incoming, SE);
845 if (!WideIV)
846 return nullptr;
847
848 VPValue *EndValue = EndValues.lookup(WideIV);
849 assert(EndValue && "end value must have been pre-computed");
850
851 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
852 // changed it means the exit is using the incremented value, so we don't
853 // need to subtract the step.
854 if (Incoming != WideIV)
855 return EndValue;
856
857 // Otherwise, subtract the step from the EndValue.
858 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
859 VPValue *Step = WideIV->getStepValue();
860 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
861 if (ScalarTy->isIntegerTy())
862 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
863 if (ScalarTy->isPointerTy()) {
864 Type *StepTy = TypeInfo.inferScalarType(Step);
865 auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
866 return B.createPtrAdd(EndValue,
867 B.createNaryOp(Instruction::Sub, {Zero, Step}),
868 DebugLoc::getUnknown(), "ind.escape");
869 }
870 if (ScalarTy->isFloatingPointTy()) {
871 const auto &ID = WideIV->getInductionDescriptor();
872 return B.createNaryOp(
873 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
874 ? Instruction::FSub
875 : Instruction::FAdd,
876 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
877 }
878 llvm_unreachable("all possible induction types must be handled");
879 return nullptr;
880}
881
882void VPlanTransforms::optimizeInductionExitUsers(
883 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
884 ScalarEvolution &SE) {
885 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
886 VPTypeAnalysis TypeInfo(Plan);
887 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
888 for (VPRecipeBase &R : ExitVPBB->phis()) {
889 auto *ExitIRI = cast<VPIRPhi>(&R);
890
891 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
892 VPValue *Escape = nullptr;
893 if (PredVPBB == MiddleVPBB)
894 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
895 ExitIRI->getOperand(Idx),
896 EndValues, SE);
897 else
898 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
899 ExitIRI->getOperand(Idx), SE);
900 if (Escape)
901 ExitIRI->setOperand(Idx, Escape);
902 }
903 }
904 }
905}
906
907/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
908/// them with already existing recipes expanding the same SCEV expression.
909static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
910 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
911
912 for (VPRecipeBase &R :
913 make_early_inc_range(*Plan.getEntry())) {
914 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
915 if (!ExpR)
916 continue;
917
918 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
919 if (Inserted)
920 continue;
921 ExpR->replaceAllUsesWith(V->second);
922 ExpR->eraseFromParent();
923 }
924}
925
926static void recursivelyDeleteDeadRecipes(VPValue *V) {
927 SmallVector<VPValue *> WorkList;
928 SmallPtrSet<VPValue *, 8> Seen;
929 WorkList.push_back(V);
930
931 while (!WorkList.empty()) {
932 VPValue *Cur = WorkList.pop_back_val();
933 if (!Seen.insert(Cur).second)
934 continue;
935 VPRecipeBase *R = Cur->getDefiningRecipe();
936 if (!R)
937 continue;
938 if (!isDeadRecipe(*R))
939 continue;
940 WorkList.append(R->op_begin(), R->op_end());
941 R->eraseFromParent();
942 }
943}
944
945/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
946/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
947/// foldable live-ins.
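///
/// For example (illustrative only), an add whose operands are the live-in
/// constants 2 and 3 folds to the live-in constant 5, which then replaces the
/// recipe's result.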
948static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
949 ArrayRef<VPValue *> Operands,
950 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
951 SmallVector<Value *, 4> Ops;
952 for (VPValue *Op : Operands) {
953 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
954 return nullptr;
955 Ops.push_back(Op->getLiveInIRValue());
956 }
957
958 InstSimplifyFolder Folder(DL);
959 if (Instruction::isBinaryOp(Opcode))
960 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
961 Ops[1]);
962 if (Instruction::isCast(Opcode))
963 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
964 TypeInfo.inferScalarType(R.getVPSingleValue()));
965 switch (Opcode) {
966 case VPInstruction::LogicalAnd:
967 return Folder.FoldSelect(Ops[0], Ops[1],
968 ConstantInt::getNullValue(Ops[1]->getType()));
969 case VPInstruction::Not:
970 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
971 Constant::getAllOnesValue(Ops[0]->getType()));
972 case Instruction::Select:
973 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
974 case Instruction::ICmp:
975 case Instruction::FCmp:
976 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
977 Ops[1]);
978 case Instruction::GetElementPtr: {
979 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
980 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
981 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
982 RFlags.getGEPNoWrapFlags());
983 }
984 case VPInstruction::PtrAdd:
985 case VPInstruction::WidePtrAdd:
986 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
987 Ops[1],
988 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
989 // An extract of a live-in is an extract of a broadcast, so return the
990 // broadcasted element.
991 case Instruction::ExtractElement:
992 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
993 return Ops[0];
994 }
995 return nullptr;
996}
997
998/// Try to simplify recipe \p R.
999static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
1000 VPlan *Plan = R.getParent()->getPlan();
1001
1002 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1003 if (!Def)
1004 return;
1005
1006 // Simplification of live-in IR values for SingleDef recipes using
1007 // InstSimplifyFolder.
1008 if (TypeSwitch<VPRecipeBase *, bool>(&R)
1009 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1010 VPReplicateRecipe>([&](auto *I) {
1011 const DataLayout &DL =
1012 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1013 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1014 TypeInfo);
1015 if (V)
1016 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1017 return V;
1018 })
1019 .Default([](auto *) { return false; }))
1020 return;
1021
1022 // Fold PredPHI LiveIn -> LiveIn.
1023 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1024 VPValue *Op = PredPHI->getOperand(0);
1025 if (Op->isLiveIn())
1026 PredPHI->replaceAllUsesWith(Op);
1027 }
1028
1029 VPValue *A;
1030 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1031 Type *TruncTy = TypeInfo.inferScalarType(Def);
1032 Type *ATy = TypeInfo.inferScalarType(A);
1033 if (TruncTy == ATy) {
1034 Def->replaceAllUsesWith(A);
1035 } else {
1036 // Don't replace a scalarizing recipe with a widened cast.
1037 if (isa<VPReplicateRecipe>(Def))
1038 return;
1039 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1040
1041 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1042 ? Instruction::SExt
1043 : Instruction::ZExt;
1044 auto *VPC =
1045 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1046 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1047 // UnderlyingExt has distinct return type, used to retain legacy cost.
1048 VPC->setUnderlyingValue(UnderlyingExt);
1049 }
1050 VPC->insertBefore(&R);
1051 Def->replaceAllUsesWith(VPC);
1052 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1053 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1054 VPC->insertBefore(&R);
1055 Def->replaceAllUsesWith(VPC);
1056 }
1057 }
1058#ifndef NDEBUG
1059 // Verify that the cached type info for both A and its users is still
1060 // accurate by comparing it to freshly computed types.
1061 VPTypeAnalysis TypeInfo2(*Plan);
1062 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1063 for (VPUser *U : A->users()) {
1064 auto *R = cast<VPRecipeBase>(U);
1065 for (VPValue *VPV : R->definedValues())
1066 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1067 }
1068#endif
1069 }
1070
1071 // Simplify (X && Y) || (X && !Y) -> X.
1072 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1073 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1074 // recipes to be visited during simplification.
1075 VPValue *X, *Y, *Z;
1076 if (match(Def,
1077 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1078 m_LogicalAnd(m_Specific(X), m_Not(m_Specific(Y)))))) {
1079 Def->replaceAllUsesWith(X);
1080 Def->eraseFromParent();
1081 return;
1082 }
1083
1084 // x | 1 -> 1
1085 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1086 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1087
1088 // x | 0 -> x
1089 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1090 return Def->replaceAllUsesWith(X);
1091
1092 // x & 0 -> 0
1093 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1094 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1095
1096 // x && false -> false
1097 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1098 return Def->replaceAllUsesWith(Def->getOperand(1));
1099
1100 // (x && y) || (x && z) -> x && (y || z)
1101 VPBuilder Builder(Def);
1102 if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1103 m_LogicalAnd(m_Specific(X), m_VPValue(Z)))) &&
1104 // Simplify only if one of the operands has one use to avoid creating an
1105 // extra recipe.
1106 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1107 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1108 return Def->replaceAllUsesWith(
1109 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1110
1111 // x && !x -> 0
1112 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1113 return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
1114 ConstantInt::getNullValue(TypeInfo.inferScalarType(Def))));
1115
1116 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1117 return Def->replaceAllUsesWith(X);
1118
1119 // select !c, x, y -> select c, y, x
1120 VPValue *C;
1121 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1122 Def->setOperand(0, C);
1123 Def->setOperand(1, Y);
1124 Def->setOperand(2, X);
1125 return;
1126 }
1127
1128 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1129 // tail folding it is likely that x is a header mask and can be simplified
1130 // further.
1131 if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1132 m_VPValue(Z))) &&
1133 X->hasMoreThanOneUniqueUser())
1134 return Def->replaceAllUsesWith(
1135 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1136
1137 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1138 return Def->replaceAllUsesWith(A);
1139
1140 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1141 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1142 : R.getOperand(0));
1143
1144 if (match(Def, m_Not(m_VPValue(A)))) {
1145 if (match(A, m_Not(m_VPValue(A))))
1146 return Def->replaceAllUsesWith(A);
1147
1148 // Try to fold Not into compares by adjusting the predicate in-place.
1149 CmpPredicate Pred;
1150 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1151 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1152 if (all_of(Cmp->users(),
1154 m_Not(m_Specific(Cmp)),
1155 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1156 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1157 for (VPUser *U : to_vector(Cmp->users())) {
1158 auto *R = cast<VPSingleDefRecipe>(U);
1159 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1160 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1161 R->setOperand(1, Y);
1162 R->setOperand(2, X);
1163 } else {
1164 // not (cmp pred) -> cmp inv_pred
1165 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1166 R->replaceAllUsesWith(Cmp);
1167 }
1168 }
1169 // If Cmp doesn't have a debug location, use the one from the negation,
1170 // to preserve the location.
1171 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1172 Cmp->setDebugLoc(R.getDebugLoc());
1173 }
1174 }
1175 }
1176
1177 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1178 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1179 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1180 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1181 TypeInfo.inferScalarType(Def))
1182 return Def->replaceAllUsesWith(Def->getOperand(1));
1183
1184 if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(m_VPValue(X),
1185 m_One()))) {
1186 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1187 if (TypeInfo.inferScalarType(X) != WideStepTy)
1188 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1189 Def->replaceAllUsesWith(X);
1190 return;
1191 }
1192
1193 // For i1 vp.merges produced by AnyOf reductions:
1194 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1196 m_VPValue(X), m_VPValue())) &&
1198 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1199 Def->setOperand(1, Def->getOperand(0));
1200 Def->setOperand(0, Y);
1201 return;
1202 }
1203
1204 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1205 if (Phi->getOperand(0) == Phi->getOperand(1))
1206 Def->replaceAllUsesWith(Phi->getOperand(0));
1207 return;
1208 }
1209
1210 // Look through ExtractLastElement (BuildVector ....).
1212 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1213 Def->replaceAllUsesWith(
1214 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1215 return;
1216 }
1217
1218 // Look through ExtractPenultimateElement (BuildVector ....).
1220 m_BuildVector()))) {
1221 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1222 Def->replaceAllUsesWith(
1223 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1224 return;
1225 }
1226
1227 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1228 if (Phi->getNumOperands() == 1)
1229 Phi->replaceAllUsesWith(Phi->getOperand(0));
1230 return;
1231 }
1232
1233 // Some simplifications can only be applied after unrolling. Perform them
1234 // below.
1235 if (!Plan->isUnrolled())
1236 return;
1237
1238 // VPVectorPointer for part 0 can be replaced by their start pointer.
1239 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1240 if (VecPtr->isFirstPart()) {
1241 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1242 return;
1243 }
1244 }
1245
1246 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1247 // the first lane is demanded.
1248 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1249 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1250 Steps->replaceAllUsesWith(Steps->getOperand(0));
1251 return;
1252 }
1253 }
1254 // Simplify redundant ReductionStartVector recipes after unrolling.
1255 VPValue *StartV;
1256 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1257 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1258 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1259 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1260 return PhiR && PhiR->isInLoop();
1261 });
1262 return;
1263 }
1264
1266 Def->replaceAllUsesWith(A);
1267 return;
1268 }
1269
1270 if (match(Def,
1274 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1275 all_of(A->users(),
1276 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1277 return Def->replaceAllUsesWith(A);
1278 }
1279}
1280
1281void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1282 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1283 Plan.getEntry());
1284 VPTypeAnalysis TypeInfo(Plan);
1285 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1286 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1287 simplifyRecipe(R, TypeInfo);
1288 }
1289 }
1290}
1291
1292void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1293 if (Plan.hasScalarVFOnly())
1294 return;
1295
1296 // Try to narrow wide and replicating recipes to single scalar recipes,
1297 // based on VPlan analysis. Only process blocks in the loop region for now,
1298 // without traversing into nested regions, as recipes in replicate regions
1299 // cannot be converted yet.
1300 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1301 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1302 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1304 continue;
1305 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1306 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1307 continue;
1308
1309 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1310 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1311 vputils::isSingleScalar(RepR->getOperand(1))) {
1312 auto *Clone = new VPReplicateRecipe(
1313 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1314 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1315 Clone->insertBefore(RepOrWidenR);
1316 auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
1317 {Clone->getOperand(0)});
1318 Ext->insertBefore(Clone);
1319 Clone->setOperand(0, Ext);
1320 RepR->eraseFromParent();
1321 continue;
1322 }
1323
1324 // Skip recipes that aren't single scalars or don't have only their
1325 // scalar results used. In the latter case, we would introduce extra
1326 // broadcasts.
1327 if (!vputils::isSingleScalar(RepOrWidenR) ||
1328 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1329 return U->usesScalars(RepOrWidenR) ||
1330 match(cast<VPRecipeBase>(U),
1331 m_ExtractLastElement(m_VPValue()));
1332 }))
1333 continue;
1334
1335 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1336 RepOrWidenR->operands(),
1337 true /*IsSingleScalar*/);
1338 Clone->insertBefore(RepOrWidenR);
1339 RepOrWidenR->replaceAllUsesWith(Clone);
1340 }
1341 }
1342}
1343
1344/// Try to see if all of \p Blend's masks share a common value logically and'ed
1345/// and remove it from the masks.
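///
/// For example (schematic), incoming masks (%m && %a) and (%m && %b) are
/// rewritten to %a and %b.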
1346static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1347 if (Blend->isNormalized())
1348 return;
1349 VPValue *CommonEdgeMask;
1350 if (!match(Blend->getMask(0),
1351 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1352 return;
1353 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1354 if (!match(Blend->getMask(I),
1355 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1356 return;
1357 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1358 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1359}
1360
1361/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1362/// to make sure the masks are simplified.
1363static void simplifyBlends(VPlan &Plan) {
1364 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1365 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1366 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1367 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1368 if (!Blend)
1369 continue;
1370
1371 removeCommonBlendMask(Blend);
1372
1373 // Try to remove redundant blend recipes.
1374 SmallPtrSet<VPValue *, 4> UniqueValues;
1375 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1376 UniqueValues.insert(Blend->getIncomingValue(0));
1377 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1378 if (!match(Blend->getMask(I), m_False()))
1379 UniqueValues.insert(Blend->getIncomingValue(I));
1380
1381 if (UniqueValues.size() == 1) {
1382 Blend->replaceAllUsesWith(*UniqueValues.begin());
1383 Blend->eraseFromParent();
1384 continue;
1385 }
1386
1387 if (Blend->isNormalized())
1388 continue;
1389
1390 // Normalize the blend so its first incoming value is used as the initial
1391 // value with the others blended into it.
1392
1393 unsigned StartIndex = 0;
1394 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1395 // If a value's mask is used only by the blend then it can be deadcoded.
1396 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1397 // that's used by multiple blends where it can be removed from them all.
1398 VPValue *Mask = Blend->getMask(I);
1399 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1400 StartIndex = I;
1401 break;
1402 }
1403 }
1404
1405 SmallVector<VPValue *, 4> OperandsWithMask;
1406 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1407
1408 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1409 if (I == StartIndex)
1410 continue;
1411 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1412 OperandsWithMask.push_back(Blend->getMask(I));
1413 }
1414
1415 auto *NewBlend =
1416 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1417 OperandsWithMask, Blend->getDebugLoc());
1418 NewBlend->insertBefore(&R);
1419
1420 VPValue *DeadMask = Blend->getMask(StartIndex);
1421 Blend->replaceAllUsesWith(NewBlend);
1422 Blend->eraseFromParent();
1423 recursivelyDeleteDeadRecipes(DeadMask);
1424
1425 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1426 VPValue *NewMask;
1427 if (NewBlend->getNumOperands() == 3 &&
1428 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1429 VPValue *Inc0 = NewBlend->getOperand(0);
1430 VPValue *Inc1 = NewBlend->getOperand(1);
1431 VPValue *OldMask = NewBlend->getOperand(2);
1432 NewBlend->setOperand(0, Inc1);
1433 NewBlend->setOperand(1, Inc0);
1434 NewBlend->setOperand(2, NewMask);
1435 if (OldMask->getNumUsers() == 0)
1436 cast<VPInstruction>(OldMask)->eraseFromParent();
1437 }
1438 }
1439 }
1440}
1441
1442/// Optimize the width of vector induction variables in \p Plan based on a known
1443/// constant Trip Count, \p BestVF and \p BestUF.
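///
/// For example (illustrative only), with a known trip count of 100, VF=4 and
/// UF=2 the trip count aligned up to 8 is 104, whose largest value 103 needs 7
/// bits, so the induction is shrunk to the minimum supported width of 8 bits.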
1444static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1445 ElementCount BestVF,
1446 unsigned BestUF) {
1447 // Only proceed if we have not completely removed the vector region.
1448 if (!Plan.getVectorLoopRegion())
1449 return false;
1450
1451 if (!Plan.getTripCount()->isLiveIn())
1452 return false;
1453 auto *TC = dyn_cast_if_present<ConstantInt>(
1454 Plan.getTripCount()->getUnderlyingValue());
1455 if (!TC || !BestVF.isFixed())
1456 return false;
1457
1458 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1459 // and UF. Returns at least 8.
1460 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1461 APInt AlignedTC =
1462 Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
1463 APInt::Rounding::UP);
1464 APInt MaxVal = AlignedTC - 1;
1465 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1466 };
1467 unsigned NewBitWidth =
1468 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1469
1470 LLVMContext &Ctx = Plan.getContext();
1471 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1472
1473 bool MadeChange = false;
1474
1475 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1476 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1477 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1478
1479 // Currently only handle canonical IVs as it is trivial to replace the start
1480 // and stop values, and we currently only perform the optimization when the
1481 // IV has a single use.
1482 if (!WideIV || !WideIV->isCanonical() ||
1483 WideIV->hasMoreThanOneUniqueUser() ||
1484 NewIVTy == WideIV->getScalarType())
1485 continue;
1486
1487 // Currently only handle cases where the single user is a header-mask
1488 // comparison with the backedge-taken-count.
1489 if (!match(*WideIV->user_begin(),
1490 m_ICmp(m_Specific(WideIV),
1493 continue;
1494
1495 // Update IV operands and comparison bound to use new narrower type.
1496 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1497 WideIV->setStartValue(NewStart);
1498 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1499 WideIV->setStepValue(NewStep);
1500
1501 auto *NewBTC = new VPWidenCastRecipe(
1502 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1503 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1504 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1505 Cmp->setOperand(1, NewBTC);
1506
1507 MadeChange = true;
1508 }
1509
1510 return MadeChange;
1511}
1512
1513/// Return true if \p Cond is known to be true for given \p BestVF and \p
1514/// BestUF.
1515static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1516 ElementCount BestVF, unsigned BestUF,
1517 ScalarEvolution &SE) {
1518 if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))
1519 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1520 &SE](VPValue *C) {
1521 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1522 });
1523
1524 auto *CanIV = Plan.getCanonicalIV();
1525 if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,
1526 m_Specific(CanIV->getBackedgeValue()),
1527 m_Specific(&Plan.getVectorTripCount()))))
1528 return false;
1529
1530 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1531 // count is not conveniently available as SCEV so far, so we compare directly
1532 // against the original trip count. This is stricter than necessary, as we
1533 // will only return true if the trip count == vector trip count.
1534 const SCEV *VectorTripCount =
1535 vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
1536 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1537 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1538 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1539 "Trip count SCEV must be computable");
1540 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1541 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1542 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1543}
1544
1545/// Try to replace multiple active lane masks used for control flow with
1546/// a single, wide active lane mask instruction followed by multiple
1547/// extract subvector intrinsics. This applies to the active lane mask
1548/// instructions both in the loop and in the preheader.
1549/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1550/// new extracts from the first active lane mask, which has its last
1551/// operand (multiplier) set to UF.
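///
/// For example (illustrative only), with VF=vscale x 4 and UF=2 the two
/// per-part <vscale x 4 x i1> masks are replaced by a single <vscale x 8 x i1>
/// get.active.lane.mask whose halves are obtained via llvm.vector.extract at
/// element offsets 0 and 4.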
1553 unsigned UF) {
1554 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1555 return false;
1556
1557 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1558 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1559 auto *Term = &ExitingVPBB->back();
1560
1561 using namespace llvm::VPlanPatternMatch;
1562 if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1563 m_VPValue(), m_VPValue(), m_VPValue())))))
1564 return false;
1565
1566 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1567 LLVMContext &Ctx = Plan.getContext();
1568
1569 auto ExtractFromALM = [&](VPInstruction *ALM,
1570 SmallVectorImpl<VPValue *> &Extracts) {
1571 DebugLoc DL = ALM->getDebugLoc();
1572 for (unsigned Part = 0; Part < UF; ++Part) {
1573 SmallVector<VPValue *> Ops;
1574 Ops.append({ALM, Plan.getOrAddLiveIn(
1575 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1576 VF.getKnownMinValue() * Part))});
1577 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1579 Extracts[Part] = Ext;
1580 Ext->insertAfter(ALM);
1581 }
1582 };
1583
1584 // Create a list of each active lane mask phi, ordered by unroll part.
1585 SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1586 for (VPRecipeBase &R : Header->phis()) {
1587 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1588 if (!Phi)
1589 continue;
1590 VPValue *Index = nullptr;
1591 match(Phi->getBackedgeValue(),
1592 m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
1593 assert(Index && "Expected index from ActiveLaneMask instruction");
1594
1595 uint64_t Part;
1596 if (match(Index,
1597 m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
1598 m_VPValue(), m_ConstantInt(Part))))
1599 Phis[Part] = Phi;
1600 else
1601 // Anything other than a CanonicalIVIncrementForPart is part 0
1602 Phis[0] = Phi;
1603 }
1604
1605 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1606 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1607
1608 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1609 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1610
1611 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1612 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1613 "Expected incoming values of Phi to be ActiveLaneMasks");
1614
1615 // When using wide lane masks, the return type of the get.active.lane.mask
1616 // intrinsic is VF x UF (last operand).
1617 VPValue *ALMMultiplier =
1618 Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
1619 EntryALM->setOperand(2, ALMMultiplier);
1620 LoopALM->setOperand(2, ALMMultiplier);
1621
1622 // Create UF x extract vectors and insert into preheader.
1623 SmallVector<VPValue *> EntryExtracts(UF);
1624 ExtractFromALM(EntryALM, EntryExtracts);
1625
1626 // Create UF x extract vectors and insert before the loop compare & branch,
1627 // updating the compare to use the first extract.
1628 SmallVector<VPValue *> LoopExtracts(UF);
1629 ExtractFromALM(LoopALM, LoopExtracts);
1630 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1631 Not->setOperand(0, LoopExtracts[0]);
1632
1633 // Update the incoming values of active lane mask phis.
1634 for (unsigned Part = 0; Part < UF; ++Part) {
1635 Phis[Part]->setStartValue(EntryExtracts[Part]);
1636 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1637 }
1638
1639 return true;
1640}
1641
1642/// Try to simplify the branch condition of \p Plan. This may restrict the
1643/// resulting plan to \p BestVF and \p BestUF.
1645 unsigned BestUF,
1647 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1648 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1649 auto *Term = &ExitingVPBB->back();
1650 VPValue *Cond;
1651 ScalarEvolution &SE = *PSE.getSE();
1652 if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1654 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1655 // Try to simplify the branch condition if TC <= VF * UF when the latch
1656 // terminator is BranchOnCount or BranchOnCond where the input is
1657 // Not(ActiveLaneMask).
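    // For example (illustrative numbers): with BestVF = 4 and BestUF = 2, a
    // trip count known to be at most 8 satisfies TC <= VF * UF, so the vector
    // loop region is known to execute only once.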
1658 const SCEV *TripCount =
1660 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1661 "Trip count SCEV must be computable");
1662 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1663 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1664 if (TripCount->isZero() ||
1665 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1666 return false;
1667 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1668 // For BranchOnCond, check if we can prove the condition to be true using VF
1669 // and UF.
1670 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1671 return false;
1672 } else {
1673 return false;
1674 }
1675
1676 // The vector loop region only executes once. If possible, completely remove
1677 // the region, otherwise replace the terminator controlling the latch with
1678 // (BranchOnCond true).
1679 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1680 // support for other non-canonical widen induction recipes (e.g.,
1681 // VPWidenPointerInductionRecipe).
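  // Sketch: when the header only contains supported phis (e.g. the canonical
  // IV), each phi is replaced by its start value and the region's blocks are
  // re-attached between the preheader and the exit; otherwise the latch simply
  // gets a BranchOnCond true.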
1682 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1683 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1684 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1685 return R->isCanonical();
1686 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1687 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1688 })) {
1689 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1690 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1691 VPBuilder Builder(Plan.getVectorPreheader());
1692 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1693 R->getScalarType());
1694 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1695 HeaderR.eraseFromParent();
1696 continue;
1697 }
1698 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1699 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1700 HeaderR.eraseFromParent();
1701 }
1702
1703 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1704 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1705 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1706 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1707
1708 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1709 B->setParent(nullptr);
1710
1711 VPBlockUtils::connectBlocks(Preheader, Header);
1712 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1714 } else {
1715 // The vector region contains header phis for which we cannot remove the
1716 // loop region yet.
1717 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1718 Term->getDebugLoc());
1719 ExitingVPBB->appendRecipe(BOC);
1720 }
1721
1722 Term->eraseFromParent();
1723
1724 return true;
1725}
1726
1728 unsigned BestUF,
1730 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1731 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1732
1733 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1734 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1735 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1736
1737 if (MadeChange) {
1738 Plan.setVF(BestVF);
1739 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1740 }
1741 // TODO: Further simplifications are possible
1742 // 1. Replace inductions with constants.
1743 // 2. Replace vector loop region with VPBasicBlock.
1744}
1745
1746/// Sink users of \p FOR after the recipe defining the previous value \p
1747/// Previous of the recurrence. \returns true if all users of \p FOR could be
1748/// re-arranged as needed or false if it is not possible.
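///
/// A minimal sketch (schematic recipes): given
///   %for  = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %use  = add %for, 1     ; user of the recurrence
///   %prev = load %p         ; previous value, defined after its user
/// the user %use is moved after %prev, so the recurrence can later be expanded
/// into a splice of %for and %prev.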
1749static bool
1751 VPRecipeBase *Previous,
1752 VPDominatorTree &VPDT) {
1753 // Collect recipes that need sinking.
1756 Seen.insert(Previous);
1757 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1758     // The previous value must not depend on the users of the recurrence phi;
1759     // if it did, FOR would not be a fixed-order recurrence.
1760 if (SinkCandidate == Previous)
1761 return false;
1762
1763 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1764 !Seen.insert(SinkCandidate).second ||
1765 VPDT.properlyDominates(Previous, SinkCandidate))
1766 return true;
1767
1768 if (SinkCandidate->mayHaveSideEffects())
1769 return false;
1770
1771 WorkList.push_back(SinkCandidate);
1772 return true;
1773 };
1774
1775 // Recursively sink users of FOR after Previous.
1776 WorkList.push_back(FOR);
1777 for (unsigned I = 0; I != WorkList.size(); ++I) {
1778 VPRecipeBase *Current = WorkList[I];
1779 assert(Current->getNumDefinedValues() == 1 &&
1780 "only recipes with a single defined value expected");
1781
1782 for (VPUser *User : Current->getVPSingleValue()->users()) {
1783 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1784 return false;
1785 }
1786 }
1787
1788 // Keep recipes to sink ordered by dominance so earlier instructions are
1789 // processed first.
1790 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1791 return VPDT.properlyDominates(A, B);
1792 });
1793
1794 for (VPRecipeBase *SinkCandidate : WorkList) {
1795 if (SinkCandidate == FOR)
1796 continue;
1797
1798 SinkCandidate->moveAfter(Previous);
1799 Previous = SinkCandidate;
1800 }
1801 return true;
1802}
1803
1804/// Try to hoist \p Previous and its operands before all users of \p FOR.
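///
/// A minimal sketch (schematic recipes): if
///   %prev = zext %x         ; no side effects, does not read memory
/// is defined after some user of \p FOR, %prev and any in-loop operands it
/// needs are moved before the earliest user of \p FOR instead of sinking those
/// users.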
1806 VPRecipeBase *Previous,
1807 VPDominatorTree &VPDT) {
1808 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1809 return false;
1810
1811 // Collect recipes that need hoisting.
1812 SmallVector<VPRecipeBase *> HoistCandidates;
1814 VPRecipeBase *HoistPoint = nullptr;
1815 // Find the closest hoist point by looking at all users of FOR and selecting
1816 // the recipe dominating all other users.
1817 for (VPUser *U : FOR->users()) {
1818 auto *R = cast<VPRecipeBase>(U);
1819 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1820 HoistPoint = R;
1821 }
1822 assert(all_of(FOR->users(),
1823 [&VPDT, HoistPoint](VPUser *U) {
1824 auto *R = cast<VPRecipeBase>(U);
1825 return HoistPoint == R ||
1826 VPDT.properlyDominates(HoistPoint, R);
1827 }) &&
1828 "HoistPoint must dominate all users of FOR");
1829
1830 auto NeedsHoisting = [HoistPoint, &VPDT,
1831 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1832 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1833 if (!HoistCandidate)
1834 return nullptr;
1835 VPRegionBlock *EnclosingLoopRegion =
1836 HoistCandidate->getParent()->getEnclosingLoopRegion();
1837 assert((!HoistCandidate->getParent()->getParent() ||
1838 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1839 "CFG in VPlan should still be flat, without replicate regions");
1840 // Hoist candidate was already visited, no need to hoist.
1841 if (!Visited.insert(HoistCandidate).second)
1842 return nullptr;
1843
1844     // The candidate is outside the loop region or is a header phi; it already
1845     // dominates the users of FOR without hoisting.
1846 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1847 return nullptr;
1848
1849 // If we reached a recipe that dominates HoistPoint, we don't need to
1850 // hoist the recipe.
1851 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1852 return nullptr;
1853 return HoistCandidate;
1854 };
1855 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1856 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1857 // associated dependencies.
1858 return !HoistCandidate->mayHaveSideEffects();
1859 };
1860
1861 if (!NeedsHoisting(Previous->getVPSingleValue()))
1862 return true;
1863
1864 // Recursively try to hoist Previous and its operands before all users of FOR.
1865 HoistCandidates.push_back(Previous);
1866
1867 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1868 VPRecipeBase *Current = HoistCandidates[I];
1869 assert(Current->getNumDefinedValues() == 1 &&
1870 "only recipes with a single defined value expected");
1871 if (!CanHoist(Current))
1872 return false;
1873
1874 for (VPValue *Op : Current->operands()) {
1875 // If we reach FOR, it means the original Previous depends on some other
1876 // recurrence that in turn depends on FOR. If that is the case, we would
1877 // also need to hoist recipes involving the other FOR, which may break
1878 // dependencies.
1879 if (Op == FOR)
1880 return false;
1881
1882 if (auto *R = NeedsHoisting(Op))
1883 HoistCandidates.push_back(R);
1884 }
1885 }
1886
1887 // Order recipes to hoist by dominance so earlier instructions are processed
1888 // first.
1889 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1890 return VPDT.properlyDominates(A, B);
1891 });
1892
1893 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1894 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1895 HoistPoint->getIterator());
1896 }
1897
1898 return true;
1899}
1900
1902 VPBuilder &LoopBuilder) {
1903 VPDominatorTree VPDT;
1904 VPDT.recalculate(Plan);
1905
1907 for (VPRecipeBase &R :
1910 RecurrencePhis.push_back(FOR);
1911
1912 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1914 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1915 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1916 // to terminate.
1917 while (auto *PrevPhi =
1919 assert(PrevPhi->getParent() == FOR->getParent());
1920 assert(SeenPhis.insert(PrevPhi).second);
1921 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1922 }
1923
1924 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1925 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1926 return false;
1927
1928 // Introduce a recipe to combine the incoming and previous values of a
1929 // fixed-order recurrence.
1930 VPBasicBlock *InsertBlock = Previous->getParent();
1931 if (isa<VPHeaderPHIRecipe>(Previous))
1932 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1933 else
1934 LoopBuilder.setInsertPoint(InsertBlock,
1935 std::next(Previous->getIterator()));
1936
1937 auto *RecurSplice =
1939 {FOR, FOR->getBackedgeValue()});
1940
1941 FOR->replaceAllUsesWith(RecurSplice);
1942 // Set the first operand of RecurSplice to FOR again, after replacing
1943 // all users.
1944 RecurSplice->setOperand(0, FOR);
1945 }
1946 return true;
1947}
1948
1950 for (VPRecipeBase &R :
1952 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
1953 if (!PhiR)
1954 continue;
1955 RecurKind RK = PhiR->getRecurrenceKind();
1956 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
1958 continue;
1959
1960 for (VPUser *U : collectUsersRecursively(PhiR))
1961 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
1962 RecWithFlags->dropPoisonGeneratingFlags();
1963 }
1964 }
1965}
1966
1967namespace {
1968struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1969 static bool isSentinel(const VPSingleDefRecipe *Def) {
1970 return Def == getEmptyKey() || Def == getTombstoneKey();
1971 }
1972
1973 /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1974 /// Returns an optional pair, where the first element indicates whether it is
1975 /// an intrinsic ID.
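  /// E.g. (illustrative): a VPWidenRecipe for an add yields
  /// {false, Instruction::Add}, while a VPWidenIntrinsicRecipe for llvm.smax
  /// yields {true, Intrinsic::smax}.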
1976 static std::optional<std::pair<bool, unsigned>>
1977 getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1978 return TypeSwitch<const VPSingleDefRecipe *,
1979 std::optional<std::pair<bool, unsigned>>>(R)
1982 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1983 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1984 return std::make_pair(true, I->getVectorIntrinsicID());
1985 })
1986 .Default([](auto *) { return std::nullopt; });
1987 }
1988
1989 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
1990 /// return that source element type.
1991 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
1992 // All VPInstructions that lower to GEPs must have the i8 source element
1993 // type (as they are PtrAdds), so we omit it.
1994 return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
1995 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
1996 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
1997 return GEP->getSourceElementType();
1998 return nullptr;
1999 })
2000 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2001 [](auto *I) { return I->getSourceElementType(); })
2002 .Default([](auto *) { return nullptr; });
2003 }
2004
2005   /// Returns true if recipe \p Def can be safely handled by CSE.
2006 static bool canHandle(const VPSingleDefRecipe *Def) {
2007 // We can extend the list of handled recipes in the future,
2008 // provided we account for the data embedded in them while checking for
2009 // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode,
2010 // as it is essentially a GEP with different semantics.
2011 auto C = isa<VPVectorPointerRecipe>(Def)
2012 ? std::make_pair(false, Instruction::GetElementPtr)
2013 : getOpcodeOrIntrinsicID(Def);
2014
2015 // The issue with (Insert|Extract)Value is that the index of the
2016 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2017 // VPlan.
2018 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2019 C->second == Instruction::ExtractValue)))
2020 return false;
2021
2022 // During CSE, we can only handle recipes that don't read from memory: if
2023 // they read from memory, there could be an intervening write to memory
2024 // before the next instance is CSE'd, leading to an incorrect result.
2025 return !Def->mayReadFromMemory();
2026 }
2027
2028 /// Hash the underlying data of \p Def.
2029 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2030 const VPlan *Plan = Def->getParent()->getPlan();
2031 VPTypeAnalysis TypeInfo(*Plan);
2032 hash_code Result = hash_combine(
2033 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2034 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2036 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2037 if (RFlags->hasPredicate())
2038 return hash_combine(Result, RFlags->getPredicate());
2039 return Result;
2040 }
2041
2042 /// Check equality of underlying data of \p L and \p R.
2043 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2044 if (isSentinel(L) || isSentinel(R))
2045 return L == R;
2046 if (L->getVPDefID() != R->getVPDefID() ||
2047 getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
2048 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2050 !equal(L->operands(), R->operands()))
2051 return false;
2052 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2053 if (LFlags->hasPredicate() &&
2054 LFlags->getPredicate() !=
2055 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2056 return false;
2057 const VPlan *Plan = L->getParent()->getPlan();
2058 VPTypeAnalysis TypeInfo(*Plan);
2059 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2060 }
2061};
2062} // end anonymous namespace
2063
2064/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2065/// Plan.
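///
/// A minimal sketch (schematic recipes): two identical widened adds
///   %a = WIDEN add %x, %y
///   %b = WIDEN add %x, %y
/// collapse into one, provided the first dominates the second; IR flags only
/// present on one of them are dropped via intersectFlags.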
2067 VPDominatorTree VPDT(Plan);
2069
2071 vp_depth_first_deep(Plan.getEntry()))) {
2072 for (VPRecipeBase &R : *VPBB) {
2073 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2074 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2075 continue;
2076 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2077 // V must dominate Def for a valid replacement.
2078 if (!VPDT.dominates(V->getParent(), VPBB))
2079 continue;
2080 // Only keep flags present on both V and Def.
2081 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2082 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2083 Def->replaceAllUsesWith(V);
2084 continue;
2085 }
2086 CSEMap[Def] = Def;
2087 }
2088 }
2089}
2090
2091/// Move loop-invariant recipes out of the vector loop region in \p Plan.
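///
/// A minimal sketch (schematic recipe): a recipe such as
///   %inv = WIDEN mul %live.in.a, %live.in.b
/// whose operands are all defined outside the loop region is moved to the
/// vector preheader, provided it has no side effects, does not read memory and
/// is not a phi.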
2092static void licm(VPlan &Plan) {
2093 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2094
2095 // Return true if we do not know how to (mechanically) hoist a given recipe
2096 // out of a loop region. Does not address legality concerns such as aliasing
2097 // or speculation safety.
2098 auto CannotHoistRecipe = [](VPRecipeBase &R) {
2099 // Allocas cannot be hoisted.
2100 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2101 return RepR && RepR->getOpcode() == Instruction::Alloca;
2102 };
2103
2104 // Hoist any loop invariant recipes from the vector loop region to the
2105   // preheader. Perform a shallow traversal of the vector loop region to
2106 // exclude recipes in replicate regions.
2107 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2109 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2110 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2111 if (CannotHoistRecipe(R))
2112 continue;
2113 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
2114 // their memory location is not modified in the vector loop.
2115 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
2116 any_of(R.operands(), [](VPValue *Op) {
2117 return !Op->isDefinedOutsideLoopRegions();
2118 }))
2119 continue;
2120 R.moveBefore(*Preheader, Preheader->end());
2121 }
2122 }
2123}
2124
2126 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2127 // Keep track of created truncates, so they can be re-used. Note that we
2128   // cannot use RAUW after creating a new truncate, as this could make
2129 // other uses have different types for their operands, making them invalidly
2130 // typed.
2132 VPTypeAnalysis TypeInfo(Plan);
2133 VPBasicBlock *PH = Plan.getVectorPreheader();
2136 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2139 &R))
2140 continue;
2141
2142 VPValue *ResultVPV = R.getVPSingleValue();
2143 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2144 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2145 if (!NewResSizeInBits)
2146 continue;
2147
2148 // If the value wasn't vectorized, we must maintain the original scalar
2149 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2150 // skip casts which do not need to be handled explicitly here, as
2151 // redundant casts will be removed during recipe simplification.
2153 continue;
2154
2155 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2156 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2157 assert(OldResTy->isIntegerTy() && "only integer types supported");
2158 (void)OldResSizeInBits;
2159
2160 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2161
2162 // Any wrapping introduced by shrinking this operation shouldn't be
2163 // considered undefined behavior. So, we can't unconditionally copy
2164 // arithmetic wrapping flags to VPW.
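      // E.g. (illustrative): narrowing an i32 "add nsw" to i16 must drop the
      // nsw flag, since the narrower add may wrap for values that were fine at
      // the original width.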
2165 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2166 VPW->dropPoisonGeneratingFlags();
2167
2168 if (OldResSizeInBits != NewResSizeInBits &&
2169 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2170 // Extend result to original width.
2171 auto *Ext =
2172 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2173 Ext->insertAfter(&R);
2174 ResultVPV->replaceAllUsesWith(Ext);
2175 Ext->setOperand(0, ResultVPV);
2176 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2177 } else {
2178 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2179 "Only ICmps should not need extending the result.");
2180 }
2181
2182 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2184 continue;
2185
2186 // Shrink operands by introducing truncates as needed.
2187 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2188 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2189 auto *Op = R.getOperand(Idx);
2190 unsigned OpSizeInBits =
2192 if (OpSizeInBits == NewResSizeInBits)
2193 continue;
2194 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2195 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2196 VPWidenCastRecipe *NewOp =
2197 IterIsEmpty
2198 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy,
2199 VPIRFlags::TruncFlagsTy(false, false))
2200 : ProcessedIter->second;
2201 R.setOperand(Idx, NewOp);
2202 if (!IterIsEmpty)
2203 continue;
2204 ProcessedIter->second = NewOp;
2205 if (!Op->isLiveIn()) {
2206 NewOp->insertBefore(&R);
2207 } else {
2208 PH->appendRecipe(NewOp);
2209 }
2210 }
2211
2212 }
2213 }
2214}
2215
2219 VPValue *Cond;
2220 // Skip blocks that are not terminated by BranchOnCond.
2221 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2222 continue;
2223
2224 assert(VPBB->getNumSuccessors() == 2 &&
2225 "Two successors expected for BranchOnCond");
2226 unsigned RemovedIdx;
2227 if (match(Cond, m_True()))
2228 RemovedIdx = 1;
2229 else if (match(Cond, m_False()))
2230 RemovedIdx = 0;
2231 else
2232 continue;
2233
2234 VPBasicBlock *RemovedSucc =
2235 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2236 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2237 "There must be a single edge between VPBB and its successor");
2238     // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2239 // these recipes.
2240 for (VPRecipeBase &R : RemovedSucc->phis())
2241 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2242
2243 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2244 // automatically on VPlan destruction if it becomes unreachable.
2245 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2246 VPBB->back().eraseFromParent();
2247 }
2248}
2249
2268
2269// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2270// the loop terminator with a branch-on-cond recipe with the negated
2271// active-lane-mask as operand. Note that this turns the loop into an
2272 // uncountable one. Only the existing terminator is replaced; all other existing
2273// recipes/users remain unchanged, except for poison-generating flags being
2274// dropped from the canonical IV increment. Return the created
2275// VPActiveLaneMaskPHIRecipe.
2276//
2277// The function uses the following definitions:
2278//
2279// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2280// calculate-trip-count-minus-VF (original TC) : original TC
2281// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2282// CanonicalIVPhi : CanonicalIVIncrement
2283// %StartV is the canonical induction start value.
2284//
2285// The function adds the following recipes:
2286//
2287// vector.ph:
2288// %TripCount = calculate-trip-count-minus-VF (original TC)
2289// [if DataWithControlFlowWithoutRuntimeCheck]
2290// %EntryInc = canonical-iv-increment-for-part %StartV
2291// %EntryALM = active-lane-mask %EntryInc, %TripCount
2292//
2293// vector.body:
2294// ...
2295// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2296// ...
2297// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2298// %ALM = active-lane-mask %InLoopInc, TripCount
2299// %Negated = Not %ALM
2300// branch-on-cond %Negated
2301//
2304 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2305 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2306 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2307 VPValue *StartV = CanonicalIVPHI->getStartValue();
2308
2309 auto *CanonicalIVIncrement =
2310 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2311 // TODO: Check if dropping the flags is needed if
2312 // !DataAndControlFlowWithoutRuntimeCheck.
2313 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2314 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2315 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2316 // we have to take unrolling into account. Each part needs to start at
2317 // Part * VF
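  // E.g. (illustrative): with VF = 4 and UF = 2, part 0 starts at %StartV and
  // part 1 at %StartV + 4; CanonicalIVIncrementForPart computes these per-part
  // start values once the plan is unrolled.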
2318 auto *VecPreheader = Plan.getVectorPreheader();
2319 VPBuilder Builder(VecPreheader);
2320
2321 // Create the ActiveLaneMask instruction using the correct start values.
2322 VPValue *TC = Plan.getTripCount();
2323
2324 VPValue *TripCount, *IncrementValue;
2326 // When the loop is guarded by a runtime overflow check for the loop
2327 // induction variable increment by VF, we can increment the value before
2328     // the get.active.lane.mask intrinsic and use the unmodified trip count.
2329 IncrementValue = CanonicalIVIncrement;
2330 TripCount = TC;
2331 } else {
2332 // When avoiding a runtime check, the active.lane.mask inside the loop
2333 // uses a modified trip count and the induction variable increment is
2334 // done after the active.lane.mask intrinsic is called.
2335 IncrementValue = CanonicalIVPHI;
2336 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2337 {TC}, DL);
2338 }
2339 auto *EntryIncrement = Builder.createOverflowingOp(
2340 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2341 "index.part.next");
2342
2343 // Create the active lane mask instruction in the VPlan preheader.
2344 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2345 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2346 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2347 {EntryIncrement, TC, ALMMultiplier}, DL,
2348 "active.lane.mask.entry");
2349
2350 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2351 // preheader ActiveLaneMask instruction.
2352 auto *LaneMaskPhi =
2354 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2355
2356 // Create the active lane mask for the next iteration of the loop before the
2357 // original terminator.
2358 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2359 Builder.setInsertPoint(OriginalTerminator);
2360 auto *InLoopIncrement =
2361 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2362 {IncrementValue}, {false, false}, DL);
2363 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2364 {InLoopIncrement, TripCount, ALMMultiplier},
2365 DL, "active.lane.mask.next");
2366 LaneMaskPhi->addOperand(ALM);
2367
2368 // Replace the original terminator with BranchOnCond. We have to invert the
2369 // mask here because a true condition means jumping to the exit block.
2370 auto *NotMask = Builder.createNot(ALM, DL);
2371 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2372 OriginalTerminator->eraseFromParent();
2373 return LaneMaskPhi;
2374}
2375
2376/// Collect the header mask with the pattern:
2377/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2378/// TODO: Introduce explicit recipe for header-mask instead of searching
2379/// for the header-mask pattern manually.
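///
/// A minimal sketch of the pattern being searched for (schematic recipes):
///   %wide.iv = WIDEN-CANONICAL-INDUCTION %canonical.iv
///   %mask    = icmp ule %wide.iv, %backedge.taken.count
/// where %mask is the header mask returned here.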
2381 SmallVector<VPValue *> WideCanonicalIVs;
2382 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2386 "Must have at most one VPWideCanonicalIVRecipe");
2387 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2388 auto *WideCanonicalIV =
2389 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2390 WideCanonicalIVs.push_back(WideCanonicalIV);
2391 }
2392
2393 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2394 // version of the canonical induction.
2395 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2396 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2397 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2398 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2399 WideCanonicalIVs.push_back(WidenOriginalIV);
2400 }
2401
2402 // Walk users of wide canonical IVs and find the single compare of the form
2403 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2404 VPSingleDefRecipe *HeaderMask = nullptr;
2405 for (auto *Wide : WideCanonicalIVs) {
2406 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2407 auto *VPI = dyn_cast<VPInstruction>(U);
2408 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2409 continue;
2410
2411 assert(VPI->getOperand(0) == Wide &&
2412 "WidenCanonicalIV must be the first operand of the compare");
2413 assert(!HeaderMask && "Multiple header masks found?");
2414 HeaderMask = VPI;
2415 }
2416 }
2417 return HeaderMask;
2418}
2419
2421 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2424 UseActiveLaneMaskForControlFlow) &&
2425 "DataAndControlFlowWithoutRuntimeCheck implies "
2426 "UseActiveLaneMaskForControlFlow");
2427
2428 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2430 assert(FoundWidenCanonicalIVUser &&
2431 "Must have widened canonical IV when tail folding!");
2432 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2433 auto *WideCanonicalIV =
2434 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2435 VPSingleDefRecipe *LaneMask;
2436 if (UseActiveLaneMaskForControlFlow) {
2439 } else {
2440 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2441 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2442 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2443 LaneMask =
2444 B.createNaryOp(VPInstruction::ActiveLaneMask,
2445 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2446 nullptr, "active.lane.mask");
2447 }
2448
2449 // Walk users of WideCanonicalIV and replace the header mask of the form
2450 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2451 // removing the old one to ensure there is always only a single header mask.
2452 HeaderMask->replaceAllUsesWith(LaneMask);
2453 HeaderMask->eraseFromParent();
2454}
2455
2456/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2457/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2458/// recipe could be created.
2459/// \p HeaderMask Header Mask.
2460 /// \p CurRecipe Recipe to be transformed.
2461/// \p TypeInfo VPlan-based type analysis.
2462/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2463/// \p EVL The explicit vector length parameter of vector-predication
2464/// intrinsics.
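///
/// A minimal sketch (schematic recipes): a widened load masked only by the
/// header mask
///   %l = WIDEN load %addr, mask %header.mask
/// becomes an EVL-based load that no longer needs that mask, roughly
///   %l = WIDEN vp.load %addr, all-true, %evl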
2466 VPRecipeBase &CurRecipe,
2467 VPTypeAnalysis &TypeInfo,
2468 VPValue &AllOneMask, VPValue &EVL) {
2469 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2470 // header mask.
2471 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2472 assert(OrigMask && "Unmasked recipe when folding tail");
2473 // HeaderMask will be handled using EVL.
2474 VPValue *Mask;
2475 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2476 return Mask;
2477 return HeaderMask == OrigMask ? nullptr : OrigMask;
2478 };
2479
2480   /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2481 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2482 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2483 if (!EndPtr)
2484 return Addr;
2485 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2486 "VPVectorEndPointerRecipe with non-VF VF operand?");
2487 assert(
2488 all_of(EndPtr->users(),
2489 [](VPUser *U) {
2490 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2491 }) &&
2492 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2493 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2494 EVLAddr->insertBefore(&CurRecipe);
2495 EVLAddr->setOperand(1, &EVL);
2496 return EVLAddr;
2497 };
2498
2501 VPValue *NewMask = GetNewMask(L->getMask());
2502 VPValue *NewAddr = GetNewAddr(L->getAddr());
2503 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2504 })
2505 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2506 VPValue *NewMask = GetNewMask(S->getMask());
2507 VPValue *NewAddr = GetNewAddr(S->getAddr());
2508 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2509 })
2510 .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
2511 VPValue *NewMask = GetNewMask(IR->getMask());
2512 return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
2513 })
2514 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2515 VPValue *NewMask = GetNewMask(Red->getCondOp());
2516 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2517 })
2518 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2519 VPValue *LHS, *RHS;
2520 // Transform select with a header mask condition
2521 // select(header_mask, LHS, RHS)
2522 // into vector predication merge.
2523 // vp.merge(all-true, LHS, RHS, EVL)
2524 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2525 m_VPValue(RHS))))
2526 return nullptr;
2527 // Use all true as the condition because this transformation is
2528 // limited to selects whose condition is a header mask.
2529 return new VPWidenIntrinsicRecipe(
2530 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2531 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2532 })
2533 .Default([&](VPRecipeBase *R) { return nullptr; });
2534}
2535
2536/// Replace recipes with their EVL variants.
2538 VPTypeAnalysis TypeInfo(Plan);
2539 VPValue *AllOneMask = Plan.getTrue();
2540 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2541 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2542
2543 assert(all_of(Plan.getVF().users(),
2546 "User of VF that we can't transform to EVL.");
2547 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2549 });
2550
2551 assert(all_of(Plan.getVFxUF().users(),
2552 [&Plan](VPUser *U) {
2553 return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()),
2554 m_Specific(&Plan.getVFxUF()))) ||
2555 isa<VPWidenPointerInductionRecipe>(U);
2556 }) &&
2557 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2558 "increment of the canonical induction.");
2559 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2560     // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2561 // canonical induction must not be updated.
2563 });
2564
2565 // Defer erasing recipes till the end so that we don't invalidate the
2566 // VPTypeAnalysis cache.
2568
2569   // Create a scalar phi to track the previous EVL if the plan contains a
2570   // fixed-order recurrence.
2571 bool ContainsFORs =
2573 if (ContainsFORs) {
2574 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2575 VPValue *MaxEVL = &Plan.getVF();
2576     // Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2577 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2578 MaxEVL = Builder.createScalarZExtOrTrunc(
2579 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2580 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2581
2582 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2583 VPValue *PrevEVL = Builder.createScalarPhi(
2584 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2585
2588 for (VPRecipeBase &R : *VPBB) {
2589 VPValue *V1, *V2;
2590 if (!match(&R,
2592 m_VPValue(V1), m_VPValue(V2))))
2593 continue;
2594 VPValue *Imm = Plan.getOrAddLiveIn(
2597 Intrinsic::experimental_vp_splice,
2598 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2599 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2600 VPSplice->insertBefore(&R);
2601 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2602 ToErase.push_back(&R);
2603 }
2604 }
2605 }
2606
2607 VPValue *HeaderMask = findHeaderMask(Plan);
2608 if (!HeaderMask)
2609 return;
2610
2611 // Replace header masks with a mask equivalent to predicating by EVL:
2612 //
2613 // icmp ule widen-canonical-iv backedge-taken-count
2614 // ->
2615 // icmp ult step-vector, EVL
2616 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2617 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2618 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2619 VPValue *EVLMask = Builder.createICmp(
2621 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2622 HeaderMask->replaceAllUsesWith(EVLMask);
2623 ToErase.push_back(HeaderMask->getDefiningRecipe());
2624
2625 // Try to optimize header mask recipes away to their EVL variants.
2626 // TODO: Split optimizeMaskToEVL out and move into
2627 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2628 // tryToBuildVPlanWithVPRecipes beforehand.
2629 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2630 auto *CurRecipe = cast<VPRecipeBase>(U);
2631 VPRecipeBase *EVLRecipe =
2632 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2633 if (!EVLRecipe)
2634 continue;
2635
2636 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2637 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2638 "New recipe must define the same number of values as the "
2639 "original.");
2640 EVLRecipe->insertBefore(CurRecipe);
2642 EVLRecipe)) {
2643 for (unsigned I = 0; I < NumDefVal; ++I) {
2644 VPValue *CurVPV = CurRecipe->getVPValue(I);
2645 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2646 }
2647 }
2648 ToErase.push_back(CurRecipe);
2649 }
2650 // Remove dead EVL mask.
2651 if (EVLMask->getNumUsers() == 0)
2652 ToErase.push_back(EVLMask->getDefiningRecipe());
2653
2654 for (VPRecipeBase *R : reverse(ToErase)) {
2655 SmallVector<VPValue *> PossiblyDead(R->operands());
2656 R->eraseFromParent();
2657 for (VPValue *Op : PossiblyDead)
2659 }
2660}
2661
2662 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2663 /// replace all uses except the canonical IV increment of
2664/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
2665/// is used only for loop iterations counting after this transformation.
2666///
2667/// The function uses the following definitions:
2668/// %StartV is the canonical induction start value.
2669///
2670/// The function adds the following recipes:
2671///
2672/// vector.ph:
2673/// ...
2674///
2675/// vector.body:
2676/// ...
2677/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2678/// [ %NextEVLIV, %vector.body ]
2679/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2680/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2681/// ...
2682/// %OpEVL = cast i32 %VPEVL to IVSize
2683/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2684/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2685/// ...
2686///
2687/// If MaxSafeElements is provided, the function adds the following recipes:
2688/// vector.ph:
2689/// ...
2690///
2691/// vector.body:
2692/// ...
2693/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2694/// [ %NextEVLIV, %vector.body ]
2695/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2696/// %cmp = cmp ult %AVL, MaxSafeElements
2697/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2698/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2699/// ...
2700/// %OpEVL = cast i32 %VPEVL to IVSize
2701/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2702/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2703/// ...
2704///
2706 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2708
2709 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2710 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2711 VPValue *StartV = CanonicalIVPHI->getStartValue();
2712
2713 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2714 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2715 EVLPhi->insertAfter(CanonicalIVPHI);
2716 VPBuilder Builder(Header, Header->getFirstNonPhi());
2717 // Create the AVL (application vector length), starting from TC -> 0 in steps
2718 // of EVL.
2719 VPPhi *AVLPhi = Builder.createScalarPhi(
2720 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2721 VPValue *AVL = AVLPhi;
2722
2723 if (MaxSafeElements) {
2724     // Clamp the AVL to MaxSafeElements to respect the maximum safe distance.
2725 VPValue *AVLSafe =
2726 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2727 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2728 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2729 "safe_avl");
2730 }
2731 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2733
2734 auto *CanonicalIVIncrement =
2735 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2736 Builder.setInsertPoint(CanonicalIVIncrement);
2737 VPValue *OpVPEVL = VPEVL;
2738
2739 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2740 OpVPEVL = Builder.createScalarZExtOrTrunc(
2741 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2742
2743 auto *NextEVLIV = Builder.createOverflowingOp(
2744 Instruction::Add, {OpVPEVL, EVLPhi},
2745 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2746 CanonicalIVIncrement->hasNoSignedWrap()},
2747 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2748 EVLPhi->addOperand(NextEVLIV);
2749
2750 VPValue *NextAVL = Builder.createOverflowingOp(
2751 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2752 DebugLoc::getCompilerGenerated(), "avl.next");
2753 AVLPhi->addOperand(NextAVL);
2754
2755 transformRecipestoEVLRecipes(Plan, *VPEVL);
2756
2757 // Replace all uses of VPCanonicalIVPHIRecipe by
2758 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2759 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2760 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2761 // TODO: support unroll factor > 1.
2762 Plan.setUF(1);
2763}
2764
2766 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2767 // There should be only one EVL PHI in the entire plan.
2768 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2769
2772 for (VPRecipeBase &R : VPBB->phis())
2773 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2774 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2775 EVLPhi = PhiR;
2776 }
2777
2778 // Early return if no EVL PHI is found.
2779 if (!EVLPhi)
2780 return;
2781
2782 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2783 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2784 VPValue *AVL;
2785 [[maybe_unused]] bool FoundAVL =
2786 match(EVLIncrement,
2787 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2788 assert(FoundAVL && "Didn't find AVL?");
2789
2790 // The AVL may be capped to a safe distance.
2791 VPValue *SafeAVL;
2792 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2793 AVL = SafeAVL;
2794
2795 VPValue *AVLNext;
2796 [[maybe_unused]] bool FoundAVLNext =
2798 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2799 assert(FoundAVLNext && "Didn't find AVL backedge?");
2800
2801 // Convert EVLPhi to concrete recipe.
2802 auto *ScalarR =
2803 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2804 EVLPhi->getDebugLoc(), "evl.based.iv");
2805 EVLPhi->replaceAllUsesWith(ScalarR);
2806 EVLPhi->eraseFromParent();
2807
2808 // Replace CanonicalIVInc with EVL-PHI increment.
2809 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2810 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2811 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2812 m_Specific(&Plan.getVFxUF()))) &&
2813 "Unexpected canonical iv");
2814 Backedge->replaceAllUsesWith(EVLIncrement);
2815
2816 // Remove unused phi and increment.
2817 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2818 CanonicalIVIncrement->eraseFromParent();
2819 CanonicalIV->eraseFromParent();
2820
2821 // Replace the use of VectorTripCount in the latch-exiting block.
2822 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2823 // After: (branch-on-cond eq AVLNext, 0)
2824
2825 VPBasicBlock *LatchExiting =
2826 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2827 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2828 // Skip single-iteration loop region
2829 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2830 return;
2831 assert(LatchExitingBr &&
2832 match(LatchExitingBr,
2833 m_BranchOnCount(m_VPValue(EVLIncrement),
2834 m_Specific(&Plan.getVectorTripCount()))) &&
2835 "Unexpected terminator in EVL loop");
2836
2837 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2838 VPBuilder Builder(LatchExitingBr);
2839 VPValue *Cmp =
2840 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2842 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2843 LatchExitingBr->eraseFromParent();
2844}
2845
2847 VPlan &Plan, PredicatedScalarEvolution &PSE,
2848 const DenseMap<Value *, const SCEV *> &StridesMap) {
2849   // Replace VPValues for known constant strides guaranteed by predicated
2850   // scalar evolution.
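  // E.g. (illustrative): if versioning guarantees a stride of 1, uses of that
  // stride inside the vector loop region and its preheader (including matching
  // sext/zext live-ins) are replaced by the constant 1, enabling later
  // simplifications.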
2851 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2852 auto *R = cast<VPRecipeBase>(&U);
2853 return R->getParent()->getParent() ||
2854 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2855 };
2856 ValueToSCEVMapTy RewriteMap;
2857 for (const SCEV *Stride : StridesMap.values()) {
2858 using namespace SCEVPatternMatch;
2859 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2860 const APInt *StrideConst;
2861 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2862 // Only handle constant strides for now.
2863 continue;
2864
2865 auto *CI =
2866 Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
2867 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2868 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2869
2870 // The versioned value may not be used in the loop directly but through a
2871 // sext/zext. Add new live-ins in those cases.
2872 for (Value *U : StrideV->users()) {
2874 continue;
2875 VPValue *StrideVPV = Plan.getLiveIn(U);
2876 if (!StrideVPV)
2877 continue;
2878 unsigned BW = U->getType()->getScalarSizeInBits();
2879 APInt C =
2880 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2881 VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
2882 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2883 }
2884 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
2885 }
2886
2887 for (VPRecipeBase &R : *Plan.getEntry()) {
2888 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
2889 if (!ExpSCEV)
2890 continue;
2891 const SCEV *ScevExpr = ExpSCEV->getSCEV();
2892 auto *NewSCEV =
2893 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
2894 if (NewSCEV != ScevExpr) {
2895 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
2896 ExpSCEV->replaceAllUsesWith(NewExp);
2897 if (Plan.getTripCount() == ExpSCEV)
2898 Plan.resetTripCount(NewExp);
2899 }
2900 }
2901}
2902
2904 VPlan &Plan,
2905 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2906 // Collect recipes in the backward slice of `Root` that may generate a poison
2907 // value that is used after vectorization.
2909 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2911 Worklist.push_back(Root);
2912
2913 // Traverse the backward slice of Root through its use-def chain.
2914 while (!Worklist.empty()) {
2915 VPRecipeBase *CurRec = Worklist.pop_back_val();
2916
2917 if (!Visited.insert(CurRec).second)
2918 continue;
2919
2920 // Prune search if we find another recipe generating a widen memory
2921 // instruction. Widen memory instructions involved in address computation
2922 // will lead to gather/scatter instructions, which don't need to be
2923 // handled.
2925 VPHeaderPHIRecipe>(CurRec))
2926 continue;
2927
2928 // This recipe contributes to the address computation of a widen
2929 // load/store. If the underlying instruction has poison-generating flags,
2930 // drop them directly.
2931 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2932 VPValue *A, *B;
2933 // Dropping disjoint from an OR may yield incorrect results, as some
2934 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2935 // for dependence analysis). Instead, replace it with an equivalent Add.
2936 // This is possible as all users of the disjoint OR only access lanes
2937 // where the operands are disjoint or poison otherwise.
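        // E.g. (illustrative): an address computed as "or disjoint %a, %b" is
        // rewritten to "add %a, %b" rather than just dropping the disjoint
        // flag, matching analyses that already treated it as an add.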
2938 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
2939 RecWithFlags->isDisjoint()) {
2940 VPBuilder Builder(RecWithFlags);
2941 VPInstruction *New = Builder.createOverflowingOp(
2942 Instruction::Add, {A, B}, {false, false},
2943 RecWithFlags->getDebugLoc());
2944 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2945 RecWithFlags->replaceAllUsesWith(New);
2946 RecWithFlags->eraseFromParent();
2947 CurRec = New;
2948 } else
2949 RecWithFlags->dropPoisonGeneratingFlags();
2950 } else {
2953 (void)Instr;
2954 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2955 "found instruction with poison generating flags not covered by "
2956 "VPRecipeWithIRFlags");
2957 }
2958
2959 // Add new definitions to the worklist.
2960 for (VPValue *Operand : CurRec->operands())
2961 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2962 Worklist.push_back(OpDef);
2963 }
2964 });
2965
2966   // Traverse all the recipes in the VPlan and collect the poison-generating
2967   // recipes in the backward slice starting at the address of a
2968   // VPWidenMemoryRecipe or VPInterleaveRecipe.
2969 auto Iter = vp_depth_first_deep(Plan.getEntry());
2971 for (VPRecipeBase &Recipe : *VPBB) {
2972 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
2973 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2974 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2975 if (AddrDef && WidenRec->isConsecutive() &&
2976 BlockNeedsPredication(UnderlyingInstr.getParent()))
2977 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2978 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
2979 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2980 if (AddrDef) {
2981 // Check if any member of the interleave group needs predication.
2982 const InterleaveGroup<Instruction> *InterGroup =
2983 InterleaveRec->getInterleaveGroup();
2984 bool NeedPredication = false;
2985 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2986 I < NumMembers; ++I) {
2987 Instruction *Member = InterGroup->getMember(I);
2988 if (Member)
2989 NeedPredication |= BlockNeedsPredication(Member->getParent());
2990 }
2991
2992 if (NeedPredication)
2993 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2994 }
2995 }
2996 }
2997 }
2998}
2999
3001 VPlan &Plan,
3003 &InterleaveGroups,
3004 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3005 if (InterleaveGroups.empty())
3006 return;
3007
3008 // Interleave memory: for each Interleave Group we marked earlier as relevant
3009 // for this VPlan, replace the Recipes widening its memory instructions with a
3010 // single VPInterleaveRecipe at its insertion point.
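  // A minimal sketch (illustrative): an interleave group of loads A[2*i] and
  // A[2*i+1] has its two widened load recipes replaced by a single
  // VPInterleaveRecipe reading both members at the group's insert position.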
3011 VPDominatorTree VPDT;
3012 VPDT.recalculate(Plan);
3013 for (const auto *IG : InterleaveGroups) {
3014 auto *Start =
3015 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3016 VPIRMetadata InterleaveMD(*Start);
3017 SmallVector<VPValue *, 4> StoredValues;
3018 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3019 StoredValues.push_back(StoreR->getStoredValue());
3020 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3021 Instruction *MemberI = IG->getMember(I);
3022 if (!MemberI)
3023 continue;
3024 VPWidenMemoryRecipe *MemoryR =
3025 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3026 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3027 StoredValues.push_back(StoreR->getStoredValue());
3028 InterleaveMD.intersect(*MemoryR);
3029 }
3030
3031 bool NeedsMaskForGaps =
3032 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3033 (!StoredValues.empty() && !IG->isFull());
3034
3035 Instruction *IRInsertPos = IG->getInsertPos();
3036 auto *InsertPos =
3037 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3038
3040 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3041 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3042 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3043
3044 // Get or create the start address for the interleave group.
3045 VPValue *Addr = Start->getAddr();
3046 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3047 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3048 // We cannot re-use the address of member zero because it does not
3049 // dominate the insert position. Instead, use the address of the insert
3050 // position and create a PtrAdd adjusting it to the address of member
3051 // zero.
3052 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3053 // InsertPos or sink loads above zero members to join it.
3054 assert(IG->getIndex(IRInsertPos) != 0 &&
3055 "index of insert position shouldn't be zero");
3056 auto &DL = IRInsertPos->getDataLayout();
3057 APInt Offset(32,
3058 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3059 IG->getIndex(IRInsertPos),
3060 /*IsSigned=*/true);
3061 VPValue *OffsetVPV =
3062 Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
3063 VPBuilder B(InsertPos);
3064 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3065 }
3066 // If the group is reverse, adjust the index to refer to the last vector
3067 // lane instead of the first. We adjust the index from the first vector
3068 // lane, rather than directly getting the pointer for lane VF - 1, because
3069 // the pointer operand of the interleaved access is supposed to be uniform.
3070 if (IG->isReverse()) {
3071 auto *ReversePtr = new VPVectorEndPointerRecipe(
3072 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3073 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3074 ReversePtr->insertBefore(InsertPos);
3075 Addr = ReversePtr;
3076 }
3077 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3078 InsertPos->getMask(), NeedsMaskForGaps,
3079 InterleaveMD, InsertPos->getDebugLoc());
3080 VPIG->insertBefore(InsertPos);
3081
3082 unsigned J = 0;
3083 for (unsigned i = 0; i < IG->getFactor(); ++i)
3084 if (Instruction *Member = IG->getMember(i)) {
3085 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3086 if (!Member->getType()->isVoidTy()) {
3087 VPValue *OriginalV = MemberR->getVPSingleValue();
3088 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3089 J++;
3090 }
3091 MemberR->eraseFromParent();
3092 }
3093 }
3094}
3095
3096/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3097/// value, phi and backedge value. In the following example:
3098///
3099/// vector.ph:
3100/// Successor(s): vector loop
3101///
3102/// <x1> vector loop: {
3103/// vector.body:
3104/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3105/// ...
3106/// EMIT branch-on-count ...
3107/// No successors
3108/// }
3109///
3110/// WIDEN-INDUCTION will get expanded to:
3111///
3112/// vector.ph:
3113/// ...
3114/// vp<%induction.start> = ...
3115/// vp<%induction.increment> = ...
3116///
3117/// Successor(s): vector loop
3118///
3119/// <x1> vector loop: {
3120/// vector.body:
3121/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3122/// ...
3123/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3124/// EMIT branch-on-count ...
3125/// No successors
3126/// }
3127static void
3129 VPTypeAnalysis &TypeInfo) {
3130 VPlan *Plan = WidenIVR->getParent()->getPlan();
3131 VPValue *Start = WidenIVR->getStartValue();
3132 VPValue *Step = WidenIVR->getStepValue();
3133 VPValue *VF = WidenIVR->getVFValue();
3134 DebugLoc DL = WidenIVR->getDebugLoc();
3135
3136 // The value from the original loop to which we are mapping the new induction
3137 // variable.
3138 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3139
3140 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3143 // FIXME: The newly created binary instructions should contain nsw/nuw
3144 // flags, which can be found from the original scalar operations.
3145 VPIRFlags Flags;
3146 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3147 AddOp = Instruction::Add;
3148 MulOp = Instruction::Mul;
3149 } else {
3150 AddOp = ID.getInductionOpcode();
3151 MulOp = Instruction::FMul;
3152 Flags = ID.getInductionBinOp()->getFastMathFlags();
3153 }
3154
3155 // If the phi is truncated, truncate the start and step values.
3156 VPBuilder Builder(Plan->getVectorPreheader());
3157 Type *StepTy = TypeInfo.inferScalarType(Step);
3158 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3159 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3160 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3161 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3162 StepTy = Ty;
3163 }
3164
3165 // Construct the initial value of the vector IV in the vector loop preheader.
3166 Type *IVIntTy =
3167       IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3168   VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3169 if (StepTy->isFloatingPointTy())
3170 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3171
3172 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3173 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3174
3175 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3176 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3177 DebugLoc::getUnknown(), "induction");
3178
3179 // Create the widened phi of the vector IV.
3180 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
3181 WidenIVR->getDebugLoc(), "vec.ind");
3182 WidePHI->addOperand(Init);
3183 WidePHI->insertBefore(WidenIVR);
3184
3185 // Create the backedge value for the vector IV.
3186 VPValue *Inc;
3187 VPValue *Prev;
3188 // If unrolled, use the increment and prev value from the operands.
3189 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3190 Inc = SplatVF;
3191 Prev = WidenIVR->getLastUnrolledPartOperand();
3192 } else {
3193 if (VPRecipeBase *R = VF->getDefiningRecipe())
3194 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3195 // Multiply the vectorization factor by the step using integer or
3196 // floating-point arithmetic as appropriate.
3197 if (StepTy->isFloatingPointTy())
3198 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3199 DL);
3200 else
3201 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3202 TypeInfo.inferScalarType(VF), DL);
3203
3204 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3205 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3206 Prev = WidePHI;
3207 }
3208
3209   VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3210   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3211 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3212 WidenIVR->getDebugLoc(), "vec.ind.next");
3213
3214 WidePHI->addOperand(Next);
3215
3216 WidenIVR->replaceAllUsesWith(WidePHI);
3217}
3218
3219/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3220/// initial value, phi and backedge value. In the following example:
3221///
3222/// <x1> vector loop: {
3223/// vector.body:
3224/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3225/// ...
3226/// EMIT branch-on-count ...
3227/// }
3228///
3229/// WIDEN-POINTER-INDUCTION will get expanded to:
3230///
3231/// <x1> vector loop: {
3232/// vector.body:
3233/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3234/// EMIT %mul = mul %stepvector, %step
3235/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3236/// ...
3237/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3238/// EMIT branch-on-count ...
3239/// }
3240 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3241                                           VPTypeAnalysis &TypeInfo) {
3242 VPlan *Plan = R->getParent()->getPlan();
3243 VPValue *Start = R->getStartValue();
3244 VPValue *Step = R->getStepValue();
3245 VPValue *VF = R->getVFValue();
3246
3247 assert(R->getInductionDescriptor().getKind() ==
3249 "Not a pointer induction according to InductionDescriptor!");
3250 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3251 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3252 "Recipe should have been replaced");
3253
3254 VPBuilder Builder(R);
3255 DebugLoc DL = R->getDebugLoc();
3256
3257 // Build a scalar pointer phi.
3258 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3259
3260 // Create actual address geps that use the pointer phi as base and a
3261 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3262 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3263 Type *StepTy = TypeInfo.inferScalarType(Step);
3264 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3265 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3266 VPValue *PtrAdd = Builder.createNaryOp(
3267 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3268 R->replaceAllUsesWith(PtrAdd);
3269
3270 // Create the backedge value for the scalar pointer phi.
3271   VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3272   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3273 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3274 DL);
3275 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3276
3277 VPValue *InductionGEP =
3278 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3279 ScalarPtrPhi->addOperand(InductionGEP);
3280}
3281
3282 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3283   // Replace loop regions with explicit CFG.
3284 SmallVector<VPRegionBlock *> LoopRegions;
3285   for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3286            vp_depth_first_deep(Plan.getEntry()))) {
3287 if (!R->isReplicator())
3288 LoopRegions.push_back(R);
3289 }
3290 for (VPRegionBlock *R : LoopRegions)
3291 R->dissolveToCFGLoop();
3292}
3293
3294 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3295   VPTypeAnalysis TypeInfo(Plan);
3296   SmallVector<VPRecipeBase *> ToRemove;
3297   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3298            vp_depth_first_deep(Plan.getEntry()))) {
3299 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3300 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3301 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3302 ToRemove.push_back(WidenIVR);
3303 continue;
3304 }
3305
3306 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3307 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3308 ToRemove.push_back(WidenIVR);
3309 continue;
3310 }
3311
3312 // Expand VPBlendRecipe into VPInstruction::Select.
3313 VPBuilder Builder(&R);
3314 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3315 VPValue *Select = Blend->getIncomingValue(0);
3316 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3317 Select = Builder.createSelect(Blend->getMask(I),
3318 Blend->getIncomingValue(I), Select,
3319 R.getDebugLoc(), "predphi");
3320 Blend->replaceAllUsesWith(Select);
3321 ToRemove.push_back(Blend);
3322 }
3323
3324 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3325 Expr->decompose();
3326 ToRemove.push_back(Expr);
3327 }
3328
3329 VPValue *VectorStep;
3330 VPValue *ScalarStep;
3331       if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3332                          m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3333 continue;
3334
3335 // Expand WideIVStep.
3336 auto *VPI = cast<VPInstruction>(&R);
3337 Type *IVTy = TypeInfo.inferScalarType(VPI);
3338 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3339         Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3340                                           ? Instruction::UIToFP
3341 : Instruction::Trunc;
3342 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3343 }
3344
3345 [[maybe_unused]] auto *ConstStep =
3346 ScalarStep->isLiveIn()
3347           ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
3348           : nullptr;
3349 assert(!ConstStep || ConstStep->getValue() != 1);
3350 (void)ConstStep;
3351 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3352 ScalarStep =
3353 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3354 }
3355
3356 VPIRFlags Flags;
3357 if (IVTy->isFloatingPointTy())
3358 Flags = {VPI->getFastMathFlags()};
3359
3360 unsigned MulOpc =
3361 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3362 VPInstruction *Mul = Builder.createNaryOp(
3363 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3364 VectorStep = Mul;
3365 VPI->replaceAllUsesWith(VectorStep);
3366 ToRemove.push_back(VPI);
3367 }
3368 }
3369
3370 for (VPRecipeBase *R : ToRemove)
3371 R->eraseFromParent();
3372}
3373
3374 void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3375                                                  VPBasicBlock *EarlyExitVPBB,
3376 VPlan &Plan,
3377 VPBasicBlock *HeaderVPBB,
3378 VPBasicBlock *LatchVPBB) {
3379 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3380 if (!EarlyExitVPBB->getSinglePredecessor() &&
3381 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3382 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3383 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3384 "unsupported early exit VPBB");
3385 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3386 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3387 // of the phis.
3388 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3389 cast<VPIRPhi>(&R)->swapOperands();
3390 }
3391
3392 VPBuilder Builder(LatchVPBB->getTerminator());
3393 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3394 assert(
3395 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
3396 "Terminator must be be BranchOnCond");
3397 VPValue *CondOfEarlyExitingVPBB =
3398 EarlyExitingVPBB->getTerminator()->getOperand(0);
3399 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3400 ? CondOfEarlyExitingVPBB
3401 : Builder.createNot(CondOfEarlyExitingVPBB);
3402
3403 // Split the middle block and have it conditionally branch to the early exit
3404 // block if CondToEarlyExit.
3405 VPValue *IsEarlyExitTaken =
3406 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3407 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3408 VPBasicBlock *VectorEarlyExitVPBB =
3409 Plan.createVPBasicBlock("vector.early.exit");
3410 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3411 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3412 NewMiddle->swapSuccessors();
3413
3414 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
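  // Sketch of the resulting CFG: the latch now branches to middle.split,
  // which branches to vector.early.exit (and on to the early-exit block) when
  // any lane took the early exit, and otherwise falls through to the original
  // middle block.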
3415
3416 // Update the exit phis in the early exit block.
3417 VPBuilder MiddleBuilder(NewMiddle);
3418 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3419 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3420 auto *ExitIRI = cast<VPIRPhi>(&R);
3421 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3422 // a single predecessor and 1 if it has two.
3423 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3424 if (ExitIRI->getNumOperands() != 1) {
3425 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3426 // predecessor. Extract its last lane.
3427 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3428 }
3429
3430 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3431 if (!IncomingFromEarlyExit->isLiveIn()) {
3432 // Update the incoming value from the early exit.
3433 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3434 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3435 "first.active.lane");
3436 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3437 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3438 nullptr, "early.exit.value");
3439 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3440 }
3441 }
3442 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3443
3444 // Replace the condition controlling the non-early exit from the vector loop
3445 // with one exiting if either the original condition of the vector latch is
3446 // true or the early exit has been taken.
3447 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3448 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3449 "Unexpected terminator");
3450 auto *IsLatchExitTaken =
3451 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3452 LatchExitingBranch->getOperand(1));
3453 auto *AnyExitTaken = Builder.createNaryOp(
3454 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3455 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3456 LatchExitingBranch->eraseFromParent();
3457}
3458
3459 /// This function tries to convert extended in-loop reductions to
3460/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3461/// valid. The created recipe must be decomposed to its constituent
3462/// recipes before execution.
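///
/// For example (illustrative): reduce.add(zext(A)) accumulating i8 inputs into
/// an i32 result matches here and is costed via TTI::getExtendedReductionCost
/// instead of as a separate extend plus reduction.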
3463static VPExpressionRecipe *
3464 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3465                                      VFRange &Range) {
3466 Type *RedTy = Ctx.Types.inferScalarType(Red);
3467 VPValue *VecOp = Red->getVecOp();
3468
3469 // Clamp the range if using extended-reduction is profitable.
3470 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
3471 Type *SrcTy) -> bool {
3472     return LoopVectorizationPlanner::getDecisionAndClampRange(
3473         [&](ElementCount VF) {
3474           auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3475           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3476           InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3477 Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
3478 CostKind);
3479 InstructionCost ExtCost =
3480 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3481 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3482 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3483 },
3484 Range);
3485 };
3486
3487 VPValue *A;
3488 // Match reduce(ext)).
3489 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3490 IsExtendedRedValidAndClampRange(
3491 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3492 cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
3493 Instruction::CastOps::ZExt,
3494 Ctx.Types.inferScalarType(A)))
3495 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3496
3497 return nullptr;
3498}
3499
3500 /// This function tries to convert extended in-loop reductions to
3501/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3502/// and valid. The created VPExpressionRecipe must be decomposed to its
3503/// constituent recipes before execution. Patterns of the
3504/// VPExpressionRecipe:
3505/// reduce.add(mul(...)),
3506/// reduce.add(mul(ext(A), ext(B))),
3507/// reduce.add(ext(mul(ext(A), ext(B)))).
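///
/// For example (illustrative source pattern): an integer dot product that
/// accumulates (i32)a[i] * (i32)b[i] over i8 inputs appears here as
/// reduce.add(mul(sext(A), sext(B))) and, when profitable, is bundled into a
/// single VPExpressionRecipe costed via TTI::getMulAccReductionCost.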
3508static VPExpressionRecipe *
3509 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3510                                           VPCostContext &Ctx, VFRange &Range) {
3511 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3512 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3513 return nullptr;
3514
3515 Type *RedTy = Ctx.Types.inferScalarType(Red);
3516
3517 // Clamp the range if using multiply-accumulate-reduction is profitable.
3518 auto IsMulAccValidAndClampRange =
3519 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
3520 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
3521     return LoopVectorizationPlanner::getDecisionAndClampRange(
3522         [&](ElementCount VF) {
3523           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3524           Type *SrcTy =
3525 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3526 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3527 InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
3528 isZExt, Opcode, RedTy, SrcVecTy, CostKind);
3529 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3530 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3531 InstructionCost ExtCost = 0;
3532 if (Ext0)
3533 ExtCost += Ext0->computeCost(VF, Ctx);
3534 if (Ext1)
3535 ExtCost += Ext1->computeCost(VF, Ctx);
3536 if (OuterExt)
3537 ExtCost += OuterExt->computeCost(VF, Ctx);
3538
3539 return MulAccCost.isValid() &&
3540 MulAccCost < ExtCost + MulCost + RedCost;
3541 },
3542 Range);
3543 };
3544
3545 VPValue *VecOp = Red->getVecOp();
3546 VPValue *A, *B;
3547 // Try to match reduce.add(mul(...)).
3548 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3549 auto *RecipeA =
3550 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3551 auto *RecipeB =
3552 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3553 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3554
3555 // Match reduce.add(mul(ext, ext)).
3556 if (RecipeA && RecipeB &&
3557 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
3558 match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3559 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3560 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
3561 Instruction::CastOps::ZExt,
3562 Mul, RecipeA, RecipeB, nullptr)) {
3563 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3564 }
3565 // Match reduce.add(mul).
3566 if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
3567 return new VPExpressionRecipe(Mul, Red);
3568 }
3569 // Match reduce.add(ext(mul(ext(A), ext(B)))).
3570 // All extend recipes must have same opcode or A == B
3571   // which can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
3572   if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
3573                                       m_ZExtOrSExt(m_VPValue()))))) {
3574 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3575 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3576 auto *Ext0 =
3577 cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
3578 auto *Ext1 =
3579 cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
3580 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3581 Ext0->getOpcode() == Ext1->getOpcode() &&
3582 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
3583 Instruction::CastOps::ZExt,
3584 Mul, Ext0, Ext1, Ext)) {
3585 auto *NewExt0 = new VPWidenCastRecipe(
3586 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3587 *Ext0, Ext0->getDebugLoc());
3588 NewExt0->insertBefore(Ext0);
3589
3590 VPWidenCastRecipe *NewExt1 = NewExt0;
3591 if (Ext0 != Ext1) {
3592 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3593 Ext->getResultType(), *Ext1, *Ext1,
3594 Ext1->getDebugLoc());
3595 NewExt1->insertBefore(Ext1);
3596 }
3597 Mul->setOperand(0, NewExt0);
3598 Mul->setOperand(1, NewExt1);
3599 Red->setOperand(1, Mul);
3600 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3601 }
3602 }
3603 return nullptr;
3604}
3605
3606 /// This function tries to create abstract recipes from the reduction recipe
3607 /// for the following optimizations and cost estimation.
3608 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3609                                                VPCostContext &Ctx,
3610 VFRange &Range) {
3611 VPExpressionRecipe *AbstractR = nullptr;
3612 auto IP = std::next(Red->getIterator());
3613 auto *VPBB = Red->getParent();
3614 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3615 AbstractR = MulAcc;
3616 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3617 AbstractR = ExtRed;
3618 // Cannot create abstract inloop reduction recipes.
3619 if (!AbstractR)
3620 return;
3621
3622 AbstractR->insertBefore(*VPBB, IP);
3623 Red->replaceAllUsesWith(AbstractR);
3624}
3625
3636
3638 if (Plan.hasScalarVFOnly())
3639 return;
3640
3641#ifndef NDEBUG
3642 VPDominatorTree VPDT;
3643 VPDT.recalculate(Plan);
3644#endif
3645
3646 SmallVector<VPValue *> VPValues;
3649 append_range(VPValues, Plan.getLiveIns());
3650 for (VPRecipeBase &R : *Plan.getEntry())
3651 append_range(VPValues, R.definedValues());
3652
3653 auto *VectorPreheader = Plan.getVectorPreheader();
3654 for (VPValue *VPV : VPValues) {
3655     if (all_of(VPV->users(), [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
3656         (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3657 isa<Constant>(VPV->getLiveInIRValue())))
3658 continue;
3659
3660 // Add explicit broadcast at the insert point that dominates all users.
3661 VPBasicBlock *HoistBlock = VectorPreheader;
3662 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3663 for (VPUser *User : VPV->users()) {
3664 if (User->usesScalars(VPV))
3665 continue;
3666 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3667 HoistPoint = HoistBlock->begin();
3668 else
3669 assert(VPDT.dominates(VectorPreheader,
3670 cast<VPRecipeBase>(User)->getParent()) &&
3671 "All users must be in the vector preheader or dominated by it");
3672 }
3673
3674 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3675 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3676 VPV->replaceUsesWithIf(Broadcast,
3677 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3678 return Broadcast != &U && !U.usesScalars(VPV);
3679 });
3680 }
3681}
3682
3683 void VPlanTransforms::materializeConstantVectorTripCount(
3684     VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3685     PredicatedScalarEvolution &PSE) {
3686   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3687 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3688
3689 VPValue *TC = Plan.getTripCount();
3690 // Skip cases for which the trip count may be non-trivial to materialize.
3691 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3692 // tail is required.
3693 if (!Plan.hasScalarTail() ||
3694       Plan.getMiddleBlock()->getSingleSuccessor() ==
3695           Plan.getScalarPreheader() ||
3696 !TC->isLiveIn())
3697 return;
3698
3699   // Materialize vector trip counts for constants early if it can simply
3700   // be computed as (Original TC / (VF * UF)) * (VF * UF).
3701 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3702 // tail-folded loops.
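  // For example (illustrative): for a constant trip count of 17 with VF = 4
  // and UF = 2, VFxUF = 8 and the vector trip count folds to
  // (17 udiv 8) * 8 = 16.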
3703 ScalarEvolution &SE = *PSE.getSE();
3704 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3705 if (!isa<SCEVConstant>(TCScev))
3706 return;
3707 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3708 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3709 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3710 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3711}
3712
3713 void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
3714                                                     VPBasicBlock *VectorPH) {
3715   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
3716   if (BTC->getNumUsers() == 0)
3717 return;
3718
3719 VPBuilder Builder(VectorPH, VectorPH->begin());
3720 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3721 auto *TCMO = Builder.createNaryOp(
3722 Instruction::Sub,
3723 {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
3724 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3725 BTC->replaceAllUsesWith(TCMO);
3726}
3727
3728 void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
3729   if (Plan.hasScalarVFOnly())
3730 return;
3731
3732 VPTypeAnalysis TypeInfo(Plan);
3733 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3734 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3735       vp_depth_first_shallow(Plan.getEntry()));
3736   auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3737 vp_depth_first_shallow(LoopRegion->getEntry()));
3738 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3739 // VPInstructions, excluding ones in replicate regions. Those are not
3740 // materialized explicitly yet. Those vector users are still handled in
3741 // VPReplicateRegion::execute(), via shouldPack().
3742 // TODO: materialize build vectors for replicating recipes in replicating
3743 // regions.
3744 for (VPBasicBlock *VPBB :
3745 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3746 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3747       if (!isa<VPReplicateRecipe, VPInstruction>(&R))
3748         continue;
3749 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3750 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3751 VPRegionBlock *ParentRegion =
3752           cast<VPRecipeBase>(U)->getParent()->getParent();
3753       return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3754 };
3755 if ((isa<VPReplicateRecipe>(DefR) &&
3756 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3757 (isa<VPInstruction>(DefR) &&
3759 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3760 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3761 continue;
3762
3763 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3764 unsigned Opcode = ScalarTy->isStructTy()
3765                           ? VPInstruction::BuildStructVector
3766                           : VPInstruction::BuildVector;
3767     auto *BuildVector = new VPInstruction(Opcode, {DefR});
3768 BuildVector->insertAfter(DefR);
3769
3770 DefR->replaceUsesWithIf(
3771 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3772 VPUser &U, unsigned) {
3773 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3774 });
3775 }
3776 }
3777}
3778
3780 VPBasicBlock *VectorPHVPBB,
3781 bool TailByMasking,
3782 bool RequiresScalarEpilogue) {
3783 VPValue &VectorTC = Plan.getVectorTripCount();
3784 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3785 // There's nothing to do if there are no users of the vector trip count or its
3786 // IR value has already been set.
3787 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3788 return;
3789
3790 VPValue *TC = Plan.getTripCount();
3791 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3792 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3793 VPValue *Step = &Plan.getVFxUF();
3794
3795 // If the tail is to be folded by masking, round the number of iterations N
3796 // up to a multiple of Step instead of rounding down. This is done by first
3797 // adding Step-1 and then rounding down. Note that it's ok if this addition
3798 // overflows: the vector induction variable will eventually wrap to zero given
3799 // that it starts at zero and its Step is a power of two; the loop will then
3800 // exit, with the last early-exit vector comparison also producing all-true.
3801 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3802 // is accounted for in emitIterationCountCheck that adds an overflow check.
3803 if (TailByMasking) {
3804 TC = Builder.createNaryOp(
3805 Instruction::Add,
3806 {TC, Builder.createNaryOp(
3807 Instruction::Sub,
3808 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
3809 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3810 }
3811
3812 // Now we need to generate the expression for the part of the loop that the
3813 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3814 // iterations are not required for correctness, or N - Step, otherwise. Step
3815 // is equal to the vectorization factor (number of SIMD elements) times the
3816 // unroll factor (number of SIMD instructions).
3817 VPValue *R =
3818 Builder.createNaryOp(Instruction::URem, {TC, Step},
3819 DebugLoc::getCompilerGenerated(), "n.mod.vf");
3820
3821 // There are cases where we *must* run at least one iteration in the remainder
3822 // loop. See the cost model for when this can happen. If the step evenly
3823 // divides the trip count, we set the remainder to be equal to the step. If
3824 // the step does not evenly divide the trip count, no adjustment is necessary
3825 // since there will already be scalar iterations. Note that the minimum
3826 // iterations check ensures that N >= Step.
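  // For example (illustrative): with N = 10 and Step = 4, n.mod.vf = 2 and
  // n.vec = 8. With tail folding, N is first rounded up to 13, giving
  // n.vec = 12, the smallest multiple of Step >= 10. If a scalar epilogue is
  // required and N = 12, the zero remainder is bumped to Step, so n.vec = 8
  // and 4 iterations are left for the scalar loop.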
3827 if (RequiresScalarEpilogue) {
3828 assert(!TailByMasking &&
3829 "requiring scalar epilogue is not supported with fail folding");
3830 VPValue *IsZero = Builder.createICmp(
3831 CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
3832 R = Builder.createSelect(IsZero, Step, R);
3833 }
3834
3835 VPValue *Res = Builder.createNaryOp(
3836 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
3837 VectorTC.replaceAllUsesWith(Res);
3838}
3839
3841 ElementCount VFEC) {
3842 VPBuilder Builder(VectorPH, VectorPH->begin());
3843 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3844 VPValue &VF = Plan.getVF();
3845 VPValue &VFxUF = Plan.getVFxUF();
3846 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
3847 // used.
3848 // TODO: Assert that they aren't used.
3849
3850 // If there are no users of the runtime VF, compute VFxUF by constant folding
3851 // the multiplication of VF and UF.
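  // For example (illustrative): for VF = vscale x 4 and UF = 2 this emits a
  // single element-count computation (conceptually vscale * 8); if the runtime
  // VF itself has users, VF is materialized as vscale * 4 below and VFxUF as
  // VF * UF.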
3852 if (VF.getNumUsers() == 0) {
3853 VPValue *RuntimeVFxUF =
3854 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
3855 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
3856 return;
3857 }
3858
3859 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
3860 // vscale) * UF.
3861 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
3863 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
3864     VF.replaceUsesWithIf(
3865         BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
3866 }
3867 VF.replaceAllUsesWith(RuntimeVF);
3868
3869 VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
3870 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
3871 VFxUF.replaceAllUsesWith(MulByUF);
3872}
3873
3876 const DataLayout &DL = SE.getDataLayout();
3877 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
3878
3879 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
3880 BasicBlock *EntryBB = Entry->getIRBasicBlock();
3881 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
3882 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
3883     if (isa<VPIRInstruction>(&R))
3884       continue;
3885 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3886 if (!ExpSCEV)
3887 break;
3888 const SCEV *Expr = ExpSCEV->getSCEV();
3889 Value *Res =
3890 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
3891 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
3892 VPValue *Exp = Plan.getOrAddLiveIn(Res);
3893 ExpSCEV->replaceAllUsesWith(Exp);
3894 if (Plan.getTripCount() == ExpSCEV)
3895 Plan.resetTripCount(Exp);
3896 ExpSCEV->eraseFromParent();
3897 }
3899 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
3900 "after any VPIRInstructions");
3901 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
3902 // to the VPIRBasicBlock.
3903 auto EI = Entry->begin();
3904 for (Instruction &I : drop_end(*EntryBB)) {
3905 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
3906 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
3907 EI++;
3908 continue;
3909 }
3911 }
3912
3913 return ExpandedSCEVs;
3914}
3915
3916 /// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
3917 /// be converted to a narrower recipe. \p OpV is used by a wide recipe that feeds
3918 /// a store interleave group at index \p Idx; \p WideMember0 is the recipe feeding
3919 /// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
3920 /// an index-independent load if it feeds all wide ops at all indices (\p OpV
3921 /// must be the operand at index \p OpIdx of both the current recipe and the
3922 /// lane-0 recipe \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
3923 /// load if \p OpV is defined at \p Idx of a load interleave group.
3924static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
3925 VPValue *OpV, unsigned Idx) {
3926 auto *DefR = OpV->getDefiningRecipe();
3927 if (!DefR)
3928 return WideMember0->getOperand(OpIdx) == OpV;
3929 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
3930 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
3931
3932 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
3933 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
3934 return false;
3935}
3936
3937/// Returns true if \p IR is a full interleave group with factor and number of
3938/// members both equal to \p VF. The interleave group must also access the full
3939/// vector width \p VectorRegWidth.
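///
/// For example (illustrative): a factor-4 group whose 4 members are i32 with
/// VF = 4 covers 4 * 32 = 128 bits and is consecutive for a 128-bit vector
/// register width.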
3940 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3941                                          unsigned VF, VPTypeAnalysis &TypeInfo,
3942 unsigned VectorRegWidth) {
3943 if (!InterleaveR)
3944 return false;
3945
3946 Type *GroupElementTy = nullptr;
3947 if (InterleaveR->getStoredValues().empty()) {
3948 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
3949 if (!all_of(InterleaveR->definedValues(),
3950 [&TypeInfo, GroupElementTy](VPValue *Op) {
3951 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3952 }))
3953 return false;
3954 } else {
3955 GroupElementTy =
3956 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
3957 if (!all_of(InterleaveR->getStoredValues(),
3958 [&TypeInfo, GroupElementTy](VPValue *Op) {
3959 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3960 }))
3961 return false;
3962 }
3963
3964 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3965 auto IG = InterleaveR->getInterleaveGroup();
3966 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3967 GroupSize == VectorRegWidth;
3968}
3969
3970/// Returns true if \p VPValue is a narrow VPValue.
3971static bool isAlreadyNarrow(VPValue *VPV) {
3972 if (VPV->isLiveIn())
3973 return true;
3974 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
3975 return RepR && RepR->isSingleScalar();
3976}
3977
3978 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3979                                              unsigned VectorRegWidth) {
3980 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3981 if (!VectorLoop)
3982 return;
3983
3984 VPTypeAnalysis TypeInfo(Plan);
3985
3986 unsigned VFMinVal = VF.getKnownMinValue();
3987   SmallVector<VPInterleaveRecipe *> StoreGroups;
3988   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3991 continue;
3992
3995 continue;
3996
3997 // Bail out on recipes not supported at the moment:
3998 // * phi recipes other than the canonical induction
3999 // * recipes writing to memory except interleave groups
4000 // Only support plans with a canonical induction phi.
4001 if (R.isPhi())
4002 return;
4003
4004 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4005 if (R.mayWriteToMemory() && !InterleaveR)
4006 return;
4007
4008 // Do not narrow interleave groups if there are VectorPointer recipes and
4009 // the plan was unrolled. The recipe implicitly uses VF from
4010 // VPTransformState.
4011 // TODO: Remove restriction once the VF for the VectorPointer offset is
4012 // modeled explicitly as operand.
4013 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4014 return;
4015
4016 // All other ops are allowed, but we reject uses that cannot be converted
4017 // when checking all allowed consumers (store interleave groups) below.
4018 if (!InterleaveR)
4019 continue;
4020
4021 // Bail out on non-consecutive interleave groups.
4022 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
4023 VectorRegWidth))
4024 return;
4025
4026 // Skip read interleave groups.
4027 if (InterleaveR->getStoredValues().empty())
4028 continue;
4029
4030 // Narrow interleave groups, if all operands are already matching narrow
4031 // ops.
4032 auto *Member0 = InterleaveR->getStoredValues()[0];
4033 if (isAlreadyNarrow(Member0) &&
4034 all_of(InterleaveR->getStoredValues(),
4035 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4036 StoreGroups.push_back(InterleaveR);
4037 continue;
4038 }
4039
4040 // For now, we only support full interleave groups storing load interleave
4041 // groups.
4042 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4043 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4044 if (!DefR)
4045 return false;
4046 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4047 return IR && IR->getInterleaveGroup()->isFull() &&
4048 IR->getVPValue(Op.index()) == Op.value();
4049 })) {
4050 StoreGroups.push_back(InterleaveR);
4051 continue;
4052 }
4053
4054     // Check if all values feeding InterleaveR are matching wide recipes whose
4055     // operands can be narrowed.
4056 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
4057 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
4058 if (!WideMember0)
4059 return;
4060 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4061 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4062 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4063 R->getNumOperands() > 2)
4064 return;
4065 if (any_of(enumerate(R->operands()),
4066 [WideMember0, Idx = I](const auto &P) {
4067 const auto &[OpIdx, OpV] = P;
4068 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4069 }))
4070 return;
4071 }
4072 StoreGroups.push_back(InterleaveR);
4073 }
4074
4075 if (StoreGroups.empty())
4076 return;
4077
4078 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4079 SmallPtrSet<VPValue *, 4> NarrowedOps;
4080 auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
4081 auto *R = V->getDefiningRecipe();
4082 if (!R || NarrowedOps.contains(V))
4083 return V;
4084 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4085 // Narrow interleave group to wide load, as transformed VPlan will only
4086 // process one original iteration.
4087 auto *L = new VPWidenLoadRecipe(
4088 *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
4089 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4090 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4091 L->insertBefore(LoadGroup);
4092 NarrowedOps.insert(L);
4093 return L;
4094 }
4095
4096 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4097 assert(RepR->isSingleScalar() &&
4098 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4099 "must be a single scalar load");
4100 NarrowedOps.insert(RepR);
4101 return RepR;
4102 }
4103 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4104
4105 VPValue *PtrOp = WideLoad->getAddr();
4106 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4107 PtrOp = VecPtr->getOperand(0);
4108 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4109 // process one original iteration.
4110 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4111 /*IsUniform*/ true,
4112 /*Mask*/ nullptr, *WideLoad);
4113 N->insertBefore(WideLoad);
4114 NarrowedOps.insert(N);
4115 return N;
4116 };
4117
4118 // Narrow operation tree rooted at store groups.
4119 for (auto *StoreGroup : StoreGroups) {
4120 VPValue *Res = nullptr;
4121 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4122 if (isAlreadyNarrow(Member0)) {
4123 Res = Member0;
4124 } else if (auto *WideMember0 =
4125                    dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
4126       for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4127 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4128 Res = WideMember0;
4129 } else {
4130 Res = NarrowOp(Member0);
4131 }
4132
4133 auto *S = new VPWidenStoreRecipe(
4134 *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
4135 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4136 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4137 S->insertBefore(StoreGroup);
4138 StoreGroup->eraseFromParent();
4139 }
4140
4141 // Adjust induction to reflect that the transformed plan only processes one
4142 // original iteration.
4143 auto *CanIV = Plan.getCanonicalIV();
4144 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4145 VPBuilder PHBuilder(Plan.getVectorPreheader());
4146
4147 VPValue *UF = Plan.getOrAddLiveIn(
4148 ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
4149 if (VF.isScalable()) {
4150 VPValue *VScale = PHBuilder.createElementCount(
4151 CanIV->getScalarType(), ElementCount::getScalable(1));
4152 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4153 Inc->setOperand(1, VScaleUF);
4154 Plan.getVF().replaceAllUsesWith(VScale);
4155 } else {
4156 Inc->setOperand(1, UF);
4157     Plan.getVF().replaceAllUsesWith(
4158         Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
4159 }
4160 removeDeadRecipes(Plan);
4161}
4162
4163/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4164/// BranchOnCond recipe.
4165 void VPlanTransforms::addBranchWeightToMiddleTerminator(
4166     VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4167 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4168 auto *MiddleTerm =
4169       dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4170   // Only add branch metadata if there is a (conditional) terminator.
4171 if (!MiddleTerm)
4172 return;
4173
4174 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4175 "must have a BranchOnCond");
4176   // Assume that `TripCount % VectorStep` is equally distributed.
4177 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4178 if (VF.isScalable() && VScaleForTuning.has_value())
4179 VectorStep *= *VScaleForTuning;
4180 assert(VectorStep > 0 && "trip count should not be zero");
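  // For example (illustrative): with VF = 4, UF = 2 and no vscale tuning,
  // VectorStep = 8 and the terminator gets branch weights {1, 7}, modelling
  // that a random trip count is divisible by 8 roughly one time in eight.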
4181 MDBuilder MDB(Plan.getContext());
4182 MDNode *BranchWeights =
4183 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4184 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4185}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:381
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static Value * tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:791
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1567
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1077
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
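A small sketch combining the Type factory functions and queries listed above; the predicate is illustrative.
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Sketch: factory functions return context-owned singleton types, and the
// is*Ty()/getScalarSizeInBits() queries classify them.
static bool isWideEnoughInt(LLVMContext &Ctx, Type *Ty) {
  Type *I32 = Type::getInt32Ty(Ctx);
  return Ty->isIntegerTy() &&
         Ty->getScalarSizeInBits() >= I32->getScalarSizeInBits();
}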
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3477
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3764
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3839
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3791
iterator end()
Definition VPlan.h:3801
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3799
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3852
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:246
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:619
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:591
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:664
const VPRecipeBase & back() const
Definition VPlan.h:3813
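A hedged sketch combining the VPBasicBlock accessors above; it assumes the in-tree "VPlan.h" (VPlan is internal to LoopVectorize) and using namespace llvm, and the helper name is illustrative.
// Split VPBB right after its phi-like recipes (the range returned by phis()):
// the phis stay in VPBB and the remaining recipes move to the new block that
// splitAt inserts between VPBB and its successors.
static VPBasicBlock *splitAfterPhis(VPBasicBlock *VPBB) {
  return VPBB->splitAt(VPBB->getFirstNonPhi());
}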
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2403
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2437
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2427
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2443
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2423
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:232
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:253
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:172
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:191
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:210
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2934
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
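A hedged sketch of the VPBuilder usage pattern documented above (in-tree VPlan headers assumed; helper name, operands, and the recipe name are illustrative).
// Append a VPInstruction computing A + B at the end of VPBB.
static VPInstruction *emitAdd(VPBasicBlock *VPBB, VPValue *A, VPValue *B) {
  VPBuilder Builder;
  Builder.setInsertPoint(VPBB); // created recipes are appended to VPBB
  return Builder.createNaryOp(Instruction::Add, {A, B}, /*Inst=*/nullptr,
                              "sketch.add");
}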
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3420
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:3447
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3508
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:2979
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1977
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2025
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2014
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3917
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:3941
Class to record and manage LLVM IR flags.
Definition VPlan.h:600
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:942
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:983
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1060
@ FirstOrderRecurrenceSplice
Definition VPlan.h:989
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1013
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1010
@ CanonicalIVIncrementForPart
Definition VPlan.h:1003
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2544
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2565
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2617
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2576
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3091
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:394
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition VPlan.h:477
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPBasicBlock * getParent()
Definition VPlan.h:415
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:482
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
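A hedged sketch tying the VPRecipeBase queries and mutators above together; it loosely mirrors a dead-recipe check and is not the actual implementation.
// A recipe with no side effects whose defined values have no users can be
// unlinked and deleted.
static void eraseIfTriviallyDead(VPRecipeBase &R) {
  if (R.mayHaveSideEffects())
    return;
  for (VPValue *Def : R.definedValues())
    if (Def->getNumUsers() != 0)
      return;
  R.eraseFromParent(); // unlinks R from its parent VPBasicBlock and deletes it
}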
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2812
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2666
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:3952
const VPBlockBase * getEntry() const
Definition VPlan.h:3988
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4005
const VPBlockBase * getExiting() const
Definition VPlan.h:4000
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4013
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2856
bool isSingleScalar() const
Definition VPlan.h:2901
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2925
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3654
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:521
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:586
virtual VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
operand_iterator op_end()
Definition VPlanValue.h:265
operand_iterator op_begin()
Definition VPlanValue.h:263
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:186
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1403
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1407
user_range users()
Definition VPlanValue.h:134
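A hedged sketch of the VPValue replacement entry points above; the "keep one user" predicate is an illustrative example of a ShouldReplace callback.
// RAUW-style replacement on VPValues: the second form rewrites every use of
// Old except the ones in a designated user (for example, the recipe that
// computes New, to avoid creating a self-reference).
static void replaceAllUses(VPValue *Old, VPValue *New) {
  Old->replaceAllUsesWith(New);
}
static void replaceUsesExcept(VPValue *Old, VPValue *New, VPUser *Keep) {
  Old->replaceUsesWithIf(
      New, [Keep](VPUser &U, unsigned) { return &U != Keep; });
}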
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1841
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1885
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3549
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1482
A recipe for handling GEP instructions.
Definition VPlan.h:1769
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2042
PHINode * getPHINode() const
Definition VPlan.h:2084
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2070
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2087
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2117
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2198
A recipe for widening vector intrinsics.
Definition VPlan.h:1540
A common base class for widening memory operations.
Definition VPlan.h:3133
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3195
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3188
A recipe for widened phis.
Definition VPlan.h:2253
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1439
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4055
bool hasVF(ElementCount VF) const
Definition VPlan.h:4264
LLVMContext & getContext() const
Definition VPlan.h:4252
VPBasicBlock * getEntry()
Definition VPlan.h:4154
VPRegionBlock * createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="", bool IsReplicator=false)
Create a new VPRegionBlock with Entry, Exiting and Name.
Definition VPlan.h:4395
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4244
bool hasScalableVF() const
Definition VPlan.h:4265
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4250
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4247
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4216
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4321
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4237
unsigned getUF() const
Definition VPlan.h:4284
bool hasUF(unsigned UF) const
Definition VPlan.h:4282
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4206
void setVF(ElementCount VF)
Definition VPlan.h:4258
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4297
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1037
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4230
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4179
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4385
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4306
bool hasScalarVFOnly() const
Definition VPlan.h:4275
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4197
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4336
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition VPlan.h:4360
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4202
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4333
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4159
void setUF(unsigned UF)
Definition VPlan.h:4289
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4437
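A hedged sketch combining several of the VPlan accessors above (in-tree VPlan headers plus llvm/IR/Constants.h assumed; the guard and the constant are illustrative).
// A guard of the kind used at the top of a VF/UF-specific transform, followed
// by registering a constant live-in; repeated getOrAddLiveIn calls with the
// same IR value return the same VPValue.
static bool prepare(VPlan &Plan, ElementCount BestVF, unsigned BestUF) {
  if (!Plan.hasVF(BestVF) || !Plan.hasUF(BestUF))
    return false;
  if (!Plan.getVectorLoopRegion())
    return false; // loop regions may already have been dissolved
  Type *I64 = Type::getInt64Ty(Plan.getContext());
  Plan.getOrAddLiveIn(ConstantInt::get(I64, 1));
  return true;
}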
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount, Op0_t, Op1_t > m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::BranchOnCond, Op0_t > m_BranchOnCond(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
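The m_* entries above come from several matcher namespaces (IR-level PatternMatch, GlobalISel's MIPatternMatch, and the VPlan-level matchers); all follow the same match() idiom. A hedged sketch against the VPlan matchers listed here, with an illustrative helper name (in-tree VPlan headers assumed):
// Recognize a terminator of the form BranchOnCount(<any>, <any>), or a
// BranchOnCond whose condition is the constant true.
static bool isCountedOrTakenBranch(VPRecipeBase *Term) {
  using namespace VPlanPatternMatch;
  return Term && (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
                  match(Term, m_BranchOnCond(m_True())));
}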
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
Definition VPlanUtils.h:44
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool isHeaderMask(const VPValue *V, VPlan &Plan)
Return true if V is a header mask in Plan.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:720
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
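A hedged sketch of the traversal idiom built from blocksOnly, vp_depth_first_deep, and make_early_inc_range above (in-tree VPlan headers and using namespace llvm assumed; the helper is illustrative).
// Visit every VPBasicBlock, including those nested inside regions, and use
// make_early_inc_range so recipes can be erased while iterating.
static void visitAllRecipes(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      (void)R; // inspect, rewrite, or R.eraseFromParent() here
    }
  }
}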
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1624
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:325
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2068
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:77
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:836
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2296
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3255
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3215
A recipe for widening select instructions.
Definition VPlan.h:1723
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3337
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3295
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
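A hedged sketch of how the transform entry points above are typically invoked; the particular sequence is illustrative, not the actual pipeline (in-tree VPlanTransforms.h assumed).
// Simple transforms can be called directly, while runPass forwards extra
// arguments (here a VPBuilder) and returns the transform's result for the
// bool-returning overload documented above.
static void runSomeCleanups(VPlan &Plan) {
  VPlanTransforms::removeDeadRecipes(Plan);
  VPlanTransforms::simplifyRecipes(Plan);

  VPBuilder Builder;
  bool Changed = VPlanTransforms::runPass(
      VPlanTransforms::adjustFixedOrderRecurrences, Plan, Builder);
  (void)Changed;
}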