1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
43static cl::opt<bool> EnableWideActiveLaneMask(
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlan &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan.getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
82 vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
95 Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 VPIRMetadata(*Store), Ingredient.getDebugLoc());
101 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
102 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
103 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
104 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
105 if (VectorID == Intrinsic::not_intrinsic)
106 return false;
107 NewRecipe = new VPWidenIntrinsicRecipe(
108 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
109 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
110 CI->getDebugLoc());
111 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
112 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
113 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
114 NewRecipe = new VPWidenCastRecipe(
115 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
116 } else {
117 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
118 }
119 }
120
121 NewRecipe->insertBefore(&Ingredient);
122 if (NewRecipe->getNumDefinedValues() == 1)
123 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
124 else
125 assert(NewRecipe->getNumDefinedValues() == 0 &&
126 "Only recipes with zero or one defined values expected");
127 Ingredient.eraseFromParent();
128 }
129 }
130 return true;
131}
132
133static bool sinkScalarOperands(VPlan &Plan) {
134 auto Iter = vp_depth_first_deep(Plan.getEntry());
135 bool Changed = false;
136 // First, collect the operands of all recipes in replicate blocks as seeds for
137 // sinking.
138 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
139 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
140 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
141 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
142 continue;
143 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
144 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
145 continue;
146 for (auto &Recipe : *VPBB) {
147 for (VPValue *Op : Recipe.operands())
148 if (auto *Def =
149 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
150 WorkList.insert({VPBB, Def});
151 }
152 }
153
154 bool ScalarVFOnly = Plan.hasScalarVFOnly();
155 // Try to sink each replicate or scalar IV steps recipe in the worklist.
156 for (unsigned I = 0; I != WorkList.size(); ++I) {
157 VPBasicBlock *SinkTo;
158 VPSingleDefRecipe *SinkCandidate;
159 std::tie(SinkTo, SinkCandidate) = WorkList[I];
160 if (SinkCandidate->getParent() == SinkTo ||
161 SinkCandidate->mayHaveSideEffects() ||
162 SinkCandidate->mayReadOrWriteMemory())
163 continue;
164 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
165 if (!ScalarVFOnly && RepR->isSingleScalar())
166 continue;
167 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
168 continue;
169
170 bool NeedsDuplicating = false;
171 // All recipe users of the sink candidate must be in the same block SinkTo,
172 // or all users outside of SinkTo must be uniform-after-vectorization
173 // (i.e., only the first lane is used). In the latter case, we need to
174 // duplicate SinkCandidate.
175 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
176 SinkCandidate](VPUser *U) {
177 auto *UI = cast<VPRecipeBase>(U);
178 if (UI->getParent() == SinkTo)
179 return true;
180 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
181 // We only know how to duplicate VPReplicateRecipes and
182 // VPScalarIVStepsRecipes for now.
183 return NeedsDuplicating &&
184 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
185 };
186 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
187 continue;
188
189 if (NeedsDuplicating) {
190 if (ScalarVFOnly)
191 continue;
192 VPSingleDefRecipe *Clone;
193 if (auto *SinkCandidateRepR =
194 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
195 // TODO: Handle converting to uniform recipes as separate transform,
196 // then cloning should be sufficient here.
197 Instruction *I = SinkCandidate->getUnderlyingInstr();
198 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
199 nullptr /*Mask*/, *SinkCandidateRepR);
200 // TODO: add ".cloned" suffix to name of Clone's VPValue.
201 } else {
202 Clone = SinkCandidate->clone();
203 }
204
205 Clone->insertBefore(SinkCandidate);
206 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
207 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
208 });
209 }
210 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
211 for (VPValue *Op : SinkCandidate->operands())
212 if (auto *Def =
213 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
214 WorkList.insert({SinkTo, Def});
215 Changed = true;
216 }
217 return Changed;
218}
219
220/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
221/// the mask.
222static VPValue *getPredicatedMask(VPRegionBlock *R) {
223 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
224 if (!EntryBB || EntryBB->size() != 1 ||
225 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
226 return nullptr;
227
228 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
229}
230
231/// If \p R is a triangle region, return the 'then' block of the triangle.
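/// A triangle region here has the shape
///   Entry -> Then -> Exiting   (with a second edge Entry -> Exiting),
/// i.e. exactly one of the entry's two successors falls through to the other.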
232static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
233 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
234 if (EntryBB->getNumSuccessors() != 2)
235 return nullptr;
236
237 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
238 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
239 if (!Succ0 || !Succ1)
240 return nullptr;
241
242 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
243 return nullptr;
244 if (Succ0->getSingleSuccessor() == Succ1)
245 return Succ0;
246 if (Succ1->getSingleSuccessor() == Succ0)
247 return Succ1;
248 return nullptr;
249}
250
251// Merge replicate regions into their successor region, if a replicate region
252// is connected to a successor replicate region with the same predicate by a
253// single, empty VPBasicBlock.
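// For example, two replicate regions both predicated on the same mask and
// separated only by an empty block, e.g.
//   pred.store.region1 --> empty VPBB --> pred.store.region2
// are merged by moving region1's recipes into region2 and rewiring region1's
// predecessors to the empty block.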
254static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
255 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
256
257 // Collect replicate regions followed by an empty block, followed by another
258 // replicate region with a matching mask, to process up front. This avoids
259 // iterator invalidation issues while merging regions.
260 SmallVector<VPRegionBlock *, 8> WorkList;
261 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
262 vp_depth_first_deep(Plan.getEntry()))) {
263 if (!Region1->isReplicator())
264 continue;
265 auto *MiddleBasicBlock =
266 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
267 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
268 continue;
269
270 auto *Region2 =
271 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
272 if (!Region2 || !Region2->isReplicator())
273 continue;
274
275 VPValue *Mask1 = getPredicatedMask(Region1);
276 VPValue *Mask2 = getPredicatedMask(Region2);
277 if (!Mask1 || Mask1 != Mask2)
278 continue;
279
280 assert(Mask1 && Mask2 && "both regions must have conditions");
281 WorkList.push_back(Region1);
282 }
283
284 // Move recipes from Region1 to its successor region, if both are triangles.
285 for (VPRegionBlock *Region1 : WorkList) {
286 if (TransformedRegions.contains(Region1))
287 continue;
288 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
289 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
290
291 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
292 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
293 if (!Then1 || !Then2)
294 continue;
295
296 // Note: No fusion-preventing memory dependencies are expected in either
297 // region. Such dependencies should be rejected during earlier dependence
298 // checks, which guarantee accesses can be re-ordered for vectorization.
299 //
300 // Move recipes to the successor region.
301 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
302 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
303
304 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
305 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
306
307 // Move VPPredInstPHIRecipes from the merge block to the successor region's
308 // merge block. Update all users inside the successor region to use the
309 // original values.
310 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
311 VPValue *PredInst1 =
312 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
313 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
314 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
315 return cast<VPRecipeBase>(&U)->getParent() == Then2;
316 });
317
318 // Remove phi recipes that are unused after merging the regions.
319 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
320 Phi1ToMove.eraseFromParent();
321 continue;
322 }
323 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
324 }
325
326 // Remove the dead recipes in Region1's entry block.
327 for (VPRecipeBase &R :
328 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
329 R.eraseFromParent();
330
331 // Finally, remove the first region.
332 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
333 VPBlockUtils::disconnectBlocks(Pred, Region1);
334 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
335 }
336 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
337 TransformedRegions.insert(Region1);
338 }
339
340 return !TransformedRegions.empty();
341}
342
343static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
344 VPlan &Plan) {
345 Instruction *Instr = PredRecipe->getUnderlyingInstr();
346 // Build the triangular if-then region.
347 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
348 assert(Instr->getParent() && "Predicated instruction not in any basic block");
349 auto *BlockInMask = PredRecipe->getMask();
350 auto *MaskDef = BlockInMask->getDefiningRecipe();
351 auto *BOMRecipe = new VPBranchOnMaskRecipe(
352 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
353 auto *Entry =
354 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
355
356 // Replace predicated replicate recipe with a replicate recipe without a
357 // mask but in the replicate region.
358 auto *RecipeWithoutMask = new VPReplicateRecipe(
359 PredRecipe->getUnderlyingInstr(),
360 make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
361 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
362 auto *Pred =
363 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
364
365 VPPredInstPHIRecipe *PHIRecipe = nullptr;
366 if (PredRecipe->getNumUsers() != 0) {
367 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
368 RecipeWithoutMask->getDebugLoc());
369 PredRecipe->replaceAllUsesWith(PHIRecipe);
370 PHIRecipe->setOperand(0, RecipeWithoutMask);
371 }
372 PredRecipe->eraseFromParent();
373 auto *Exiting =
374 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
375 VPRegionBlock *Region =
376 Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
377
378 // Note: first set Entry as region entry and then connect successors starting
379 // from it in order, to propagate the "parent" of each VPBasicBlock.
380 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
381 VPBlockUtils::connectBlocks(Pred, Exiting);
382
383 return Region;
384}
385
386static void addReplicateRegions(VPlan &Plan) {
387 SmallVector<VPReplicateRecipe *> WorkList;
388 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
389 vp_depth_first_deep(Plan.getEntry()))) {
390 for (VPRecipeBase &R : *VPBB)
391 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
392 if (RepR->isPredicated())
393 WorkList.push_back(RepR);
394 }
395 }
396
397 unsigned BBNum = 0;
398 for (VPReplicateRecipe *RepR : WorkList) {
399 VPBasicBlock *CurrentBlock = RepR->getParent();
400 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
401
402 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
403 SplitBlock->setName(
404 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
405 // Record predicated instructions for above packing optimizations.
406 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
407 Region->setParent(CurrentBlock->getParent());
408 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
409
410 VPRegionBlock *ParentRegion = Region->getParent();
411 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
412 ParentRegion->setExiting(SplitBlock);
413 }
414}
415
416/// Remove redundant VPBasicBlocks by merging them into their predecessor if
417/// the predecessor has a single successor.
418static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
419 SmallVector<VPBasicBlock *> WorkList;
420 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
421 vp_depth_first_deep(Plan.getEntry()))) {
422 // Don't fold the blocks in the skeleton of the Plan into their single
423 // predecessors for now.
424 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
425 if (!VPBB->getParent())
426 continue;
427 auto *PredVPBB =
428 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
429 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
430 isa<VPIRBasicBlock>(PredVPBB))
431 continue;
432 WorkList.push_back(VPBB);
433 }
434
435 for (VPBasicBlock *VPBB : WorkList) {
436 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
437 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
438 R.moveBefore(*PredVPBB, PredVPBB->end());
439 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
440 auto *ParentRegion = VPBB->getParent();
441 if (ParentRegion && ParentRegion->getExiting() == VPBB)
442 ParentRegion->setExiting(PredVPBB);
443 for (auto *Succ : to_vector(VPBB->successors())) {
444 VPBlockUtils::disconnectBlocks(VPBB, Succ);
445 VPBlockUtils::connectBlocks(PredVPBB, Succ);
446 }
447 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
448 }
449 return !WorkList.empty();
450}
451
452void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
453 // Convert masked VPReplicateRecipes to if-then region blocks.
454 addReplicateRegions(Plan);
455
456 bool ShouldSimplify = true;
457 while (ShouldSimplify) {
458 ShouldSimplify = sinkScalarOperands(Plan);
459 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
460 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
461 }
462}
463
464/// Remove redundant casts of inductions.
465///
466/// Such redundant casts are casts of induction variables that can be ignored,
467/// because we already proved that the casted phi is equal to the uncasted phi
468/// in the vectorized loop. There is no need to vectorize the cast - the same
469/// value can be used for both the phi and casts in the vector loop.
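/// For example, if the descriptor of an IV %iv records a redundant cast chain
/// ending in %ext = sext i32 %iv to i64, users of %ext are rewritten to use
/// the widened IV directly; the now-dead cast recipes are cleaned up later.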
470static void removeRedundantInductionCasts(VPlan &Plan) {
471 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
472 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
473 if (!IV || IV->getTruncInst())
474 continue;
475
476 // A sequence of IR Casts has potentially been recorded for IV, which
477 // *must be bypassed* when the IV is vectorized, because the vectorized IV
478 // will produce the desired casted value. This sequence forms a def-use
479 // chain and is provided in reverse order, ending with the cast that uses
480 // the IV phi. Search for the recipe of the last cast in the chain and
481 // replace it with the original IV. Note that only the final cast is
482 // expected to have users outside the cast-chain and the dead casts left
483 // over will be cleaned up later.
484 auto &Casts = IV->getInductionDescriptor().getCastInsts();
485 VPValue *FindMyCast = IV;
486 for (Instruction *IRCast : reverse(Casts)) {
487 VPSingleDefRecipe *FoundUserCast = nullptr;
488 for (auto *U : FindMyCast->users()) {
489 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
490 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
491 FoundUserCast = UserCast;
492 break;
493 }
494 }
495 FindMyCast = FoundUserCast;
496 }
497 FindMyCast->replaceAllUsesWith(IV);
498 }
499}
500
501/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
502/// recipe, if it exists.
503static void removeRedundantCanonicalIVs(VPlan &Plan) {
504 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
505 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
506 for (VPUser *U : CanonicalIV->users()) {
507 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
508 if (WidenNewIV)
509 break;
510 }
511
512 if (!WidenNewIV)
513 return;
514
515 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
516 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
517 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
518
519 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
520 continue;
521
522 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
523 // everything WidenNewIV's users need. That is, WidenOriginalIV will
524 // generate a vector phi or all users of WidenNewIV demand the first lane
525 // only.
526 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
527 vputils::onlyFirstLaneUsed(WidenNewIV)) {
528 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
529 WidenNewIV->eraseFromParent();
530 return;
531 }
532 }
533}
534
535/// Returns true if \p R is dead and can be removed.
536static bool isDeadRecipe(VPRecipeBase &R) {
537 // Do remove conditional assume instructions as their conditions may be
538 // flattened.
539 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
540 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
542 if (IsConditionalAssume)
543 return true;
544
545 if (R.mayHaveSideEffects())
546 return false;
547
548 // Recipe is dead if no user keeps the recipe alive.
549 return all_of(R.definedValues(),
550 [](VPValue *V) { return V->getNumUsers() == 0; });
551}
552
553void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
554 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
555 vp_post_order_deep(Plan.getEntry()))) {
556 // The recipes in the block are processed in reverse order, to catch chains
557 // of dead recipes.
558 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
559 if (isDeadRecipe(R)) {
560 R.eraseFromParent();
561 continue;
562 }
563
564 // Check if R is a dead VPPhi <-> update cycle and remove it.
565 auto *PhiR = dyn_cast<VPPhi>(&R);
566 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
567 continue;
568 VPValue *Incoming = PhiR->getOperand(1);
569 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
570 Incoming->getNumUsers() != 1)
571 continue;
572 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
573 PhiR->eraseFromParent();
574 Incoming->getDefiningRecipe()->eraseFromParent();
575 }
576 }
577}
578
579static VPScalarIVStepsRecipe *createScalarIVSteps(
580 VPlan &Plan, InductionDescriptor::InductionKind Kind,
581 Instruction::BinaryOps InductionOpcode,
582 FPMathOperator *FPBinOp, Instruction *TruncI,
583 VPValue *StartV, VPValue *Step, DebugLoc DL,
584 VPBuilder &Builder) {
586 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
587 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
588 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
589
590 // Truncate base induction if needed.
591 VPTypeAnalysis TypeInfo(Plan);
592 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
593 if (TruncI) {
594 Type *TruncTy = TruncI->getType();
595 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
596 "Not truncating.");
597 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
598 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
599 ResultTy = TruncTy;
600 }
601
602 // Truncate step if needed.
603 Type *StepTy = TypeInfo.inferScalarType(Step);
604 if (ResultTy != StepTy) {
605 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
606 "Not truncating.");
607 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
608 auto *VecPreheader =
610 VPBuilder::InsertPointGuard Guard(Builder);
611 Builder.setInsertPoint(VecPreheader);
612 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
613 }
614 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
615 &Plan.getVF(), DL);
616}
617
618static SetVector<VPUser *> collectUsersRecursively(VPValue *V) {
619 SetVector<VPUser *> Users(llvm::from_range, V->users());
620 for (unsigned I = 0; I != Users.size(); ++I) {
621 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
622 if (isa<VPHeaderPHIRecipe>(Cur))
623 continue;
624 for (VPValue *V : Cur->definedValues())
625 Users.insert_range(V->users());
626 }
627 return Users.takeVector();
628}
629
630/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
631/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
632/// VPWidenPointerInductionRecipe will generate vectors only. If some users
633/// require vectors while others require scalars, the scalar uses need to extract
634/// the scalars from the generated vectors (Note that this is different to how
635/// int/fp inductions are handled). Legalize extract-from-ends using uniform
636/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
637/// the correct end value is available. Also optimize
638/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
639/// providing them scalar steps built on the canonical scalar IV and update the
640/// original IV's users. This is an optional optimization to reduce the need for
641/// vector extracts.
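/// For example, a VPWidenPointerInductionRecipe with start %start and stride
/// %step whose users only need scalar addresses is rewritten roughly as
///   %steps    = SCALAR-STEPS 0, %step
///   %next.gep = ptradd %start, %steps
/// so that no vector of pointers has to be materialized.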
642static void legalizeAndOptimizeInductions(VPlan &Plan) {
643 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
644 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
645 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
646 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
647 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
648 if (!PhiR)
649 continue;
650
651 // Try to narrow wide and replicating recipes to uniform recipes, based on
652 // VPlan analysis.
653 // TODO: Apply to all recipes in the future, to replace legacy uniformity
654 // analysis.
655 auto Users = collectUsersRecursively(PhiR);
656 for (VPUser *U : reverse(Users)) {
657 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
658 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
659 // Skip recipes that shouldn't be narrowed.
660 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
661 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
662 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
663 continue;
664
665 // Skip recipes that may have other lanes than their first used.
667 continue;
668
669 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
670 Def->operands(), /*IsUniform*/ true);
671 Clone->insertAfter(Def);
672 Def->replaceAllUsesWith(Clone);
673 }
674
675 // Replace wide pointer inductions which have only their scalars used by
676 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
677 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
678 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
679 continue;
680
681 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
682 VPValue *StartV =
683 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
684 VPValue *StepV = PtrIV->getOperand(1);
685 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
686 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
687 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
688
689 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
690 PtrIV->getDebugLoc(), "next.gep");
691
692 PtrIV->replaceAllUsesWith(PtrAdd);
693 continue;
694 }
695
696 // Replace widened induction with scalar steps for users that only use
697 // scalars.
698 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
699 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
700 return U->usesScalars(WideIV);
701 }))
702 continue;
703
704 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
705 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
706 Plan, ID.getKind(), ID.getInductionOpcode(),
707 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
708 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
709 WideIV->getDebugLoc(), Builder);
710
711 // Update scalar users of IV to use Step instead.
712 if (!HasOnlyVectorVFs)
713 WideIV->replaceAllUsesWith(Steps);
714 else
715 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
716 return U.usesScalars(WideIV);
717 });
718 }
719}
720
721/// Check if \p VPV is an untruncated wide induction, either before or after the
722/// increment. If so return the header IV (before the increment), otherwise
723/// return null.
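/// For example, for
///   %iv      = WIDEN-INDUCTION %start, %step
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are optimizable, and the recipe defining %iv is
/// returned in either case.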
724static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
725 ScalarEvolution &SE) {
726 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
727 if (WideIV) {
728 // VPV itself is a wide induction, separately compute the end value for exit
729 // users if it is not a truncated IV.
730 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
731 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
732 }
733
734 // Check if VPV is an optimizable induction increment.
735 VPRecipeBase *Def = VPV->getDefiningRecipe();
736 if (!Def || Def->getNumOperands() != 2)
737 return nullptr;
738 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
739 if (!WideIV)
740 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
741 if (!WideIV)
742 return nullptr;
743
744 auto IsWideIVInc = [&]() {
745 auto &ID = WideIV->getInductionDescriptor();
746
747 // Check if VPV increments the induction by the induction step.
748 VPValue *IVStep = WideIV->getStepValue();
749 switch (ID.getInductionOpcode()) {
750 case Instruction::Add:
751 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
752 case Instruction::FAdd:
753 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
754 m_Specific(IVStep)));
755 case Instruction::FSub:
756 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
757 m_Specific(IVStep)));
758 case Instruction::Sub: {
759 // IVStep will be the negated step of the subtraction. Check if Step == -1
760 // * IVStep.
761 VPValue *Step;
762 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
763 return false;
764 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
765 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
766 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
767 !isa<SCEVCouldNotCompute>(StepSCEV) &&
768 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
769 }
770 default:
771 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
772 match(VPV, m_GetElementPtr(m_Specific(WideIV),
773 m_Specific(WideIV->getStepValue())));
774 }
775 llvm_unreachable("should have been covered by switch above");
776 };
777 return IsWideIVInc() ? WideIV : nullptr;
778}
779
780/// Attempts to optimize the induction variable exit values for users in the
781/// early exit block.
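/// The exit value is computed from the canonical IV and the early-exit mask,
/// e.g. for a pre-incremented IV the escape value becomes
///   %end = CanonicalIV + FirstActiveLane(%mask)
/// (plus one if the user reads the incremented IV), optionally fed through a
/// derived IV for non-canonical inductions.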
782static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
783 VPTypeAnalysis &TypeInfo,
784 VPBlockBase *PredVPBB,
785 VPValue *Op,
786 ScalarEvolution &SE) {
787 VPValue *Incoming, *Mask;
790 m_VPValue(Mask)),
792 return nullptr;
793
794 auto *WideIV = getOptimizableIVOf(Incoming, SE);
795 if (!WideIV)
796 return nullptr;
797
798 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
799 if (WideIntOrFp && WideIntOrFp->getTruncInst())
800 return nullptr;
801
802 // Calculate the final index.
803 VPValue *EndValue = Plan.getCanonicalIV();
804 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
805 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
806
807 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
808 VPValue *FirstActiveLane =
809 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
810 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
811 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
812 FirstActiveLaneType, DL);
813 EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
814
815 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
816 // Incoming differs from WideIV the exit uses the incremented value and we
817 // need to add the step.
818 if (Incoming != WideIV) {
819 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
820 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
821 }
822
823 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
824 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
825 VPValue *Start = WideIV->getStartValue();
826 VPValue *Step = WideIV->getStepValue();
827 EndValue = B.createDerivedIV(
828 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
829 Start, EndValue, Step);
830 }
831
832 return EndValue;
833}
834
835/// Attempts to optimize the induction variable exit values for users in the
836/// exit block coming from the latch in the original scalar loop.
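/// The end values of the wide inductions are already available in EndValues;
/// e.g. an exit user of the incremented IV can use the end value directly,
/// while a user of the pre-incremented IV gets %end - %step.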
837static VPValue *optimizeLatchExitInductionUser(
838 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
839 DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
840 VPValue *Incoming;
841 if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
842 return nullptr;
843
844 auto *WideIV = getOptimizableIVOf(Incoming, SE);
845 if (!WideIV)
846 return nullptr;
847
848 VPValue *EndValue = EndValues.lookup(WideIV);
849 assert(EndValue && "end value must have been pre-computed");
850
851 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
852 // Incoming differs from WideIV the exit uses the incremented value and we
853 // don't need to subtract the step.
854 if (Incoming != WideIV)
855 return EndValue;
856
857 // Otherwise, subtract the step from the EndValue.
858 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
859 VPValue *Step = WideIV->getStepValue();
860 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
861 if (ScalarTy->isIntegerTy())
862 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
863 if (ScalarTy->isPointerTy()) {
864 Type *StepTy = TypeInfo.inferScalarType(Step);
865 auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
866 return B.createPtrAdd(EndValue,
867 B.createNaryOp(Instruction::Sub, {Zero, Step}),
868 DebugLoc::getUnknown(), "ind.escape");
869 }
870 if (ScalarTy->isFloatingPointTy()) {
871 const auto &ID = WideIV->getInductionDescriptor();
872 return B.createNaryOp(
873 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
874 ? Instruction::FSub
875 : Instruction::FAdd,
876 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
877 }
878 llvm_unreachable("all possible induction types must be handled");
879 return nullptr;
880}
881
882void VPlanTransforms::optimizeInductionExitUsers(
883 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
884 ScalarEvolution &SE) {
885 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
886 VPTypeAnalysis TypeInfo(Plan);
887 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
888 for (VPRecipeBase &R : ExitVPBB->phis()) {
889 auto *ExitIRI = cast<VPIRPhi>(&R);
890
891 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
892 VPValue *Escape = nullptr;
893 if (PredVPBB == MiddleVPBB)
894 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
895 ExitIRI->getOperand(Idx),
896 EndValues, SE);
897 else
898 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
899 ExitIRI->getOperand(Idx), SE);
900 if (Escape)
901 ExitIRI->setOperand(Idx, Escape);
902 }
903 }
904 }
905}
906
907/// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
908/// them with already existing recipes expanding the same SCEV expression.
909static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
910 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
911
912 for (VPRecipeBase &R :
913 make_early_inc_range(*Plan.getEntry())) {
914 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
915 if (!ExpR)
916 continue;
917
918 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
919 if (Inserted)
920 continue;
921 ExpR->replaceAllUsesWith(V->second);
922 ExpR->eraseFromParent();
923 }
924}
925
926static void recursivelyDeleteDeadRecipes(VPValue *V) {
927 SmallVector<VPValue *> WorkList;
928 SmallPtrSet<VPValue *, 8> Seen;
929 WorkList.push_back(V);
930
931 while (!WorkList.empty()) {
932 VPValue *Cur = WorkList.pop_back_val();
933 if (!Seen.insert(Cur).second)
934 continue;
935 VPRecipeBase *R = Cur->getDefiningRecipe();
936 if (!R)
937 continue;
938 if (!isDeadRecipe(*R))
939 continue;
940 WorkList.append(R->op_begin(), R->op_end());
941 R->eraseFromParent();
942 }
943}
944
945/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
946/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
947/// foldable live-ins.
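/// For example, an add of two constant live-ins (e.g. 2 and 3) folds to the
/// live-in constant 5, and a cast of a constant live-in folds to the casted
/// constant.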
948static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
949 ArrayRef<VPValue *> Operands,
950 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
951 SmallVector<Value *, 4> Ops;
952 for (VPValue *Op : Operands) {
953 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
954 return nullptr;
955 Ops.push_back(Op->getLiveInIRValue());
956 }
957
958 InstSimplifyFolder Folder(DL);
959 if (Instruction::isBinaryOp(Opcode))
960 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
961 Ops[1]);
962 if (Instruction::isCast(Opcode))
963 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
964 TypeInfo.inferScalarType(R.getVPSingleValue()));
965 switch (Opcode) {
966 case VPInstruction::LogicalAnd:
967 return Folder.FoldSelect(Ops[0], Ops[1],
968 ConstantInt::getNullValue(Ops[1]->getType()));
969 case VPInstruction::Not:
970 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
971 Constant::getAllOnesValue(Ops[0]->getType()));
972 case Instruction::Select:
973 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
974 case Instruction::ICmp:
975 case Instruction::FCmp:
976 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
977 Ops[1]);
978 case Instruction::GetElementPtr: {
979 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
980 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
981 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
982 RFlags.getGEPNoWrapFlags());
983 }
984 case VPInstruction::PtrAdd:
985 case VPInstruction::WidePtrAdd:
986 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
987 Ops[1],
988 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
989 // An extract of a live-in is an extract of a broadcast, so return the
990 // broadcasted element.
991 case Instruction::ExtractElement:
992 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
993 return Ops[0];
994 }
995 return nullptr;
996}
997
998/// Try to simplify recipe \p R.
999static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
1000 VPlan *Plan = R.getParent()->getPlan();
1001
1002 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1003 if (!Def)
1004 return;
1005
1006 // Simplification of live-in IR values for SingleDef recipes using
1007 // InstSimplifyFolder.
1008 if (TypeSwitch<VPRecipeBase *, bool>(&R)
1009 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1010 VPReplicateRecipe>([&](auto *I) {
1011 const DataLayout &DL =
1012 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1013 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1014 TypeInfo);
1015 if (V)
1016 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1017 return V;
1018 })
1019 .Default([](auto *) { return false; }))
1020 return;
1021
1022 // Fold PredPHI LiveIn -> LiveIn.
1023 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1024 VPValue *Op = PredPHI->getOperand(0);
1025 if (Op->isLiveIn())
1026 PredPHI->replaceAllUsesWith(Op);
1027 }
1028
1029 VPValue *A;
1030 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1031 Type *TruncTy = TypeInfo.inferScalarType(Def);
1032 Type *ATy = TypeInfo.inferScalarType(A);
1033 if (TruncTy == ATy) {
1034 Def->replaceAllUsesWith(A);
1035 } else {
1036 // Don't replace a scalarizing recipe with a widened cast.
1037 if (isa<VPReplicateRecipe>(Def))
1038 return;
1039 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1040
1041 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1042 ? Instruction::SExt
1043 : Instruction::ZExt;
1044 auto *VPC =
1045 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1046 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1047 // UnderlyingExt has distinct return type, used to retain legacy cost.
1048 VPC->setUnderlyingValue(UnderlyingExt);
1049 }
1050 VPC->insertBefore(&R);
1051 Def->replaceAllUsesWith(VPC);
1052 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1053 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1054 VPC->insertBefore(&R);
1055 Def->replaceAllUsesWith(VPC);
1056 }
1057 }
1058#ifndef NDEBUG
1059 // Verify that the cached type info for both A and its users is still
1060 // accurate by comparing it to freshly computed types.
1061 VPTypeAnalysis TypeInfo2(*Plan);
1062 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1063 for (VPUser *U : A->users()) {
1064 auto *R = cast<VPRecipeBase>(U);
1065 for (VPValue *VPV : R->definedValues())
1066 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1067 }
1068#endif
1069 }
1070
1071 // Simplify (X && Y) || (X && !Y) -> X.
1072 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1073 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1074 // recipes to be visited during simplification.
1075 VPValue *X, *Y, *Z;
1076 if (match(Def,
1079 Def->replaceAllUsesWith(X);
1080 Def->eraseFromParent();
1081 return;
1082 }
1083
1084 // x | 1 -> 1
1085 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1086 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1087
1088 // x | 0 -> x
1089 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1090 return Def->replaceAllUsesWith(X);
1091
1092 // x & 0 -> 0
1093 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1094 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1095
1096 // x && false -> false
1097 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1098 return Def->replaceAllUsesWith(Def->getOperand(1));
1099
1100 // (x && y) || (x && z) -> x && (y || z)
1101 VPBuilder Builder(Def);
1104 // Simplify only if one of the operands has one use to avoid creating an
1105 // extra recipe.
1106 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1107 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1108 return Def->replaceAllUsesWith(
1109 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1110
1111 // x && !x -> 0
1112 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1113 return Def->replaceAllUsesWith(Plan->getFalse());
1114
1115 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1116 return Def->replaceAllUsesWith(X);
1117
1118 // select !c, x, y -> select c, y, x
1119 VPValue *C;
1120 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1121 Def->setOperand(0, C);
1122 Def->setOperand(1, Y);
1123 Def->setOperand(2, X);
1124 return;
1125 }
1126
1127 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1128 // tail folding it is likely that x is a header mask and can be simplified
1129 // further.
1131 m_VPValue(Z))) &&
1132 X->hasMoreThanOneUniqueUser())
1133 return Def->replaceAllUsesWith(
1134 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1135
1136 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1137 return Def->replaceAllUsesWith(A);
1138
1139 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1140 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1141 : R.getOperand(0));
1142
1143 if (match(Def, m_Not(m_VPValue(A)))) {
1144 if (match(A, m_Not(m_VPValue(A))))
1145 return Def->replaceAllUsesWith(A);
1146
1147 // Try to fold Not into compares by adjusting the predicate in-place.
1148 CmpPredicate Pred;
1149 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1150 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1151 if (all_of(Cmp->users(),
1153 m_Not(m_Specific(Cmp)),
1154 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1155 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1156 for (VPUser *U : to_vector(Cmp->users())) {
1157 auto *R = cast<VPSingleDefRecipe>(U);
1158 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1159 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1160 R->setOperand(1, Y);
1161 R->setOperand(2, X);
1162 } else {
1163 // not (cmp pred) -> cmp inv_pred
1164 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1165 R->replaceAllUsesWith(Cmp);
1166 }
1167 }
1168 // If Cmp doesn't have a debug location, use the one from the negation,
1169 // to preserve the location.
1170 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1171 Cmp->setDebugLoc(R.getDebugLoc());
1172 }
1173 }
1174 }
1175
1176 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1177 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1178 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1179 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1180 TypeInfo.inferScalarType(Def))
1181 return Def->replaceAllUsesWith(Def->getOperand(1));
1182
1184 m_One()))) {
1185 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1186 if (TypeInfo.inferScalarType(X) != WideStepTy)
1187 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1188 Def->replaceAllUsesWith(X);
1189 return;
1190 }
1191
1192 // For i1 vp.merges produced by AnyOf reductions:
1193 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1195 m_VPValue(X), m_VPValue())) &&
1197 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1198 Def->setOperand(1, Def->getOperand(0));
1199 Def->setOperand(0, Y);
1200 return;
1201 }
1202
1203 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1204 if (Phi->getOperand(0) == Phi->getOperand(1))
1205 Def->replaceAllUsesWith(Phi->getOperand(0));
1206 return;
1207 }
1208
1209 // Look through ExtractLastElement (BuildVector ....).
1210 if (match(Def, m_ExtractLastElement(m_BuildVector()))) {
1211 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1212 Def->replaceAllUsesWith(
1213 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1214 return;
1215 }
1216
1217 // Look through ExtractPenultimateElement (BuildVector ....).
1218 if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1219 m_BuildVector()))) {
1220 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1221 Def->replaceAllUsesWith(
1222 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1223 return;
1224 }
1225
1226 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1227 if (Phi->getNumOperands() == 1)
1228 Phi->replaceAllUsesWith(Phi->getOperand(0));
1229 return;
1230 }
1231
1232 // Some simplifications can only be applied after unrolling. Perform them
1233 // below.
1234 if (!Plan->isUnrolled())
1235 return;
1236
1237 // VPVectorPointer for part 0 can be replaced by its start pointer.
1238 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1239 if (VecPtr->isFirstPart()) {
1240 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1241 return;
1242 }
1243 }
1244
1245 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1246 // the first lane is demanded.
1247 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1248 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1249 Steps->replaceAllUsesWith(Steps->getOperand(0));
1250 return;
1251 }
1252 }
1253 // Simplify redundant ReductionStartVector recipes after unrolling.
1254 VPValue *StartV;
1255 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1256 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1257 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1258 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1259 return PhiR && PhiR->isInLoop();
1260 });
1261 return;
1262 }
1263
1265 Def->replaceAllUsesWith(A);
1266 return;
1267 }
1268
1269 if (match(Def,
1273 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1274 all_of(A->users(),
1275 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1276 return Def->replaceAllUsesWith(A);
1277 }
1278}
1279
1280void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1281 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1282 Plan.getEntry());
1283 VPTypeAnalysis TypeInfo(Plan);
1284 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1285 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1286 simplifyRecipe(R, TypeInfo);
1287 }
1288 }
1289}
1290
1291void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1292 if (Plan.hasScalarVFOnly())
1293 return;
1294
1295 // Try to narrow wide and replicating recipes to single scalar recipes,
1296 // based on VPlan analysis. Only process blocks in the loop region for now,
1297 // without traversing into nested regions, as recipes in replicate regions
1298 // cannot be converted yet.
1299 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1300 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1301 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1303 continue;
1304 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1305 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1306 continue;
1307
1308 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1309 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1310 vputils::isSingleScalar(RepR->getOperand(1))) {
1311 auto *Clone = new VPReplicateRecipe(
1312 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1313 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1314 Clone->insertBefore(RepOrWidenR);
1315 auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
1316 {Clone->getOperand(0)});
1317 Ext->insertBefore(Clone);
1318 Clone->setOperand(0, Ext);
1319 RepR->eraseFromParent();
1320 continue;
1321 }
1322
1323 // Skip recipes that aren't single scalars or don't have only their
1324 // scalar results used. In the latter case, we would introduce extra
1325 // broadcasts.
1326 if (!vputils::isSingleScalar(RepOrWidenR) ||
1327 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1328 return U->usesScalars(RepOrWidenR) ||
1329 match(cast<VPRecipeBase>(U),
1330 m_ExtractLastElement(m_VPValue()));
1331 }))
1332 continue;
1333
1334 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1335 RepOrWidenR->operands(),
1336 true /*IsSingleScalar*/);
1337 Clone->insertBefore(RepOrWidenR);
1338 RepOrWidenR->replaceAllUsesWith(Clone);
1339 }
1340 }
1341}
1342
1343/// Try to see if all of \p Blend's masks share a common edge mask that is
1344/// logically and'ed into them, and if so remove it from the masks.
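/// For example, with masks (%m && %a), (%m && %b) and (%m && %c), the common
/// %m is stripped and the blend uses %a, %b and %c directly.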
1346 if (Blend->isNormalized())
1347 return;
1348 VPValue *CommonEdgeMask;
1349 if (!match(Blend->getMask(0),
1350 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1351 return;
1352 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1353 if (!match(Blend->getMask(I),
1354 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1355 return;
1356 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1357 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1358}
1359
1360/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1361/// to make sure the masks are simplified.
1362static void simplifyBlends(VPlan &Plan) {
1363 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1364 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1365 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1366 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1367 if (!Blend)
1368 continue;
1369
1370 removeCommonBlendMask(Blend);
1371
1372 // Try to remove redundant blend recipes.
1373 SmallPtrSet<VPValue *, 4> UniqueValues;
1374 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1375 UniqueValues.insert(Blend->getIncomingValue(0));
1376 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1377 if (!match(Blend->getMask(I), m_False()))
1378 UniqueValues.insert(Blend->getIncomingValue(I));
1379
1380 if (UniqueValues.size() == 1) {
1381 Blend->replaceAllUsesWith(*UniqueValues.begin());
1382 Blend->eraseFromParent();
1383 continue;
1384 }
1385
1386 if (Blend->isNormalized())
1387 continue;
1388
1389 // Normalize the blend so its first incoming value is used as the initial
1390 // value with the others blended into it.
1391
1392 unsigned StartIndex = 0;
1393 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1394 // If a value's mask is used only by the blend then it can be deadcoded.
1395 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1396 // that's used by multiple blends where it can be removed from them all.
1397 VPValue *Mask = Blend->getMask(I);
1398 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1399 StartIndex = I;
1400 break;
1401 }
1402 }
1403
1404 SmallVector<VPValue *, 4> OperandsWithMask;
1405 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1406
1407 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1408 if (I == StartIndex)
1409 continue;
1410 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1411 OperandsWithMask.push_back(Blend->getMask(I));
1412 }
1413
1414 auto *NewBlend =
1415 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1416 OperandsWithMask, Blend->getDebugLoc());
1417 NewBlend->insertBefore(&R);
1418
1419 VPValue *DeadMask = Blend->getMask(StartIndex);
1420 Blend->replaceAllUsesWith(NewBlend);
1421 Blend->eraseFromParent();
1422 recursivelyDeleteDeadRecipes(DeadMask);
1423
1424 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1425 VPValue *NewMask;
1426 if (NewBlend->getNumOperands() == 3 &&
1427 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1428 VPValue *Inc0 = NewBlend->getOperand(0);
1429 VPValue *Inc1 = NewBlend->getOperand(1);
1430 VPValue *OldMask = NewBlend->getOperand(2);
1431 NewBlend->setOperand(0, Inc1);
1432 NewBlend->setOperand(1, Inc0);
1433 NewBlend->setOperand(2, NewMask);
1434 if (OldMask->getNumUsers() == 0)
1435 cast<VPInstruction>(OldMask)->eraseFromParent();
1436 }
1437 }
1438 }
1439}
1440
1441/// Optimize the width of vector induction variables in \p Plan based on a known
1442/// constant Trip Count, \p BestVF and \p BestUF.
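/// For example, with a trip count of 100, VF = 4 and UF = 2, the IV only needs
/// to count up to 104 (100 rounded up to a multiple of VF * UF = 8), so an i8
/// induction suffices instead of, say, an i64 one; the backedge-taken count is
/// truncated accordingly.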
1443static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1444 ElementCount BestVF,
1445 unsigned BestUF) {
1446 // Only proceed if we have not completely removed the vector region.
1447 if (!Plan.getVectorLoopRegion())
1448 return false;
1449
1450 if (!Plan.getTripCount()->isLiveIn())
1451 return false;
1452 auto *TC = dyn_cast_if_present<ConstantInt>(
1453 Plan.getTripCount()->getUnderlyingValue());
1454 if (!TC || !BestVF.isFixed())
1455 return false;
1456
1457 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1458 // and UF. Returns at least 8.
1459 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1460 APInt AlignedTC =
1463 APInt MaxVal = AlignedTC - 1;
1464 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1465 };
1466 unsigned NewBitWidth =
1467 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1468
1469 LLVMContext &Ctx = Plan.getContext();
1470 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1471
1472 bool MadeChange = false;
1473
1474 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1475 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1476 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1477
1478 // Currently only handle canonical IVs as it is trivial to replace the start
1479 // and stop values, and we currently only perform the optimization when the
1480 // IV has a single use.
1481 if (!WideIV || !WideIV->isCanonical() ||
1482 WideIV->hasMoreThanOneUniqueUser() ||
1483 NewIVTy == WideIV->getScalarType())
1484 continue;
1485
1486 // Currently only handle cases where the single user is a header-mask
1487 // comparison with the backedge-taken-count.
1488 if (!match(*WideIV->user_begin(),
1489 m_ICmp(m_Specific(WideIV),
1492 continue;
1493
1494 // Update IV operands and comparison bound to use new narrower type.
1495 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1496 WideIV->setStartValue(NewStart);
1497 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1498 WideIV->setStepValue(NewStep);
1499
1500 auto *NewBTC = new VPWidenCastRecipe(
1501 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1502 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1503 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1504 Cmp->setOperand(1, NewBTC);
1505
1506 MadeChange = true;
1507 }
1508
1509 return MadeChange;
1510}
1511
1512/// Return true if \p Cond is known to be true for given \p BestVF and \p
1513/// BestUF.
1514static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1515 ElementCount BestVF, unsigned BestUF,
1516 ScalarEvolution &SE) {
1517 if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))
1518 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1519 &SE](VPValue *C) {
1520 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1521 });
1522
1523 auto *CanIV = Plan.getCanonicalIV();
1524 if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,
1525 m_Specific(CanIV->getBackedgeValue()),
1526 m_Specific(&Plan.getVectorTripCount()))))
1527 return false;
1528
1529 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1530 // count is not conveniently available as SCEV so far, so we compare directly
1531 // against the original trip count. This is stricter than necessary, as we
1532 // will only return true if the trip count == vector trip count.
1533 const SCEV *VectorTripCount =
1534 vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
1535 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1536 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1537 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1538 "Trip count SCEV must be computable");
1539 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1540 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1541 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1542}
1543
1544/// Try to replace multiple active lane masks used for control flow with
1545/// a single, wide active lane mask instruction followed by multiple
1546/// extract subvector intrinsics. This applies to the active lane mask
1547/// instructions both in the loop and in the preheader.
1548/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1549/// new extracts from the first active lane mask, which has its last
1550/// operand (multiplier) set to UF.
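/// For example, with VF = vscale x 4 and UF = 2, the two narrow lane masks are
/// replaced by a single get.active.lane.mask producing vscale x 8 lanes, from
/// which the per-part masks are taken with llvm.vector.extract at element
/// offsets 0 and 4 x vscale.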
1551static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1552 unsigned UF) {
1553 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1554 return false;
1555
1556 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1557 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1558 auto *Term = &ExitingVPBB->back();
1559
1560 using namespace llvm::VPlanPatternMatch;
1561 if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1562 m_VPValue(), m_VPValue(), m_VPValue())))))
1563 return false;
1564
1565 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1566 LLVMContext &Ctx = Plan.getContext();
1567
1568 auto ExtractFromALM = [&](VPInstruction *ALM,
1569 SmallVectorImpl<VPValue *> &Extracts) {
1570 DebugLoc DL = ALM->getDebugLoc();
1571 for (unsigned Part = 0; Part < UF; ++Part) {
1572 SmallVector<VPValue *> Ops;
1573 Ops.append({ALM, Plan.getOrAddLiveIn(
1574 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1575 VF.getKnownMinValue() * Part))});
1576 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1578 Extracts[Part] = Ext;
1579 Ext->insertAfter(ALM);
1580 }
1581 };
1582
1583 // Create a list of each active lane mask phi, ordered by unroll part.
1584 SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1585 for (VPRecipeBase &R : Header->phis()) {
1586 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1587 if (!Phi)
1588 continue;
1589 VPValue *Index = nullptr;
1590 match(Phi->getBackedgeValue(),
1591 m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
1592 assert(Index && "Expected index from ActiveLaneMask instruction");
1593
1594 uint64_t Part;
1595 if (match(Index,
1596 m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
1597 m_VPValue(), m_ConstantInt(Part))))
1598 Phis[Part] = Phi;
1599 else
1600 // Anything other than a CanonicalIVIncrementForPart is part 0
1601 Phis[0] = Phi;
1602 }
1603
1604 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1605 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1606
1607 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1608 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1609
1610 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1611 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1612 "Expected incoming values of Phi to be ActiveLaneMasks");
1613
1614 // When using wide lane masks, the return type of the get.active.lane.mask
1615 // intrinsic is VF x UF (last operand).
1616 VPValue *ALMMultiplier =
1617 Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
1618 EntryALM->setOperand(2, ALMMultiplier);
1619 LoopALM->setOperand(2, ALMMultiplier);
1620
1621 // Create UF x extract vectors and insert into preheader.
1622 SmallVector<VPValue *> EntryExtracts(UF);
1623 ExtractFromALM(EntryALM, EntryExtracts);
1624
1625 // Create UF x extract vectors and insert before the loop compare & branch,
1626 // updating the compare to use the first extract.
1627 SmallVector<VPValue *> LoopExtracts(UF);
1628 ExtractFromALM(LoopALM, LoopExtracts);
1629 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1630 Not->setOperand(0, LoopExtracts[0]);
1631
1632 // Update the incoming values of active lane mask phis.
1633 for (unsigned Part = 0; Part < UF; ++Part) {
1634 Phis[Part]->setStartValue(EntryExtracts[Part]);
1635 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1636 }
1637
1638 return true;
1639}
1640
1641/// Try to simplify the branch condition of \p Plan. This may restrict the
1642/// resulting plan to \p BestVF and \p BestUF.
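/// A minimal sketch of the effect when the latch terminator can be proven to
/// exit after the first iteration (illustrative only):
///
///   Before: EMIT branch-on-count %index.next, %vector.trip.count
///   After:  either the vector loop region is dissolved and its blocks are
///           connected directly between preheader and exit, or the latch
///           terminator becomes EMIT branch-on-cond true.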
1643static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1644 unsigned BestUF,
1645 PredicatedScalarEvolution &PSE) {
1646 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1647 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1648 auto *Term = &ExitingVPBB->back();
1649 VPValue *Cond;
1650 ScalarEvolution &SE = *PSE.getSE();
1651 if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1653 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1654 // Try to simplify the branch condition if TC <= VF * UF when the latch
1655 // terminator is BranchOnCount or BranchOnCond where the input is
1656 // Not(ActiveLaneMask).
1657 const SCEV *TripCount =
1659 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1660 "Trip count SCEV must be computable");
1661 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1662 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1663 if (TripCount->isZero() ||
1664 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1665 return false;
1666 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1667 // For BranchOnCond, check if we can prove the condition to be true using VF
1668 // and UF.
1669 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1670 return false;
1671 } else {
1672 return false;
1673 }
1674
1675 // The vector loop region only executes once. If possible, completely remove
1676 // the region; otherwise, replace the terminator controlling the latch with
1677 // (BranchOnCond true).
1678 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1679 // support for other non-canonical widen induction recipes (e.g.,
1680 // VPWidenPointerInductionRecipe).
1681 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1682 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1683 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1684 return R->isCanonical();
1685 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1686 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1687 })) {
1688 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1689 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1690 VPBuilder Builder(Plan.getVectorPreheader());
1691 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1692 R->getScalarType());
1693 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1694 HeaderR.eraseFromParent();
1695 continue;
1696 }
1697 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1698 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1699 HeaderR.eraseFromParent();
1700 }
1701
1702 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1703 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1704 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1705 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1706
1707 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1708 B->setParent(nullptr);
1709
1710 VPBlockUtils::connectBlocks(Preheader, Header);
1711 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1713 } else {
1714 // The vector region contains header phis for which we cannot remove the
1715 // loop region yet.
1716 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1717 Term->getDebugLoc());
1718 ExitingVPBB->appendRecipe(BOC);
1719 }
1720
1721 Term->eraseFromParent();
1722
1723 return true;
1724}
1725
1727 unsigned BestUF,
1729 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1730 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1731
1732 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1733 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1734 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1735
1736 if (MadeChange) {
1737 Plan.setVF(BestVF);
1738 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1739 }
1740 // TODO: Further simplifications are possible
1741 // 1. Replace inductions with constants.
1742 // 2. Replace vector loop region with VPBasicBlock.
1743}
1744
1745/// Sink users of \p FOR after the recipe defining the previous value \p
1746/// Previous of the recurrence. \returns true if all users of \p FOR could be
1747/// re-arranged as needed or false if it is not possible.
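/// A small sketch (illustrative names only), where %for is the recurrence phi
/// and %prev defines its previous value:
///
///   Before:                     After sinking:
///     %use  = add %for, 1         %prev = load %p
///     %prev = load %p             %use  = add %for, 1
///
/// Users are sunk transitively and in dominance order.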
1748static bool
1750 VPRecipeBase *Previous,
1751 VPDominatorTree &VPDT) {
1752 // Collect recipes that need sinking.
1755 Seen.insert(Previous);
1756 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1757 // The previous value must not depend on the users of the recurrence phi; if
1758 // it does, FOR is not a fixed-order recurrence.
1759 if (SinkCandidate == Previous)
1760 return false;
1761
1762 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1763 !Seen.insert(SinkCandidate).second ||
1764 VPDT.properlyDominates(Previous, SinkCandidate))
1765 return true;
1766
1767 if (SinkCandidate->mayHaveSideEffects())
1768 return false;
1769
1770 WorkList.push_back(SinkCandidate);
1771 return true;
1772 };
1773
1774 // Recursively sink users of FOR after Previous.
1775 WorkList.push_back(FOR);
1776 for (unsigned I = 0; I != WorkList.size(); ++I) {
1777 VPRecipeBase *Current = WorkList[I];
1778 assert(Current->getNumDefinedValues() == 1 &&
1779 "only recipes with a single defined value expected");
1780
1781 for (VPUser *User : Current->getVPSingleValue()->users()) {
1782 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1783 return false;
1784 }
1785 }
1786
1787 // Keep recipes to sink ordered by dominance so earlier instructions are
1788 // processed first.
1789 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1790 return VPDT.properlyDominates(A, B);
1791 });
1792
1793 for (VPRecipeBase *SinkCandidate : WorkList) {
1794 if (SinkCandidate == FOR)
1795 continue;
1796
1797 SinkCandidate->moveAfter(Previous);
1798 Previous = SinkCandidate;
1799 }
1800 return true;
1801}
1802
1803/// Try to hoist \p Previous and its operands before all users of \p FOR.
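/// A small sketch (illustrative names only), where %use cannot be sunk because
/// it has side effects, so %prev is hoisted above it instead:
///
///   Before:                     After hoisting:
///     %use  = store %for, %p      %prev = add %x, 1
///     %prev = add %x, 1           %use  = store %for, %p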
1805 VPRecipeBase *Previous,
1806 VPDominatorTree &VPDT) {
1807 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1808 return false;
1809
1810 // Collect recipes that need hoisting.
1811 SmallVector<VPRecipeBase *> HoistCandidates;
1813 VPRecipeBase *HoistPoint = nullptr;
1814 // Find the closest hoist point by looking at all users of FOR and selecting
1815 // the recipe dominating all other users.
1816 for (VPUser *U : FOR->users()) {
1817 auto *R = cast<VPRecipeBase>(U);
1818 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1819 HoistPoint = R;
1820 }
1821 assert(all_of(FOR->users(),
1822 [&VPDT, HoistPoint](VPUser *U) {
1823 auto *R = cast<VPRecipeBase>(U);
1824 return HoistPoint == R ||
1825 VPDT.properlyDominates(HoistPoint, R);
1826 }) &&
1827 "HoistPoint must dominate all users of FOR");
1828
1829 auto NeedsHoisting = [HoistPoint, &VPDT,
1830 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1831 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1832 if (!HoistCandidate)
1833 return nullptr;
1834 VPRegionBlock *EnclosingLoopRegion =
1835 HoistCandidate->getParent()->getEnclosingLoopRegion();
1836 assert((!HoistCandidate->getParent()->getParent() ||
1837 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1838 "CFG in VPlan should still be flat, without replicate regions");
1839 // Hoist candidate was already visited, no need to hoist.
1840 if (!Visited.insert(HoistCandidate).second)
1841 return nullptr;
1842
1843 // The candidate is outside the loop region or is a header phi; it already
1844 // dominates the users of FOR without hoisting.
1845 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1846 return nullptr;
1847
1848 // If we reached a recipe that dominates HoistPoint, we don't need to
1849 // hoist the recipe.
1850 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1851 return nullptr;
1852 return HoistCandidate;
1853 };
1854 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1855 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1856 // associated dependencies.
1857 return !HoistCandidate->mayHaveSideEffects();
1858 };
1859
1860 if (!NeedsHoisting(Previous->getVPSingleValue()))
1861 return true;
1862
1863 // Recursively try to hoist Previous and its operands before all users of FOR.
1864 HoistCandidates.push_back(Previous);
1865
1866 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1867 VPRecipeBase *Current = HoistCandidates[I];
1868 assert(Current->getNumDefinedValues() == 1 &&
1869 "only recipes with a single defined value expected");
1870 if (!CanHoist(Current))
1871 return false;
1872
1873 for (VPValue *Op : Current->operands()) {
1874 // If we reach FOR, it means the original Previous depends on some other
1875 // recurrence that in turn depends on FOR. If that is the case, we would
1876 // also need to hoist recipes involving the other FOR, which may break
1877 // dependencies.
1878 if (Op == FOR)
1879 return false;
1880
1881 if (auto *R = NeedsHoisting(Op))
1882 HoistCandidates.push_back(R);
1883 }
1884 }
1885
1886 // Order recipes to hoist by dominance so earlier instructions are processed
1887 // first.
1888 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1889 return VPDT.properlyDominates(A, B);
1890 });
1891
1892 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1893 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1894 HoistPoint->getIterator());
1895 }
1896
1897 return true;
1898}
1899
1901 VPBuilder &LoopBuilder) {
1902 VPDominatorTree VPDT;
1903 VPDT.recalculate(Plan);
1904
1906 for (VPRecipeBase &R :
1909 RecurrencePhis.push_back(FOR);
1910
1911 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1913 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1914 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1915 // to terminate.
1916 while (auto *PrevPhi =
1918 assert(PrevPhi->getParent() == FOR->getParent());
1919 assert(SeenPhis.insert(PrevPhi).second);
1920 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1921 }
1922
1923 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1924 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1925 return false;
1926
1927 // Introduce a recipe to combine the incoming and previous values of a
1928 // fixed-order recurrence.
1929 VPBasicBlock *InsertBlock = Previous->getParent();
1930 if (isa<VPHeaderPHIRecipe>(Previous))
1931 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1932 else
1933 LoopBuilder.setInsertPoint(InsertBlock,
1934 std::next(Previous->getIterator()));
1935
1936 auto *RecurSplice =
1938 {FOR, FOR->getBackedgeValue()});
1939
1940 FOR->replaceAllUsesWith(RecurSplice);
1941 // Set the first operand of RecurSplice to FOR again, after replacing
1942 // all users.
1943 RecurSplice->setOperand(0, FOR);
1944 }
1945 return true;
1946}
1947
1949 for (VPRecipeBase &R :
1951 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
1952 if (!PhiR)
1953 continue;
1954 RecurKind RK = PhiR->getRecurrenceKind();
1955 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
1957 continue;
1958
1959 for (VPUser *U : collectUsersRecursively(PhiR))
1960 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
1961 RecWithFlags->dropPoisonGeneratingFlags();
1962 }
1963 }
1964}
1965
1966namespace {
1967struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1968 static bool isSentinel(const VPSingleDefRecipe *Def) {
1969 return Def == getEmptyKey() || Def == getTombstoneKey();
1970 }
1971
1972 /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1973 /// Returns an optional pair, where the first element indicates whether it is
1974 /// an intrinsic ID.
1975 static std::optional<std::pair<bool, unsigned>>
1976 getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1977 return TypeSwitch<const VPSingleDefRecipe *,
1978 std::optional<std::pair<bool, unsigned>>>(R)
1981 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1982 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1983 return std::make_pair(true, I->getVectorIntrinsicID());
1984 })
1985 .Default([](auto *) { return std::nullopt; });
1986 }
1987
1988 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
1989 /// return that source element type.
1990 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
1991 // All VPInstructions that lower to GEPs must have the i8 source element
1992 // type (as they are PtrAdds), so we omit it.
1993 return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
1994 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
1995 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
1996 return GEP->getSourceElementType();
1997 return nullptr;
1998 })
1999 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2000 [](auto *I) { return I->getSourceElementType(); })
2001 .Default([](auto *) { return nullptr; });
2002 }
2003
2004 /// Returns true if recipe \p Def can be safely handled for CSE.
2005 static bool canHandle(const VPSingleDefRecipe *Def) {
2006 // We can extend the list of handled recipes in the future,
2007 // provided we account for the data embedded in them while checking for
2008 // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode,
2009 // as it is essentially a GEP with different semantics.
2010 auto C = isa<VPVectorPointerRecipe>(Def)
2011 ? std::make_pair(false, Instruction::GetElementPtr)
2012 : getOpcodeOrIntrinsicID(Def);
2013
2014 // The issue with (Insert|Extract)Value is that the index of the
2015 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2016 // VPlan.
2017 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2018 C->second == Instruction::ExtractValue)))
2019 return false;
2020
2021 // During CSE, we can only handle recipes that don't read from memory: if
2022 // they read from memory, there could be an intervening write to memory
2023 // before the next instance is CSE'd, leading to an incorrect result.
2024 return !Def->mayReadFromMemory();
2025 }
2026
2027 /// Hash the underlying data of \p Def.
2028 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2029 const VPlan *Plan = Def->getParent()->getPlan();
2030 VPTypeAnalysis TypeInfo(*Plan);
2031 hash_code Result = hash_combine(
2032 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2033 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2035 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2036 if (RFlags->hasPredicate())
2037 return hash_combine(Result, RFlags->getPredicate());
2038 return Result;
2039 }
2040
2041 /// Check equality of underlying data of \p L and \p R.
2042 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2043 if (isSentinel(L) || isSentinel(R))
2044 return L == R;
2045 if (L->getVPDefID() != R->getVPDefID() ||
2046 getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
2047 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2049 !equal(L->operands(), R->operands()))
2050 return false;
2051 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2052 if (LFlags->hasPredicate() &&
2053 LFlags->getPredicate() !=
2054 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2055 return false;
2056 const VPlan *Plan = L->getParent()->getPlan();
2057 VPTypeAnalysis TypeInfo(*Plan);
2058 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2059 }
2060};
2061} // end anonymous namespace
2062
2063/// Perform common-subexpression elimination (CSE) of VPSingleDefRecipes in
2064/// \p Plan.
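/// A minimal sketch of the effect (illustrative names only):
///
///   Before:                     After:
///     %a = add %x, %y             %a = add %x, %y
///     ...                         ...
///     %b = add %x, %y             (users of %b now use %a)
///
/// Only recipes that do not read from memory are considered, and the kept
/// recipe must dominate the one it replaces.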
2066 VPDominatorTree VPDT(Plan);
2068
2070 vp_depth_first_deep(Plan.getEntry()))) {
2071 for (VPRecipeBase &R : *VPBB) {
2072 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2073 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2074 continue;
2075 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2076 // V must dominate Def for a valid replacement.
2077 if (!VPDT.dominates(V->getParent(), VPBB))
2078 continue;
2079 // Only keep flags present on both V and Def.
2080 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2081 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2082 Def->replaceAllUsesWith(V);
2083 continue;
2084 }
2085 CSEMap[Def] = Def;
2086 }
2087 }
2088}
2089
2090/// Move loop-invariant recipes out of the vector loop region in \p Plan.
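/// A minimal sketch (illustrative names only): a recipe whose operands are all
/// defined outside the loop region, e.g.
///
///   vector.body:
///     %inv = mul %live.in.a, %live.in.b
///
/// is moved to the vector preheader, provided it has no side effects, does not
/// read memory and is not a phi.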
2091static void licm(VPlan &Plan) {
2092 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2093
2094 // Return true if we do not know how to (mechanically) hoist a given recipe
2095 // out of a loop region. Does not address legality concerns such as aliasing
2096 // or speculation safety.
2097 auto CannotHoistRecipe = [](VPRecipeBase &R) {
2098 // Allocas cannot be hoisted.
2099 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2100 return RepR && RepR->getOpcode() == Instruction::Alloca;
2101 };
2102
2103 // Hoist any loop invariant recipes from the vector loop region to the
2104 // preheader. Perform a shallow traversal of the vector loop region, to
2105 // exclude recipes in replicate regions.
2106 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2108 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2109 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2110 if (CannotHoistRecipe(R))
2111 continue;
2112 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
2113 // their memory location is not modified in the vector loop.
2114 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
2115 any_of(R.operands(), [](VPValue *Op) {
2116 return !Op->isDefinedOutsideLoopRegions();
2117 }))
2118 continue;
2119 R.moveBefore(*Preheader, Preheader->end());
2120 }
2121 }
2122}
2123
2125 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2126 if (Plan.hasScalarVFOnly())
2127 return;
2128 // Keep track of created truncates, so they can be re-used. Note that we
2129 // cannot use RAUW after creating a new truncate, as this could make
2130 // other uses have different types for their operands, making them invalidly
2131 // typed.
2133 VPTypeAnalysis TypeInfo(Plan);
2134 VPBasicBlock *PH = Plan.getVectorPreheader();
2137 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2140 &R))
2141 continue;
2142
2143 VPValue *ResultVPV = R.getVPSingleValue();
2144 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2145 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2146 if (!NewResSizeInBits)
2147 continue;
2148
2149 // If the value wasn't vectorized, we must maintain the original scalar
2150 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2151 // skip casts which do not need to be handled explicitly here, as
2152 // redundant casts will be removed during recipe simplification.
2154 continue;
2155
2156 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2157 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2158 assert(OldResTy->isIntegerTy() && "only integer types supported");
2159 (void)OldResSizeInBits;
2160
2161 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2162
2163 // Any wrapping introduced by shrinking this operation shouldn't be
2164 // considered undefined behavior. So, we can't unconditionally copy
2165 // arithmetic wrapping flags to VPW.
2166 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2167 VPW->dropPoisonGeneratingFlags();
2168
2169 if (OldResSizeInBits != NewResSizeInBits &&
2170 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2171 // Extend result to original width.
2172 auto *Ext =
2173 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2174 Ext->insertAfter(&R);
2175 ResultVPV->replaceAllUsesWith(Ext);
2176 Ext->setOperand(0, ResultVPV);
2177 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2178 } else {
2179 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2180 "Only ICmps should not need extending the result.");
2181 }
2182
2183 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2185 continue;
2186
2187 // Shrink operands by introducing truncates as needed.
2188 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2189 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2190 auto *Op = R.getOperand(Idx);
2191 unsigned OpSizeInBits =
2193 if (OpSizeInBits == NewResSizeInBits)
2194 continue;
2195 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2196 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2197 VPWidenCastRecipe *NewOp =
2198 IterIsEmpty
2199 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy,
2200 VPIRFlags::TruncFlagsTy(false, false))
2201 : ProcessedIter->second;
2202 R.setOperand(Idx, NewOp);
2203 if (!IterIsEmpty)
2204 continue;
2205 ProcessedIter->second = NewOp;
2206 if (!Op->isLiveIn()) {
2207 NewOp->insertBefore(&R);
2208 } else {
2209 PH->appendRecipe(NewOp);
2210 }
2211 }
2212
2213 }
2214 }
2215}
2216
2220 VPValue *Cond;
2221 // Skip blocks that are not terminated by BranchOnCond.
2222 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2223 continue;
2224
2225 assert(VPBB->getNumSuccessors() == 2 &&
2226 "Two successors expected for BranchOnCond");
2227 unsigned RemovedIdx;
2228 if (match(Cond, m_True()))
2229 RemovedIdx = 1;
2230 else if (match(Cond, m_False()))
2231 RemovedIdx = 0;
2232 else
2233 continue;
2234
2235 VPBasicBlock *RemovedSucc =
2236 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2237 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2238 "There must be a single edge between VPBB and its successor");
2239 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2240 // these recipes.
2241 for (VPRecipeBase &R : RemovedSucc->phis())
2242 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2243
2244 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2245 // automatically on VPlan destruction if it becomes unreachable.
2246 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2247 VPBB->back().eraseFromParent();
2248 }
2249}
2250
2269
2270// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2271// the loop terminator with a branch-on-cond recipe with the negated
2272// active-lane-mask as operand. Note that this turns the loop into an
2273 // uncountable one. Only the existing terminator is replaced; all other existing
2274// recipes/users remain unchanged, except for poison-generating flags being
2275// dropped from the canonical IV increment. Return the created
2276// VPActiveLaneMaskPHIRecipe.
2277//
2278// The function uses the following definitions:
2279//
2280// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2281// calculate-trip-count-minus-VF (original TC) : original TC
2282// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2283// CanonicalIVPhi : CanonicalIVIncrement
2284// %StartV is the canonical induction start value.
2285//
2286// The function adds the following recipes:
2287//
2288// vector.ph:
2289// %TripCount = calculate-trip-count-minus-VF (original TC)
2290// [if DataWithControlFlowWithoutRuntimeCheck]
2291// %EntryInc = canonical-iv-increment-for-part %StartV
2292// %EntryALM = active-lane-mask %EntryInc, %TripCount
2293//
2294// vector.body:
2295// ...
2296// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2297// ...
2298// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2299// %ALM = active-lane-mask %InLoopInc, TripCount
2300// %Negated = Not %ALM
2301// branch-on-cond %Negated
2302//
2305 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2306 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2307 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2308 VPValue *StartV = CanonicalIVPHI->getStartValue();
2309
2310 auto *CanonicalIVIncrement =
2311 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2312 // TODO: Check if dropping the flags is needed if
2313 // !DataAndControlFlowWithoutRuntimeCheck.
2314 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2315 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2316 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2317 // we have to take unrolling into account. Each part needs to start at
2318 // Part * VF
2319 auto *VecPreheader = Plan.getVectorPreheader();
2320 VPBuilder Builder(VecPreheader);
2321
2322 // Create the ActiveLaneMask instruction using the correct start values.
2323 VPValue *TC = Plan.getTripCount();
2324
2325 VPValue *TripCount, *IncrementValue;
2327 // When the loop is guarded by a runtime overflow check for the loop
2328 // induction variable increment by VF, we can increment the value before
2329 // the get.active.lane.mask and use the unmodified trip count.
2330 IncrementValue = CanonicalIVIncrement;
2331 TripCount = TC;
2332 } else {
2333 // When avoiding a runtime check, the active.lane.mask inside the loop
2334 // uses a modified trip count and the induction variable increment is
2335 // done after the active.lane.mask intrinsic is called.
2336 IncrementValue = CanonicalIVPHI;
2337 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2338 {TC}, DL);
2339 }
2340 auto *EntryIncrement = Builder.createOverflowingOp(
2341 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2342 "index.part.next");
2343
2344 // Create the active lane mask instruction in the VPlan preheader.
2345 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2346 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2347 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2348 {EntryIncrement, TC, ALMMultiplier}, DL,
2349 "active.lane.mask.entry");
2350
2351 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2352 // preheader ActiveLaneMask instruction.
2353 auto *LaneMaskPhi =
2355 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2356
2357 // Create the active lane mask for the next iteration of the loop before the
2358 // original terminator.
2359 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2360 Builder.setInsertPoint(OriginalTerminator);
2361 auto *InLoopIncrement =
2362 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2363 {IncrementValue}, {false, false}, DL);
2364 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2365 {InLoopIncrement, TripCount, ALMMultiplier},
2366 DL, "active.lane.mask.next");
2367 LaneMaskPhi->addOperand(ALM);
2368
2369 // Replace the original terminator with BranchOnCond. We have to invert the
2370 // mask here because a true condition means jumping to the exit block.
2371 auto *NotMask = Builder.createNot(ALM, DL);
2372 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2373 OriginalTerminator->eraseFromParent();
2374 return LaneMaskPhi;
2375}
2376
2377/// Collect the header mask with the pattern:
2378/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2379/// TODO: Introduce explicit recipe for header-mask instead of searching
2380/// for the header-mask pattern manually.
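/// In VPlan form, the header mask typically looks like (illustrative):
///
///   %wide.iv = WIDEN-CANONICAL-INDUCTION %canonical.iv
///   %mask    = icmp ule %wide.iv, %backedge.taken.count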
2382 SmallVector<VPValue *> WideCanonicalIVs;
2383 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2387 "Must have at most one VPWideCanonicalIVRecipe");
2388 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2389 auto *WideCanonicalIV =
2390 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2391 WideCanonicalIVs.push_back(WideCanonicalIV);
2392 }
2393
2394 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2395 // version of the canonical induction.
2396 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2397 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2398 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2399 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2400 WideCanonicalIVs.push_back(WidenOriginalIV);
2401 }
2402
2403 // Walk users of wide canonical IVs and find the single compare of the form
2404 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2405 VPSingleDefRecipe *HeaderMask = nullptr;
2406 for (auto *Wide : WideCanonicalIVs) {
2407 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2408 auto *VPI = dyn_cast<VPInstruction>(U);
2409 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2410 continue;
2411
2412 assert(VPI->getOperand(0) == Wide &&
2413 "WidenCanonicalIV must be the first operand of the compare");
2414 assert(!HeaderMask && "Multiple header masks found?");
2415 HeaderMask = VPI;
2416 }
2417 }
2418 return HeaderMask;
2419}
2420
2422 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2425 UseActiveLaneMaskForControlFlow) &&
2426 "DataAndControlFlowWithoutRuntimeCheck implies "
2427 "UseActiveLaneMaskForControlFlow");
2428
2429 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2431 assert(FoundWidenCanonicalIVUser &&
2432 "Must have widened canonical IV when tail folding!");
2433 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2434 auto *WideCanonicalIV =
2435 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2436 VPSingleDefRecipe *LaneMask;
2437 if (UseActiveLaneMaskForControlFlow) {
2440 } else {
2441 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2442 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2443 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2444 LaneMask =
2445 B.createNaryOp(VPInstruction::ActiveLaneMask,
2446 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2447 nullptr, "active.lane.mask");
2448 }
2449
2450 // Walk users of WideCanonicalIV and replace the header mask of the form
2451 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2452 // removing the old one to ensure there is always only a single header mask.
2453 HeaderMask->replaceAllUsesWith(LaneMask);
2454 HeaderMask->eraseFromParent();
2455}
2456
2457/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2458/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2459/// recipe could be created.
2460/// \p HeaderMask Header Mask.
2461/// \p CurRecipe Recipe to be transformed.
2462/// \p TypeInfo VPlan-based type analysis.
2463/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2464/// \p EVL The explicit vector length parameter of vector-predication
2465/// intrinsics.
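/// A rough sketch for a widened load (illustrative names only):
///
///   Before: WIDEN %v = load %addr, mask = logical-and(%header.mask, %m)
///   After:  WIDEN %v = vp.load %addr, mask = %m, EVL = %evl
///
/// Similar rewrites apply to stores, interleave accesses, reductions, and
/// selects on the header mask (which become vp.merge).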
2467 VPRecipeBase &CurRecipe,
2468 VPTypeAnalysis &TypeInfo,
2469 VPValue &AllOneMask, VPValue &EVL) {
2470 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2471 // header mask.
2472 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2473 assert(OrigMask && "Unmasked recipe when folding tail");
2474 // HeaderMask will be handled using EVL.
2475 VPValue *Mask;
2476 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2477 return Mask;
2478 return HeaderMask == OrigMask ? nullptr : OrigMask;
2479 };
2480
2481 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2482 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2483 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2484 if (!EndPtr)
2485 return Addr;
2486 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2487 "VPVectorEndPointerRecipe with non-VF VF operand?");
2488 assert(
2489 all_of(EndPtr->users(),
2490 [](VPUser *U) {
2491 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2492 }) &&
2493 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2494 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2495 EVLAddr->insertBefore(&CurRecipe);
2496 EVLAddr->setOperand(1, &EVL);
2497 return EVLAddr;
2498 };
2499
2502 VPValue *NewMask = GetNewMask(L->getMask());
2503 VPValue *NewAddr = GetNewAddr(L->getAddr());
2504 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2505 })
2506 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2507 VPValue *NewMask = GetNewMask(S->getMask());
2508 VPValue *NewAddr = GetNewAddr(S->getAddr());
2509 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2510 })
2511 .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
2512 VPValue *NewMask = GetNewMask(IR->getMask());
2513 return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
2514 })
2515 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2516 VPValue *NewMask = GetNewMask(Red->getCondOp());
2517 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2518 })
2519 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2520 VPValue *LHS, *RHS;
2521 // Transform select with a header mask condition
2522 // select(header_mask, LHS, RHS)
2523 // into vector predication merge.
2524 // vp.merge(all-true, LHS, RHS, EVL)
2525 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2526 m_VPValue(RHS))))
2527 return nullptr;
2528 // Use all true as the condition because this transformation is
2529 // limited to selects whose condition is a header mask.
2530 return new VPWidenIntrinsicRecipe(
2531 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2532 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2533 })
2534 .Default([&](VPRecipeBase *R) { return nullptr; });
2535}
2536
2537/// Replace recipes with their EVL variants.
2539 VPTypeAnalysis TypeInfo(Plan);
2540 VPValue *AllOneMask = Plan.getTrue();
2541 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2542 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2543
2544 assert(all_of(Plan.getVF().users(),
2547 "User of VF that we can't transform to EVL.");
2548 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2550 });
2551
2552 assert(all_of(Plan.getVFxUF().users(),
2553 [&Plan](VPUser *U) {
2554 return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()),
2555 m_Specific(&Plan.getVFxUF()))) ||
2556 isa<VPWidenPointerInductionRecipe>(U);
2557 }) &&
2558 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2559 "increment of the canonical induction.");
2560 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2561 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2562 // canonical induction must not be updated.
2564 });
2565
2566 // Defer erasing recipes till the end so that we don't invalidate the
2567 // VPTypeAnalysis cache.
2569
2570 // Create a scalar phi to track the previous EVL if the plan contains any
2571 // fixed-order recurrences.
2572 bool ContainsFORs =
2574 if (ContainsFORs) {
2575 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2576 VPValue *MaxEVL = &Plan.getVF();
2577 // Emit VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2578 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2579 MaxEVL = Builder.createScalarZExtOrTrunc(
2580 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2581 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2582
2583 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2584 VPValue *PrevEVL = Builder.createScalarPhi(
2585 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2586
2589 for (VPRecipeBase &R : *VPBB) {
2590 VPValue *V1, *V2;
2591 if (!match(&R,
2593 m_VPValue(V1), m_VPValue(V2))))
2594 continue;
2595 VPValue *Imm = Plan.getOrAddLiveIn(
2598 Intrinsic::experimental_vp_splice,
2599 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2600 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2601 VPSplice->insertBefore(&R);
2602 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2603 ToErase.push_back(&R);
2604 }
2605 }
2606 }
2607
2608 VPValue *HeaderMask = findHeaderMask(Plan);
2609 if (!HeaderMask)
2610 return;
2611
2612 // Replace header masks with a mask equivalent to predicating by EVL:
2613 //
2614 // icmp ule widen-canonical-iv backedge-taken-count
2615 // ->
2616 // icmp ult step-vector, EVL
2617 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2618 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2619 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2620 VPValue *EVLMask = Builder.createICmp(
2622 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2623 HeaderMask->replaceAllUsesWith(EVLMask);
2624 ToErase.push_back(HeaderMask->getDefiningRecipe());
2625
2626 // Try to optimize header mask recipes away to their EVL variants.
2627 // TODO: Split optimizeMaskToEVL out and move into
2628 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2629 // tryToBuildVPlanWithVPRecipes beforehand.
2630 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2631 auto *CurRecipe = cast<VPRecipeBase>(U);
2632 VPRecipeBase *EVLRecipe =
2633 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2634 if (!EVLRecipe)
2635 continue;
2636
2637 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2638 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2639 "New recipe must define the same number of values as the "
2640 "original.");
2641 EVLRecipe->insertBefore(CurRecipe);
2643 EVLRecipe)) {
2644 for (unsigned I = 0; I < NumDefVal; ++I) {
2645 VPValue *CurVPV = CurRecipe->getVPValue(I);
2646 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2647 }
2648 }
2649 ToErase.push_back(CurRecipe);
2650 }
2651 // Remove dead EVL mask.
2652 if (EVLMask->getNumUsers() == 0)
2653 ToErase.push_back(EVLMask->getDefiningRecipe());
2654
2655 for (VPRecipeBase *R : reverse(ToErase)) {
2656 SmallVector<VPValue *> PossiblyDead(R->operands());
2657 R->eraseFromParent();
2658 for (VPValue *Op : PossiblyDead)
2660 }
2661}
2662
2663/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan, and replace
2664/// all uses of VPCanonicalIVPHIRecipe, except the canonical IV increment,
2665/// with a VPEVLBasedIVPHIRecipe. After this transformation,
2666/// VPCanonicalIVPHIRecipe is used only for counting loop iterations.
2667///
2668/// The function uses the following definitions:
2669/// %StartV is the canonical induction start value.
2670///
2671/// The function adds the following recipes:
2672///
2673/// vector.ph:
2674/// ...
2675///
2676/// vector.body:
2677/// ...
2678/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2679/// [ %NextEVLIV, %vector.body ]
2680/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2681/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2682/// ...
2683/// %OpEVL = cast i32 %VPEVL to IVSize
2684/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2685/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2686/// ...
2687///
2688/// If MaxSafeElements is provided, the function adds the following recipes:
2689/// vector.ph:
2690/// ...
2691///
2692/// vector.body:
2693/// ...
2694/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2695/// [ %NextEVLIV, %vector.body ]
2696/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2697/// %cmp = cmp ult %AVL, MaxSafeElements
2698/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2699/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2700/// ...
2701/// %OpEVL = cast i32 %VPEVL to IVSize
2702/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2703/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2704/// ...
2705///
2707 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2708 if (Plan.hasScalarVFOnly())
2709 return;
2711
2712 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2713 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2714 VPValue *StartV = CanonicalIVPHI->getStartValue();
2715
2716 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2717 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2718 EVLPhi->insertAfter(CanonicalIVPHI);
2719 VPBuilder Builder(Header, Header->getFirstNonPhi());
2720 // Create the AVL (application vector length), starting from TC -> 0 in steps
2721 // of EVL.
2722 VPPhi *AVLPhi = Builder.createScalarPhi(
2723 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2724 VPValue *AVL = AVLPhi;
2725
2726 if (MaxSafeElements) {
2727 // Clamp the AVL to MaxSafeElements to ensure correct loop emission.
2728 VPValue *AVLSafe =
2729 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2730 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2731 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2732 "safe_avl");
2733 }
2734 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2736
2737 auto *CanonicalIVIncrement =
2738 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2739 Builder.setInsertPoint(CanonicalIVIncrement);
2740 VPValue *OpVPEVL = VPEVL;
2741
2742 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2743 OpVPEVL = Builder.createScalarZExtOrTrunc(
2744 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2745
2746 auto *NextEVLIV = Builder.createOverflowingOp(
2747 Instruction::Add, {OpVPEVL, EVLPhi},
2748 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2749 CanonicalIVIncrement->hasNoSignedWrap()},
2750 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2751 EVLPhi->addOperand(NextEVLIV);
2752
2753 VPValue *NextAVL = Builder.createOverflowingOp(
2754 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2755 DebugLoc::getCompilerGenerated(), "avl.next");
2756 AVLPhi->addOperand(NextAVL);
2757
2758 transformRecipestoEVLRecipes(Plan, *VPEVL);
2759
2760 // Replace all uses of VPCanonicalIVPHIRecipe by
2761 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2762 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2763 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2764 // TODO: support unroll factor > 1.
2765 Plan.setUF(1);
2766}
2767
2769 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2770 // There should be only one EVL PHI in the entire plan.
2771 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2772
2775 for (VPRecipeBase &R : VPBB->phis())
2776 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2777 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2778 EVLPhi = PhiR;
2779 }
2780
2781 // Early return if no EVL PHI is found.
2782 if (!EVLPhi)
2783 return;
2784
2785 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2786 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2787 VPValue *AVL;
2788 [[maybe_unused]] bool FoundAVL =
2789 match(EVLIncrement,
2790 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2791 assert(FoundAVL && "Didn't find AVL?");
2792
2793 // The AVL may be capped to a safe distance.
2794 VPValue *SafeAVL;
2795 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2796 AVL = SafeAVL;
2797
2798 VPValue *AVLNext;
2799 [[maybe_unused]] bool FoundAVLNext =
2801 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2802 assert(FoundAVLNext && "Didn't find AVL backedge?");
2803
2804 // Convert EVLPhi to concrete recipe.
2805 auto *ScalarR =
2806 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2807 EVLPhi->getDebugLoc(), "evl.based.iv");
2808 EVLPhi->replaceAllUsesWith(ScalarR);
2809 EVLPhi->eraseFromParent();
2810
2811 // Replace CanonicalIVInc with EVL-PHI increment.
2812 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2813 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2814 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2815 m_Specific(&Plan.getVFxUF()))) &&
2816 "Unexpected canonical iv");
2817 Backedge->replaceAllUsesWith(EVLIncrement);
2818
2819 // Remove unused phi and increment.
2820 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2821 CanonicalIVIncrement->eraseFromParent();
2822 CanonicalIV->eraseFromParent();
2823
2824 // Replace the use of VectorTripCount in the latch-exiting block.
2825 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2826 // After: (branch-on-cond eq AVLNext, 0)
2827
2828 VPBasicBlock *LatchExiting =
2829 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2830 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2831 // Skip single-iteration loop region
2832 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2833 return;
2834 assert(LatchExitingBr &&
2835 match(LatchExitingBr,
2836 m_BranchOnCount(m_VPValue(EVLIncrement),
2837 m_Specific(&Plan.getVectorTripCount()))) &&
2838 "Unexpected terminator in EVL loop");
2839
2840 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2841 VPBuilder Builder(LatchExitingBr);
2842 VPValue *Cmp =
2843 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2845 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2846 LatchExitingBr->eraseFromParent();
2847}
2848
2850 VPlan &Plan, PredicatedScalarEvolution &PSE,
2851 const DenseMap<Value *, const SCEV *> &StridesMap) {
2852 // Replace VPValues for known constant strides guaranteed by predicated
2853 // scalar evolution.
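  // A minimal sketch (illustrative): if versioning guarantees %stride == 1,
  // the live-in VPValue for %stride is replaced by the constant 1, and a
  // live-in for a sext/zext of %stride by the suitably extended constant, in
  // recipes that can rely on the versioned value.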
2854 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2855 auto *R = cast<VPRecipeBase>(&U);
2856 return R->getParent()->getParent() ||
2857 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2858 };
2859 ValueToSCEVMapTy RewriteMap;
2860 for (const SCEV *Stride : StridesMap.values()) {
2861 using namespace SCEVPatternMatch;
2862 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2863 const APInt *StrideConst;
2864 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2865 // Only handle constant strides for now.
2866 continue;
2867
2868 auto *CI =
2869 Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
2870 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2871 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2872
2873 // The versioned value may not be used in the loop directly but through a
2874 // sext/zext. Add new live-ins in those cases.
2875 for (Value *U : StrideV->users()) {
2877 continue;
2878 VPValue *StrideVPV = Plan.getLiveIn(U);
2879 if (!StrideVPV)
2880 continue;
2881 unsigned BW = U->getType()->getScalarSizeInBits();
2882 APInt C =
2883 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2884 VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
2885 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2886 }
2887 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
2888 }
2889
2890 for (VPRecipeBase &R : *Plan.getEntry()) {
2891 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
2892 if (!ExpSCEV)
2893 continue;
2894 const SCEV *ScevExpr = ExpSCEV->getSCEV();
2895 auto *NewSCEV =
2896 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
2897 if (NewSCEV != ScevExpr) {
2898 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
2899 ExpSCEV->replaceAllUsesWith(NewExp);
2900 if (Plan.getTripCount() == ExpSCEV)
2901 Plan.resetTripCount(NewExp);
2902 }
2903 }
2904}
2905
2907 VPlan &Plan,
2908 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2909 // Collect recipes in the backward slice of `Root` that may generate a poison
2910 // value that is used after vectorization.
2912 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2914 Worklist.push_back(Root);
2915
2916 // Traverse the backward slice of Root through its use-def chain.
2917 while (!Worklist.empty()) {
2918 VPRecipeBase *CurRec = Worklist.pop_back_val();
2919
2920 if (!Visited.insert(CurRec).second)
2921 continue;
2922
2923 // Prune search if we find another recipe generating a widen memory
2924 // instruction. Widen memory instructions involved in address computation
2925 // will lead to gather/scatter instructions, which don't need to be
2926 // handled.
2928 VPHeaderPHIRecipe>(CurRec))
2929 continue;
2930
2931 // This recipe contributes to the address computation of a widen
2932 // load/store. If the underlying instruction has poison-generating flags,
2933 // drop them directly.
2934 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2935 VPValue *A, *B;
2936 // Dropping disjoint from an OR may yield incorrect results, as some
2937 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2938 // for dependence analysis). Instead, replace it with an equivalent Add.
2939 // This is possible as all users of the disjoint OR only access lanes
2940 // where the operands are disjoint or poison otherwise.
2941 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
2942 RecWithFlags->isDisjoint()) {
2943 VPBuilder Builder(RecWithFlags);
2944 VPInstruction *New = Builder.createOverflowingOp(
2945 Instruction::Add, {A, B}, {false, false},
2946 RecWithFlags->getDebugLoc());
2947 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2948 RecWithFlags->replaceAllUsesWith(New);
2949 RecWithFlags->eraseFromParent();
2950 CurRec = New;
2951 } else
2952 RecWithFlags->dropPoisonGeneratingFlags();
2953 } else {
2956 (void)Instr;
2957 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2958 "found instruction with poison generating flags not covered by "
2959 "VPRecipeWithIRFlags");
2960 }
2961
2962 // Add new definitions to the worklist.
2963 for (VPValue *Operand : CurRec->operands())
2964 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2965 Worklist.push_back(OpDef);
2966 }
2967 });
2968
2969 // Traverse all the recipes in the VPlan and collect the poison-generating
2970 // recipes in the backward slice starting at the address of a
2971 // VPWidenMemoryRecipe or VPInterleaveRecipe.
2972 auto Iter = vp_depth_first_deep(Plan.getEntry());
2974 for (VPRecipeBase &Recipe : *VPBB) {
2975 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
2976 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2977 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2978 if (AddrDef && WidenRec->isConsecutive() &&
2979 BlockNeedsPredication(UnderlyingInstr.getParent()))
2980 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2981 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
2982 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2983 if (AddrDef) {
2984 // Check if any member of the interleave group needs predication.
2985 const InterleaveGroup<Instruction> *InterGroup =
2986 InterleaveRec->getInterleaveGroup();
2987 bool NeedPredication = false;
2988 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2989 I < NumMembers; ++I) {
2990 Instruction *Member = InterGroup->getMember(I);
2991 if (Member)
2992 NeedPredication |= BlockNeedsPredication(Member->getParent());
2993 }
2994
2995 if (NeedPredication)
2996 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2997 }
2998 }
2999 }
3000 }
3001}
3002
3004 VPlan &Plan,
3006 &InterleaveGroups,
3007 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3008 if (InterleaveGroups.empty())
3009 return;
3010
3011 // Interleave memory: for each Interleave Group we marked earlier as relevant
3012 // for this VPlan, replace the Recipes widening its memory instructions with a
3013 // single VPInterleaveRecipe at its insertion point.
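  // A rough sketch for a factor-2 load group (illustrative names only):
  //
  //   Before:                      After:
  //     WIDEN %a = load %ptr         INTERLEAVE-GROUP %ptr -> %a, %b
  //     WIDEN %b = load %ptr.next
  //
  // The member recipes are erased and their users are rewired to the values
  // defined by the single VPInterleaveRecipe.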
3014 VPDominatorTree VPDT;
3015 VPDT.recalculate(Plan);
3016 for (const auto *IG : InterleaveGroups) {
3017 auto *Start =
3018 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3019 VPIRMetadata InterleaveMD(*Start);
3020 SmallVector<VPValue *, 4> StoredValues;
3021 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3022 StoredValues.push_back(StoreR->getStoredValue());
3023 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3024 Instruction *MemberI = IG->getMember(I);
3025 if (!MemberI)
3026 continue;
3027 VPWidenMemoryRecipe *MemoryR =
3028 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3029 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3030 StoredValues.push_back(StoreR->getStoredValue());
3031 InterleaveMD.intersect(*MemoryR);
3032 }
3033
3034 bool NeedsMaskForGaps =
3035 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3036 (!StoredValues.empty() && !IG->isFull());
3037
3038 Instruction *IRInsertPos = IG->getInsertPos();
3039 auto *InsertPos =
3040 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3041
3043 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3044 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3045 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3046
3047 // Get or create the start address for the interleave group.
3048 VPValue *Addr = Start->getAddr();
3049 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3050 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3051 // We cannot re-use the address of member zero because it does not
3052 // dominate the insert position. Instead, use the address of the insert
3053 // position and create a PtrAdd adjusting it to the address of member
3054 // zero.
3055 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3056 // InsertPos or sink loads above zero members to join it.
3057 assert(IG->getIndex(IRInsertPos) != 0 &&
3058 "index of insert position shouldn't be zero");
3059 auto &DL = IRInsertPos->getDataLayout();
3060 APInt Offset(32,
3061 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3062 IG->getIndex(IRInsertPos),
3063 /*IsSigned=*/true);
3064 VPValue *OffsetVPV =
3065 Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
3066 VPBuilder B(InsertPos);
3067 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3068 }
3069 // If the group is reverse, adjust the index to refer to the last vector
3070 // lane instead of the first. We adjust the index from the first vector
3071 // lane, rather than directly getting the pointer for lane VF - 1, because
3072 // the pointer operand of the interleaved access is supposed to be uniform.
3073 if (IG->isReverse()) {
3074 auto *ReversePtr = new VPVectorEndPointerRecipe(
3075 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3076 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3077 ReversePtr->insertBefore(InsertPos);
3078 Addr = ReversePtr;
3079 }
3080 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3081 InsertPos->getMask(), NeedsMaskForGaps,
3082 InterleaveMD, InsertPos->getDebugLoc());
3083 VPIG->insertBefore(InsertPos);
3084
3085 unsigned J = 0;
3086 for (unsigned i = 0; i < IG->getFactor(); ++i)
3087 if (Instruction *Member = IG->getMember(i)) {
3088 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3089 if (!Member->getType()->isVoidTy()) {
3090 VPValue *OriginalV = MemberR->getVPSingleValue();
3091 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3092 J++;
3093 }
3094 MemberR->eraseFromParent();
3095 }
3096 }
3097}
3098
3099/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3100/// value, phi and backedge value. In the following example:
3101///
3102/// vector.ph:
3103/// Successor(s): vector loop
3104///
3105/// <x1> vector loop: {
3106/// vector.body:
3107/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3108/// ...
3109/// EMIT branch-on-count ...
3110/// No successors
3111/// }
3112///
3113/// WIDEN-INDUCTION will get expanded to:
3114///
3115/// vector.ph:
3116/// ...
3117/// vp<%induction.start> = ...
3118/// vp<%induction.increment> = ...
3119///
3120/// Successor(s): vector loop
3121///
3122/// <x1> vector loop: {
3123/// vector.body:
3124/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3125/// ...
3126/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3127/// EMIT branch-on-count ...
3128/// No successors
3129/// }
3130static void
3132 VPTypeAnalysis &TypeInfo) {
3133 VPlan *Plan = WidenIVR->getParent()->getPlan();
3134 VPValue *Start = WidenIVR->getStartValue();
3135 VPValue *Step = WidenIVR->getStepValue();
3136 VPValue *VF = WidenIVR->getVFValue();
3137 DebugLoc DL = WidenIVR->getDebugLoc();
3138
3139 // The value from the original loop to which we are mapping the new induction
3140 // variable.
3141 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3142
3143 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3146 // FIXME: The newly created binary instructions should contain nsw/nuw
3147 // flags, which can be found from the original scalar operations.
3148 VPIRFlags Flags;
3149 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3150 AddOp = Instruction::Add;
3151 MulOp = Instruction::Mul;
3152 } else {
3153 AddOp = ID.getInductionOpcode();
3154 MulOp = Instruction::FMul;
3155 Flags = ID.getInductionBinOp()->getFastMathFlags();
3156 }
3157
3158 // If the phi is truncated, truncate the start and step values.
3159 VPBuilder Builder(Plan->getVectorPreheader());
3160 Type *StepTy = TypeInfo.inferScalarType(Step);
3161 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3162 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3163 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3164 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3165 StepTy = Ty;
3166 }
3167
3168 // Construct the initial value of the vector IV in the vector loop preheader.
3169 Type *IVIntTy =
3170     IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3171 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3172 if (StepTy->isFloatingPointTy())
3173 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3174
3175 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3176 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3177
3178 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3179 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3180 DebugLoc::getUnknown(), "induction");
3181
3182 // Create the widened phi of the vector IV.
3183 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3184 WidenIVR->getDebugLoc(), "vec.ind");
3185 WidePHI->insertBefore(WidenIVR);
3186
3187 // Create the backedge value for the vector IV.
3188 VPValue *Inc;
3189 VPValue *Prev;
3190 // If unrolled, use the increment and prev value from the operands.
3191 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3192 Inc = SplatVF;
3193 Prev = WidenIVR->getLastUnrolledPartOperand();
3194 } else {
3195 if (VPRecipeBase *R = VF->getDefiningRecipe())
3196 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3197 // Multiply the vectorization factor by the step using integer or
3198 // floating-point arithmetic as appropriate.
3199 if (StepTy->isFloatingPointTy())
3200 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3201 DL);
3202 else
3203 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3204 TypeInfo.inferScalarType(VF), DL);
3205
3206 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3207 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3208 Prev = WidePHI;
3209 }
3210
3211 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3212 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3213 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3214 WidenIVR->getDebugLoc(), "vec.ind.next");
3215
3216 WidePHI->addOperand(Next);
3217
3218 WidenIVR->replaceAllUsesWith(WidePHI);
3219}
3220
3221/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3222/// initial value, phi and backedge value. In the following example:
3223///
3224/// <x1> vector loop: {
3225/// vector.body:
3226/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3227/// ...
3228/// EMIT branch-on-count ...
3229/// }
3230///
3231/// WIDEN-POINTER-INDUCTION will get expanded to:
3232///
3233/// <x1> vector loop: {
3234/// vector.body:
3235/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3236/// EMIT %mul = mul %stepvector, %step
3237/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3238/// ...
3239/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3240/// EMIT branch-on-count ...
3241/// }
3242 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3243                                           VPTypeAnalysis &TypeInfo) {
3244 VPlan *Plan = R->getParent()->getPlan();
3245 VPValue *Start = R->getStartValue();
3246 VPValue *Step = R->getStepValue();
3247 VPValue *VF = R->getVFValue();
3248
3249 assert(R->getInductionDescriptor().getKind() ==
3250            InductionDescriptor::IK_PtrInduction &&
3251        "Not a pointer induction according to InductionDescriptor!");
3252 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3253 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3254 "Recipe should have been replaced");
3255
3256 VPBuilder Builder(R);
3257 DebugLoc DL = R->getDebugLoc();
3258
3259 // Build a scalar pointer phi.
3260 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3261
3262 // Create actual address geps that use the pointer phi as base and a
3263 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3264 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3265 Type *StepTy = TypeInfo.inferScalarType(Step);
3266 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3267 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3268 VPValue *PtrAdd = Builder.createNaryOp(
3269 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3270 R->replaceAllUsesWith(PtrAdd);
3271
3272 // Create the backedge value for the scalar pointer phi.
3273 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3274 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3275 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3276 DL);
3277 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3278
3279 VPValue *InductionGEP =
3280 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3281 ScalarPtrPhi->addOperand(InductionGEP);
3282}
3283
3284 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3285 // Replace loop regions with an explicit CFG.
3286 SmallVector<VPRegionBlock *> LoopRegions;
3287 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3288          vp_depth_first_deep(Plan.getEntry()))) {
3289 if (!R->isReplicator())
3290 LoopRegions.push_back(R);
3291 }
3292 for (VPRegionBlock *R : LoopRegions)
3293 R->dissolveToCFGLoop();
3294}
3295
3296 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3297 VPTypeAnalysis TypeInfo(Plan);
3298 SmallVector<VPRecipeBase *> ToRemove;
3299 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3300          vp_depth_first_deep(Plan.getEntry()))) {
3301 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3302 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3303 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3304 ToRemove.push_back(WidenIVR);
3305 continue;
3306 }
3307
3308 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3309 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3310 ToRemove.push_back(WidenIVR);
3311 continue;
3312 }
3313
3314 // Expand VPBlendRecipe into VPInstruction::Select.
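// E.g. a blend with incoming values v0, v1, v2 and masks m1, m2 becomes
// select(m2, v2, select(m1, v1, v0)).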
3315 VPBuilder Builder(&R);
3316 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3317 VPValue *Select = Blend->getIncomingValue(0);
3318 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3319 Select = Builder.createSelect(Blend->getMask(I),
3320 Blend->getIncomingValue(I), Select,
3321 R.getDebugLoc(), "predphi");
3322 Blend->replaceAllUsesWith(Select);
3323 ToRemove.push_back(Blend);
3324 }
3325
3326 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3327 Expr->decompose();
3328 ToRemove.push_back(Expr);
3329 }
3330
3331 VPValue *VectorStep;
3332 VPValue *ScalarStep;
3333 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3334                    m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3335 continue;
3336
3337 // Expand WideIVStep.
3338 auto *VPI = cast<VPInstruction>(&R);
3339 Type *IVTy = TypeInfo.inferScalarType(VPI);
3340 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3341 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3342                                   ? Instruction::UIToFP
3343 : Instruction::Trunc;
3344 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3345 }
3346
3347 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3348 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3349 ScalarStep =
3350 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3351 }
3352
3353 VPIRFlags Flags;
3354 if (IVTy->isFloatingPointTy())
3355 Flags = {VPI->getFastMathFlags()};
3356
3357 unsigned MulOpc =
3358 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3359 VPInstruction *Mul = Builder.createNaryOp(
3360 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3361 VectorStep = Mul;
3362 VPI->replaceAllUsesWith(VectorStep);
3363 ToRemove.push_back(VPI);
3364 }
3365 }
3366
3367 for (VPRecipeBase *R : ToRemove)
3368 R->eraseFromParent();
3369}
3370
3371 void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3372                                                  VPBasicBlock *EarlyExitVPBB,
3373 VPlan &Plan,
3374 VPBasicBlock *HeaderVPBB,
3375 VPBasicBlock *LatchVPBB) {
3376 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3377 if (!EarlyExitVPBB->getSinglePredecessor() &&
3378 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3379 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3380 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3381 "unsupported early exit VPBB");
3382 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3383 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3384 // of the phis.
3385 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3386 cast<VPIRPhi>(&R)->swapOperands();
3387 }
3388
3389 VPBuilder Builder(LatchVPBB->getTerminator());
3390 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3391 assert(
3392 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
3393 "Terminator must be be BranchOnCond");
3394 VPValue *CondOfEarlyExitingVPBB =
3395 EarlyExitingVPBB->getTerminator()->getOperand(0);
3396 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3397 ? CondOfEarlyExitingVPBB
3398 : Builder.createNot(CondOfEarlyExitingVPBB);
3399
3400 // Split the middle block and have it conditionally branch to the early exit
3401 // block if CondToEarlyExit.
3402 VPValue *IsEarlyExitTaken =
3403 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3404 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3405 VPBasicBlock *VectorEarlyExitVPBB =
3406 Plan.createVPBasicBlock("vector.early.exit");
3407 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3408 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3409 NewMiddle->swapSuccessors();
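// After the terminator added below, middle.split branches to
// vector.early.exit when any lane took the early exit and falls through to
// the original middle block otherwise.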
3410
3411 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3412
3413 // Update the exit phis in the early exit block.
3414 VPBuilder MiddleBuilder(NewMiddle);
3415 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3416 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3417 auto *ExitIRI = cast<VPIRPhi>(&R);
3418 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3419 // a single predecessor and 1 if it has two.
3420 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3421 if (ExitIRI->getNumOperands() != 1) {
3422 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3423 // predecessor. Extract its last lane.
3424 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3425 }
3426
3427 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3428 if (!IncomingFromEarlyExit->isLiveIn()) {
3429 // Update the incoming value from the early exit.
3430 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3431 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3432 "first.active.lane");
3433 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3434 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3435 nullptr, "early.exit.value");
3436 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3437 }
3438 }
3439 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3440
3441 // Replace the condition controlling the non-early exit from the vector loop
3442 // with one exiting if either the original condition of the vector latch is
3443 // true or the early exit has been taken.
3444 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3445 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3446 "Unexpected terminator");
3447 auto *IsLatchExitTaken =
3448 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3449 LatchExitingBranch->getOperand(1));
3450 auto *AnyExitTaken = Builder.createNaryOp(
3451 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3452 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3453 LatchExitingBranch->eraseFromParent();
3454}
3455
3456 /// This function tries to convert extended in-loop reductions to
3457/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3458/// valid. The created recipe must be decomposed to its constituent
3459/// recipes before execution.
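/// E.g. reduce.add(zext(A)) can be represented as a single extended reduction
/// when the target reports a cost for it that beats the separate extend plus
/// reduction.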
3460 static VPExpressionRecipe *
3461 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3462                                      VFRange &Range) {
3463 Type *RedTy = Ctx.Types.inferScalarType(Red);
3464 VPValue *VecOp = Red->getVecOp();
3465
3466 // Clamp the range if using extended-reduction is profitable.
3467 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
3468 Type *SrcTy) -> bool {
3469 return LoopVectorizationPlanner::getDecisionAndClampRange(
3470     [&](ElementCount VF) {
3471 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3473 InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3474 Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
3475 CostKind);
3476 InstructionCost ExtCost =
3477 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3478 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3479 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3480 },
3481 Range);
3482 };
3483
3484 VPValue *A;
3485 // Match reduce(ext(A)).
3486 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3487 IsExtendedRedValidAndClampRange(
3488 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3489 cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
3490 Instruction::CastOps::ZExt,
3491 Ctx.Types.inferScalarType(A)))
3492 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3493
3494 return nullptr;
3495}
3496
3497 /// This function tries to convert extended in-loop reductions to
3498/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3499/// and valid. The created VPExpressionRecipe must be decomposed to its
3500/// constituent recipes before execution. Patterns of the
3501/// VPExpressionRecipe:
3502/// reduce.add(mul(...)),
3503/// reduce.add(mul(ext(A), ext(B))),
3504/// reduce.add(ext(mul(ext(A), ext(B)))).
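/// For example, a dot-product style reduction such as
///   sum += (int32_t)a[i] * (int32_t)b[i]   // a, b of element type int8_t
/// corresponds to reduce.add(mul(sext(A), sext(B))) and can be costed as a
/// single multiply-accumulate reduction when profitable.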
3505 static VPExpressionRecipe *
3506 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3507                                           VPCostContext &Ctx, VFRange &Range) {
3508 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3509 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3510 return nullptr;
3511
3512 Type *RedTy = Ctx.Types.inferScalarType(Red);
3513
3514 // Clamp the range if using multiply-accumulate-reduction is profitable.
3515 auto IsMulAccValidAndClampRange =
3516 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
3517 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
3518 return LoopVectorizationPlanner::getDecisionAndClampRange(
3519     [&](ElementCount VF) {
3521 Type *SrcTy =
3522 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3523 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3524 InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
3525 isZExt, Opcode, RedTy, SrcVecTy, CostKind);
3526 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3527 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3528 InstructionCost ExtCost = 0;
3529 if (Ext0)
3530 ExtCost += Ext0->computeCost(VF, Ctx);
3531 if (Ext1)
3532 ExtCost += Ext1->computeCost(VF, Ctx);
3533 if (OuterExt)
3534 ExtCost += OuterExt->computeCost(VF, Ctx);
3535
3536 return MulAccCost.isValid() &&
3537 MulAccCost < ExtCost + MulCost + RedCost;
3538 },
3539 Range);
3540 };
3541
3542 VPValue *VecOp = Red->getVecOp();
3543 VPRecipeBase *Sub = nullptr;
3544 VPValue *A, *B;
3545 VPValue *Tmp = nullptr;
3546 // Sub reductions could have a sub between the add reduction and vec op.
3547 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3548 Sub = VecOp->getDefiningRecipe();
3549 VecOp = Tmp;
3550 }
3551 // Try to match reduce.add(mul(...)).
3552 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3553 auto *RecipeA =
3554 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3555 auto *RecipeB =
3556 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3557 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3558
3559 // Match reduce.add(mul(ext, ext)).
3560 if (RecipeA && RecipeB &&
3561 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
3562 match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3563 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3564 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
3565 Instruction::CastOps::ZExt,
3566 Mul, RecipeA, RecipeB, nullptr)) {
3567 if (Sub)
3568 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3569 cast<VPWidenRecipe>(Sub), Red);
3570 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3571 }
3572 // Match reduce.add(mul).
3573 // TODO: Add an expression type for this variant with a negated mul
3574 if (!Sub &&
3575 IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
3576 return new VPExpressionRecipe(Mul, Red);
3577 }
3578 // TODO: Add an expression type for negated versions of other expression
3579 // variants.
3580 if (Sub)
3581 return nullptr;
3582 // Match reduce.add(ext(mul(ext(A), ext(B)))).
3583 // All extend recipes must have the same opcode, or A == B,
3584 // which can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
3585 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
3586                                     m_ZExtOrSExt(m_VPValue()))))) {
3587 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3588 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3589 auto *Ext0 =
3590 cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
3591 auto *Ext1 =
3592 cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
3593 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3594 Ext0->getOpcode() == Ext1->getOpcode() &&
3595 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
3596 Instruction::CastOps::ZExt,
3597 Mul, Ext0, Ext1, Ext)) {
3598 auto *NewExt0 = new VPWidenCastRecipe(
3599 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3600 *Ext0, Ext0->getDebugLoc());
3601 NewExt0->insertBefore(Ext0);
3602
3603 VPWidenCastRecipe *NewExt1 = NewExt0;
3604 if (Ext0 != Ext1) {
3605 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3606 Ext->getResultType(), *Ext1, *Ext1,
3607 Ext1->getDebugLoc());
3608 NewExt1->insertBefore(Ext1);
3609 }
3610 Mul->setOperand(0, NewExt0);
3611 Mul->setOperand(1, NewExt1);
3612 Red->setOperand(1, Mul);
3613 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3614 }
3615 }
3616 return nullptr;
3617}
3618
3619 /// This function tries to create abstract recipes from the reduction recipe
3620 /// for the following optimizations and cost estimation.
3621 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3622                                                VPCostContext &Ctx,
3623 VFRange &Range) {
3624 VPExpressionRecipe *AbstractR = nullptr;
3625 auto IP = std::next(Red->getIterator());
3626 auto *VPBB = Red->getParent();
3627 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3628 AbstractR = MulAcc;
3629 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3630 AbstractR = ExtRed;
3631 // Cannot create abstract inloop reduction recipes.
3632 if (!AbstractR)
3633 return;
3634
3635 AbstractR->insertBefore(*VPBB, IP);
3636 Red->replaceAllUsesWith(AbstractR);
3637}
3638
3649
3650 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3651 if (Plan.hasScalarVFOnly())
3652 return;
3653
3654#ifndef NDEBUG
3655 VPDominatorTree VPDT;
3656 VPDT.recalculate(Plan);
3657#endif
3658
3659 SmallVector<VPValue *> VPValues;
3662 append_range(VPValues, Plan.getLiveIns());
3663 for (VPRecipeBase &R : *Plan.getEntry())
3664 append_range(VPValues, R.definedValues());
3665
3666 auto *VectorPreheader = Plan.getVectorPreheader();
3667 for (VPValue *VPV : VPValues) {
3669 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3670 isa<Constant>(VPV->getLiveInIRValue())))
3671 continue;
3672
3673 // Add explicit broadcast at the insert point that dominates all users.
3674 VPBasicBlock *HoistBlock = VectorPreheader;
3675 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3676 for (VPUser *User : VPV->users()) {
3677 if (User->usesScalars(VPV))
3678 continue;
3679 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3680 HoistPoint = HoistBlock->begin();
3681 else
3682 assert(VPDT.dominates(VectorPreheader,
3683 cast<VPRecipeBase>(User)->getParent()) &&
3684 "All users must be in the vector preheader or dominated by it");
3685 }
3686
3687 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3688 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3689 VPV->replaceUsesWithIf(Broadcast,
3690 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3691 return Broadcast != &U && !U.usesScalars(VPV);
3692 });
3693 }
3694}
3695
3697     VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3698     PredicatedScalarEvolution &PSE) {
3699 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3700 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3701
3702 VPValue *TC = Plan.getTripCount();
3703 // Skip cases for which the trip count may be non-trivial to materialize.
3704 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3705 // tail is required.
3706 if (!Plan.hasScalarTail() ||
3707     Plan.getMiddleBlock()->getSingleSuccessor() ==
3708         Plan.getScalarPreheader() ||
3709 !TC->isLiveIn())
3710 return;
3711
3712 // Materialize vector trip counts for constants early if it can simply
3713 // be computed as (Original TC / VF * UF) * VF * UF.
3714 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3715 // tail-folded loops.
3716 ScalarEvolution &SE = *PSE.getSE();
3717 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3718 if (!isa<SCEVConstant>(TCScev))
3719 return;
3720 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3721 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3722 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3723 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3724}
3725
3727                                                     VPBasicBlock *VectorPH) {
3728 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
3729 if (BTC->getNumUsers() == 0)
3730 return;
3731
3732 VPBuilder Builder(VectorPH, VectorPH->begin());
3733 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3734 auto *TCMO = Builder.createNaryOp(
3735 Instruction::Sub,
3736 {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
3737 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3738 BTC->replaceAllUsesWith(TCMO);
3739}
3740
3742 if (Plan.hasScalarVFOnly())
3743 return;
3744
3745 VPTypeAnalysis TypeInfo(Plan);
3746 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3747 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3748     vp_depth_first_shallow(Plan.getEntry()));
3749 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3750 vp_depth_first_shallow(LoopRegion->getEntry()));
3751 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3752 // VPInstructions, excluding ones in replicate regions. Those are not
3753 // materialized explicitly yet. Those vector users are still handled in
3754 // VPReplicateRegion::execute(), via shouldPack().
3755 // TODO: materialize build vectors for replicating recipes in replicating
3756 // regions.
3757 for (VPBasicBlock *VPBB :
3758 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3759 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3760 if (!isa<VPReplicateRecipe, VPInstruction>(&R))
3761     continue;
3762 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3763 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3764 VPRegionBlock *ParentRegion =
3765     cast<VPRecipeBase>(U)->getParent()->getParent();
3766 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3767 };
3768 if ((isa<VPReplicateRecipe>(DefR) &&
3769 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3770 (isa<VPInstruction>(DefR) &&
3772 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3773 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3774 continue;
3775
3776 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3777 unsigned Opcode = ScalarTy->isStructTy()
3778     ? VPInstruction::BuildStructVector
3779     : VPInstruction::BuildVector;
3780 auto *BuildVector = new VPInstruction(Opcode, {DefR});
3781 BuildVector->insertAfter(DefR);
3782
3783 DefR->replaceUsesWithIf(
3784 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3785 VPUser &U, unsigned) {
3786 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3787 });
3788 }
3789 }
3790 }
3791
3792 void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
3791
3793 VPBasicBlock *VectorPHVPBB,
3794 bool TailByMasking,
3795 bool RequiresScalarEpilogue) {
3796 VPValue &VectorTC = Plan.getVectorTripCount();
3797 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3798 // There's nothing to do if there are no users of the vector trip count or its
3799 // IR value has already been set.
3800 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3801 return;
3802
3803 VPValue *TC = Plan.getTripCount();
3804 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3805 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3806 VPValue *Step = &Plan.getVFxUF();
3807
3808 // If the tail is to be folded by masking, round the number of iterations N
3809 // up to a multiple of Step instead of rounding down. This is done by first
3810 // adding Step-1 and then rounding down. Note that it's ok if this addition
3811 // overflows: the vector induction variable will eventually wrap to zero given
3812 // that it starts at zero and its Step is a power of two; the loop will then
3813 // exit, with the last early-exit vector comparison also producing all-true.
3814 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3815 // is accounted for in emitIterationCountCheck that adds an overflow check.
3816 if (TailByMasking) {
3817 TC = Builder.createNaryOp(
3818 Instruction::Add,
3819 {TC, Builder.createNaryOp(
3820 Instruction::Sub,
3821 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
3822 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3823 }
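  // E.g. with an original trip count of 10 and Step = 4 under tail folding:
  // TC becomes 13, n.mod.vf = 1 and n.vec = 12, i.e. three masked vector
  // iterations cover all 10 scalar iterations.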
3824
3825 // Now we need to generate the expression for the part of the loop that the
3826 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3827 // iterations are not required for correctness, or N - Step, otherwise. Step
3828 // is equal to the vectorization factor (number of SIMD elements) times the
3829 // unroll factor (number of SIMD instructions).
3830 VPValue *R =
3831 Builder.createNaryOp(Instruction::URem, {TC, Step},
3832 DebugLoc::getCompilerGenerated(), "n.mod.vf");
3833
3834 // There are cases where we *must* run at least one iteration in the remainder
3835 // loop. See the cost model for when this can happen. If the step evenly
3836 // divides the trip count, we set the remainder to be equal to the step. If
3837 // the step does not evenly divide the trip count, no adjustment is necessary
3838 // since there will already be scalar iterations. Note that the minimum
3839 // iterations check ensures that N >= Step.
3840 if (RequiresScalarEpilogue) {
3841 assert(!TailByMasking &&
3842 "requiring scalar epilogue is not supported with fail folding");
3843 VPValue *IsZero = Builder.createICmp(
3844 CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
3845 R = Builder.createSelect(IsZero, Step, R);
3846 }
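  // E.g. with a trip count of 12 and Step = 4: normally n.vec = 12, but when
  // a scalar epilogue is required the remainder is bumped from 0 to 4, giving
  // n.vec = 8 and leaving four iterations for the scalar loop.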
3847
3848 VPValue *Res = Builder.createNaryOp(
3849 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
3850 VectorTC.replaceAllUsesWith(Res);
3851}
3852
3854 ElementCount VFEC) {
3855 VPBuilder Builder(VectorPH, VectorPH->begin());
3856 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3857 VPValue &VF = Plan.getVF();
3858 VPValue &VFxUF = Plan.getVFxUF();
3859 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
3860 // used.
3861 // TODO: Assert that they aren't used.
3862
3863 // If there are no users of the runtime VF, compute VFxUF by constant folding
3864 // the multiplication of VF and UF.
3865 if (VF.getNumUsers() == 0) {
3866 VPValue *RuntimeVFxUF =
3867 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
3868 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
3869 return;
3870 }
3871
3872 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
3873 // vscale) * UF.
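  // E.g. for VF = vscale x 4 and UF = 2, RuntimeVF is materialized as
  // 4 * vscale and VFxUF as (4 * vscale) * 2.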
3874 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
3876 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
3877 VF.replaceUsesWithIf(
3878     BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
3879 }
3880 VF.replaceAllUsesWith(RuntimeVF);
3881
3882 VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
3883 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
3884 VFxUF.replaceAllUsesWith(MulByUF);
3885}
3886
3889 const DataLayout &DL = SE.getDataLayout();
3890 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
3891
3892 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
3893 BasicBlock *EntryBB = Entry->getIRBasicBlock();
3894 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
3895 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
3897 continue;
3898 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3899 if (!ExpSCEV)
3900 break;
3901 const SCEV *Expr = ExpSCEV->getSCEV();
3902 Value *Res =
3903 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
3904 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
3905 VPValue *Exp = Plan.getOrAddLiveIn(Res);
3906 ExpSCEV->replaceAllUsesWith(Exp);
3907 if (Plan.getTripCount() == ExpSCEV)
3908 Plan.resetTripCount(Exp);
3909 ExpSCEV->eraseFromParent();
3910 }
3912 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
3913 "after any VPIRInstructions");
3914 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
3915 // to the VPIRBasicBlock.
3916 auto EI = Entry->begin();
3917 for (Instruction &I : drop_end(*EntryBB)) {
3918 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
3919 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
3920 EI++;
3921 continue;
3922 }
3924 }
3925
3926 return ExpandedSCEVs;
3927}
3928
3929/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
3930/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
3931/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
3932/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
3933/// an index-independent load if it feeds all wide ops at all indices (\p OpV
3934 /// must be the operand at index \p OpIdx for both the current recipe and the recipe at lane 0, \p
3935/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
3936/// is defined at \p Idx of a load interleave group.
3937static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
3938 VPValue *OpV, unsigned Idx) {
3939 auto *DefR = OpV->getDefiningRecipe();
3940 if (!DefR)
3941 return WideMember0->getOperand(OpIdx) == OpV;
3942 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
3943 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
3944
3945 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
3946 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
3947 return false;
3948}
3949
3950/// Returns true if \p IR is a full interleave group with factor and number of
3951/// members both equal to \p VF. The interleave group must also access the full
3952/// vector width \p VectorRegWidth.
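/// E.g. for VF = 4 with i32 members, the group must have factor 4 and 4
/// members, and GroupSize = 4 * 32 = 128 bits must equal \p VectorRegWidth.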
3953 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3954                                          unsigned VF, VPTypeAnalysis &TypeInfo,
3955 unsigned VectorRegWidth) {
3956 if (!InterleaveR)
3957 return false;
3958
3959 Type *GroupElementTy = nullptr;
3960 if (InterleaveR->getStoredValues().empty()) {
3961 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
3962 if (!all_of(InterleaveR->definedValues(),
3963 [&TypeInfo, GroupElementTy](VPValue *Op) {
3964 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3965 }))
3966 return false;
3967 } else {
3968 GroupElementTy =
3969 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
3970 if (!all_of(InterleaveR->getStoredValues(),
3971 [&TypeInfo, GroupElementTy](VPValue *Op) {
3972 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3973 }))
3974 return false;
3975 }
3976
3977 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3978 auto IG = InterleaveR->getInterleaveGroup();
3979 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3980 GroupSize == VectorRegWidth;
3981}
3982
3984 /// Returns true if \p VPV is a narrow VPValue.
3984static bool isAlreadyNarrow(VPValue *VPV) {
3985 if (VPV->isLiveIn())
3986 return true;
3987 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
3988 return RepR && RepR->isSingleScalar();
3989}
3990
3991 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3992                                              unsigned VectorRegWidth) {
3993 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3994 if (!VectorLoop)
3995 return;
3996
3997 VPTypeAnalysis TypeInfo(Plan);
3998
3999 unsigned VFMinVal = VF.getKnownMinValue();
4000 SmallVector<VPInterleaveRecipe *> StoreGroups;
4001 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4004 continue;
4005
4008 continue;
4009
4010 // Bail out on recipes not supported at the moment:
4011 // * phi recipes other than the canonical induction
4012 // * recipes writing to memory except interleave groups
4013 // Only support plans with a canonical induction phi.
4014 if (R.isPhi())
4015 return;
4016
4017 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4018 if (R.mayWriteToMemory() && !InterleaveR)
4019 return;
4020
4021 // Do not narrow interleave groups if there are VectorPointer recipes and
4022 // the plan was unrolled. The recipe implicitly uses VF from
4023 // VPTransformState.
4024 // TODO: Remove restriction once the VF for the VectorPointer offset is
4025 // modeled explicitly as operand.
4026 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4027 return;
4028
4029 // All other ops are allowed, but we reject uses that cannot be converted
4030 // when checking all allowed consumers (store interleave groups) below.
4031 if (!InterleaveR)
4032 continue;
4033
4034 // Bail out on non-consecutive interleave groups.
4035 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
4036 VectorRegWidth))
4037 return;
4038
4039 // Skip read interleave groups.
4040 if (InterleaveR->getStoredValues().empty())
4041 continue;
4042
4043 // Narrow interleave groups, if all operands are already matching narrow
4044 // ops.
4045 auto *Member0 = InterleaveR->getStoredValues()[0];
4046 if (isAlreadyNarrow(Member0) &&
4047 all_of(InterleaveR->getStoredValues(),
4048 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4049 StoreGroups.push_back(InterleaveR);
4050 continue;
4051 }
4052
4053 // For now, we only support full interleave groups storing load interleave
4054 // groups.
4055 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4056 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4057 if (!DefR)
4058 return false;
4059 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4060 return IR && IR->getInterleaveGroup()->isFull() &&
4061 IR->getVPValue(Op.index()) == Op.value();
4062 })) {
4063 StoreGroups.push_back(InterleaveR);
4064 continue;
4065 }
4066
4067 // Check if all values feeding InterleaveR are matching wide recipes whose
4068 // operands can be narrowed.
4069 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
4070 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
4071 if (!WideMember0)
4072 return;
4073 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4074 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4075 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4076 R->getNumOperands() > 2)
4077 return;
4078 if (any_of(enumerate(R->operands()),
4079 [WideMember0, Idx = I](const auto &P) {
4080 const auto &[OpIdx, OpV] = P;
4081 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4082 }))
4083 return;
4084 }
4085 StoreGroups.push_back(InterleaveR);
4086 }
4087
4088 if (StoreGroups.empty())
4089 return;
4090
4091 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4092 SmallPtrSet<VPValue *, 4> NarrowedOps;
4093 auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
4094 auto *R = V->getDefiningRecipe();
4095 if (!R || NarrowedOps.contains(V))
4096 return V;
4097 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4098 // Narrow interleave group to wide load, as transformed VPlan will only
4099 // process one original iteration.
4100 auto *L = new VPWidenLoadRecipe(
4101 *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
4102 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4103 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4104 L->insertBefore(LoadGroup);
4105 NarrowedOps.insert(L);
4106 return L;
4107 }
4108
4109 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4110 assert(RepR->isSingleScalar() &&
4111 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4112 "must be a single scalar load");
4113 NarrowedOps.insert(RepR);
4114 return RepR;
4115 }
4116 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4117
4118 VPValue *PtrOp = WideLoad->getAddr();
4119 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4120 PtrOp = VecPtr->getOperand(0);
4121 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4122 // process one original iteration.
4123 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4124 /*IsUniform*/ true,
4125 /*Mask*/ nullptr, *WideLoad);
4126 N->insertBefore(WideLoad);
4127 NarrowedOps.insert(N);
4128 return N;
4129 };
4130
4131 // Narrow operation tree rooted at store groups.
4132 for (auto *StoreGroup : StoreGroups) {
4133 VPValue *Res = nullptr;
4134 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4135 if (isAlreadyNarrow(Member0)) {
4136 Res = Member0;
4137 } else if (auto *WideMember0 =
4139 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4140 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4141 Res = WideMember0;
4142 } else {
4143 Res = NarrowOp(Member0);
4144 }
4145
4146 auto *S = new VPWidenStoreRecipe(
4147 *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
4148 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4149 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4150 S->insertBefore(StoreGroup);
4151 StoreGroup->eraseFromParent();
4152 }
4153
4154 // Adjust induction to reflect that the transformed plan only processes one
4155 // original iteration.
4156 auto *CanIV = Plan.getCanonicalIV();
4157 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4158 VPBuilder PHBuilder(Plan.getVectorPreheader());
4159
4160 VPValue *UF = Plan.getOrAddLiveIn(
4161 ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
4162 if (VF.isScalable()) {
4163 VPValue *VScale = PHBuilder.createElementCount(
4164 CanIV->getScalarType(), ElementCount::getScalable(1));
4165 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4166 Inc->setOperand(1, VScaleUF);
4167 Plan.getVF().replaceAllUsesWith(VScale);
4168 } else {
4169 Inc->setOperand(1, UF);
4170 Plan.getVF().replaceAllUsesWith(
4171     Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
4172 }
4173 removeDeadRecipes(Plan);
4174}
4175
4176/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4177/// BranchOnCond recipe.
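/// E.g. for VF = 4 and UF = 2 the vector step is 8, so the middle-block branch
/// gets weights {1, 7}, assuming TripCount % 8 is uniformly distributed; for
/// scalable VFs the step is additionally scaled by VScaleForTuning.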
4179 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4180 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4181 auto *MiddleTerm =
4182     dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4183 // Only add branch metadata if there is a (conditional) terminator.
4184 if (!MiddleTerm)
4185 return;
4186
4187 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4188 "must have a BranchOnCond");
4189 // Assume that `TripCount % VectorStep ` is equally distributed.
4190 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4191 if (VF.isScalable() && VScaleForTuning.has_value())
4192 VectorStep *= *VScaleForTuning;
4193 assert(VectorStep > 0 && "trip count should not be zero");
4194 MDBuilder MDB(Plan.getContext());
4195 MDNode *BranchWeights =
4196 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4197 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4198}
4199
4200/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4201/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4202/// the end value of the induction.
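/// E.g. (a sketch) for a non-canonical induction with start 5, step 3 and a
/// vector trip count of 8, the created phi is conceptually
/// bc.resume.val = phi [ 5 + 8 * 3, middle.block ], [ 5, bypass blocks ].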
4203 static VPInstruction *addResumePhiRecipeForInduction(
4204     VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4205 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4206 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4207 // Truncated wide inductions resume from the last lane of their vector value
4208 // in the last vector iteration which is handled elsewhere.
4209 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4210 return nullptr;
4211
4212 VPValue *Start = WideIV->getStartValue();
4213 VPValue *Step = WideIV->getStepValue();
4214 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
4215 VPValue *EndValue = VectorTC;
4216 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4217 EndValue = VectorPHBuilder.createDerivedIV(
4218 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4219 Start, VectorTC, Step);
4220 }
4221
4222 // EndValue is derived from the vector trip count (which has the same type as
4223 // the widest induction) and thus may be wider than the induction here.
4224 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4225 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4226 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4227 ScalarTypeOfWideIV,
4228 WideIV->getDebugLoc());
4229 }
4230
4231 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4232 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4233 return ResumePhiRecipe;
4234}
4235
4237 VPlan &Plan, VPRecipeBuilder &Builder,
4238 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4239 VPTypeAnalysis TypeInfo(Plan);
4240 auto *ScalarPH = Plan.getScalarPreheader();
4241 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4242 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4243 VPBuilder VectorPHBuilder(
4244 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4245 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4246 VPBuilder ScalarPHBuilder(ScalarPH);
4247 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4248 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4249
4250 // TODO: Extract final value from induction recipe initially, optimize to
4251 // pre-computed end value together in optimizeInductionExitUsers.
4252 auto *VectorPhiR =
4253 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4254 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4255 if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
4256         WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4257 &Plan.getVectorTripCount())) {
4258 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4259 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4260 ScalarPhiIRI->addOperand(ResumePhi);
4261 continue;
4262 }
4263 // TODO: Also handle truncated inductions here. Computing end-values
4264 // separately should be done as VPlan-to-VPlan optimization, after
4265 // legalizing all resume values to use the last lane from the loop.
4266 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4267 "should only skip truncated wide inductions");
4268 continue;
4269 }
4270
4271 // The backedge value provides the value to resume coming out of a loop,
4272 // which for FORs is a vector whose last element needs to be extracted. The
4273 // start value provides the value if the loop is bypassed.
4274 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4275 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4276 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4277 "Cannot handle loops with uncountable early exits");
4278 if (IsFOR)
4279 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4280 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4281 "vector.recur.extract");
4282 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4283 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4284 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4285 ScalarPhiIRI->addOperand(ResumePhiR);
4286 }
4287}
4288
4289 void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
4290                                                            VFRange &Range) {
4291 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4292 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4293 auto *MiddleVPBB = Plan.getMiddleBlock();
4294 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4295 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4296
4297 auto IsScalableOne = [](ElementCount VF) -> bool {
4298 return VF == ElementCount::getScalable(1);
4299 };
4300
4301 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4302 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4303 if (!FOR)
4304 continue;
4305
4306 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4307 "Cannot handle loops with uncountable early exits");
4308
4309 // This is the second phase of vectorizing first-order recurrences, creating
4310 // extract for users outside the loop. An overview of the transformation is
4311 // described below. Suppose we have the following loop with some use after
4312 // the loop of the last a[i-1],
4313 //
4314 // for (int i = 0; i < n; ++i) {
4315 // t = a[i - 1];
4316 // b[i] = a[i] - t;
4317 // }
4318 // use t;
4319 //
4320 // There is a first-order recurrence on "a". For this loop, the shorthand
4321 // scalar IR looks like:
4322 //
4323 // scalar.ph:
4324 // s.init = a[-1]
4325 // br scalar.body
4326 //
4327 // scalar.body:
4328 // i = phi [0, scalar.ph], [i+1, scalar.body]
4329 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4330 // s2 = a[i]
4331 // b[i] = s2 - s1
4332 // br cond, scalar.body, exit.block
4333 //
4334 // exit.block:
4335 // use = lcssa.phi [s1, scalar.body]
4336 //
4337 // In this example, s1 is a recurrence because it's value depends on the
4338 // previous iteration. In the first phase of vectorization, we created a
4339 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4340 // for users in the scalar preheader and exit block.
4341 //
4342 // vector.ph:
4343 // v_init = vector(..., ..., ..., a[-1])
4344 // br vector.body
4345 //
4346 // vector.body
4347 // i = phi [0, vector.ph], [i+4, vector.body]
4348 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4349 // v2 = a[i, i+1, i+2, i+3]
4350 // b[i] = v2 - v1
4351 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4352 // b[i, i+1, i+2, i+3] = v2 - v1
4353 // br cond, vector.body, middle.block
4354 //
4355 // middle.block:
4356 // vector.recur.extract.for.phi = v2(2)
4357 // vector.recur.extract = v2(3)
4358 // br cond, scalar.ph, exit.block
4359 //
4360 // scalar.ph:
4361 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4362 // [s.init, otherwise]
4363 // br scalar.body
4364 //
4365 // scalar.body:
4366 // i = phi [0, scalar.ph], [i+1, scalar.body]
4367 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4368 // s2 = a[i]
4369 // b[i] = s2 - s1
4370 // br cond, scalar.body, exit.block
4371 //
4372 // exit.block:
4373 // lo = lcssa.phi [s1, scalar.body],
4374 // [vector.recur.extract.for.phi, middle.block]
4375 //
4376 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4377 // Extract the penultimate value of the recurrence and use it as operand for
4378 // the VPIRInstruction modeling the phi.
4379 for (VPUser *U : FOR->users()) {
4380 using namespace llvm::VPlanPatternMatch;
4381 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4382 continue;
4383 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4384 // penultimate value of the recurrence. Instead we rely on the existing
4385 // extract of the last element from the result of
4386 // VPInstruction::FirstOrderRecurrenceSplice.
4387 // TODO: Consider vscale_range info and UF.
4388 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
4389                                                        Range))
4390 return;
4391 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4392 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4393 {}, "vector.recur.extract.for.phi");
4394 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4395 }
4396 }
4397}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:385
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static Value * tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
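A minimal usage sketch of the APInt queries listed above; the function name and constants are illustrative and not part of this file.
#include "llvm/ADT/APInt.h"
using namespace llvm;
// Illustrative only: widen an 8-bit value and inspect its significant bits.
static void apintSketch() {
  APInt A(/*numBits=*/8, /*val=*/42);                  // 0b00101010
  APInt Z = A.zext(32);                                // zero-extend to 32 bits
  APInt S = APInt(8, -2, /*isSigned=*/true).sext(32);  // sign-extend -2
  unsigned Active = A.getActiveBits();                 // 6 for the value 42
  unsigned Width = Z.getBitWidth();                    // 32
  (void)S; (void)Active; (void)Width;
}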
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
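A short sketch of these constant factories; the helper name and the choice of i32 are illustrative.
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;
// Illustrative only: build -1, all-ones and zero constants for i32.
static void constantSketch(LLVMContext &Ctx) {
  IntegerType *I32 = Type::getInt32Ty(Ctx);
  ConstantInt *MinusOne = ConstantInt::getSigned(I32, -1);
  Constant *AllOnes = Constant::getAllOnesValue(I32); // same bits as -1 for i32
  Constant *Zero = Constant::getNullValue(I32);
  (void)MinusOne; (void)AllOnes; (void)Zero;
}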
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
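A sketch of lookup vs. try_emplace (names and values are illustrative): lookup never inserts, while try_emplace inserts only when the key is absent and reports whether it did.
#include "llvm/ADT/DenseMap.h"
using namespace llvm;
// Illustrative only.
static void denseMapSketch() {
  DenseMap<int, unsigned> Counts;
  auto [It, Inserted] = Counts.try_emplace(7, 1u); // inserts {7, 1}
  if (!Inserted)
    ++It->second;                                  // key was already present
  unsigned Missing = Counts.lookup(42);            // 0: default-constructed
  (void)Missing;
}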
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1579
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
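A minimal sketch of attaching branch weights with MDBuilder; the 127:1 ratio and the function name are illustrative choices.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;
// Illustrative only: mark a conditional branch as likely taken.
static void annotateLikelyTaken(BranchInst *BI) {
  MDBuilder MDB(BI->getContext());
  BI->setMetadata(LLVMContext::MD_prof,
                  MDB.createBranchWeights(/*TrueWeight=*/127,
                                          /*FalseWeight=*/1));
}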
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
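A short sketch of SetVector's set-plus-order behavior (illustrative only): duplicates are rejected and iteration follows insertion order, which keeps worklists deterministic.
#include "llvm/ADT/SetVector.h"
using namespace llvm;
// Illustrative only.
static void setVectorSketch() {
  SetVector<int> Worklist;
  Worklist.insert(3);              // true: newly inserted
  Worklist.insert(1);
  bool Again = Worklist.insert(3); // false: already present
  unsigned N = Worklist.size();    // 2; iteration visits 3 then 1
  (void)Again; (void)N;
}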
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
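A sketch of push_back/append on SmallVector; the inline capacity of 4 is an arbitrary illustrative choice.
#include "llvm/ADT/SmallVector.h"
#include <iterator>
using namespace llvm;
// Illustrative only: the first four elements live inline, no heap allocation.
static void smallVectorSketch() {
  SmallVector<int, 4> V;
  V.push_back(1);
  int More[] = {2, 3, 4};
  V.append(std::begin(More), std::end(More)); // V = {1, 2, 3, 4}
}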
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
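A sketch of TypeSwitch replacing a chain of dyn_cast checks; the classifier and its tag strings are illustrative.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Illustrative only: dispatch on the dynamic type of an instruction.
static StringRef classifyInst(Instruction *I) {
  return TypeSwitch<Instruction *, StringRef>(I)
      .Case([](LoadInst *) { return "load"; })
      .Case([](StoreInst *) { return "store"; })
      .Case([](CallInst *) { return "call"; })
      .Default([](Instruction *) { return "other"; });
}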
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3498
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3785
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3860
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3812
iterator end()
Definition VPlan.h:3822
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3820
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3873
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:246
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:619
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:591
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:664
const VPRecipeBase & back() const
Definition VPlan.h:3834
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2403
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2437
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account that, when normalized, the first incoming value wi...
Definition VPlan.h:2427
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2443
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2423
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:82
VPRegionBlock * getParent()
Definition VPlan.h:174
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:323
size_t getNumPredecessors() const
Definition VPlan.h:221
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:205
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:216
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:265
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:210
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:199
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:232
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:253
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:172
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:191
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:210
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2934
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3441
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:3468
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3529
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:2979
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1978
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2026
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2015
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3938
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:3962
Class to record and manage LLVM IR flags.
Definition VPlan.h:601
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode; otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:943
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:984
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1061
@ FirstOrderRecurrenceSplice
Definition VPlan.h:990
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1014
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1011
@ CanonicalIVIncrementForPart
Definition VPlan.h:1004
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2544
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2565
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2617
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition VPlan.h:2576
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3112
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:395
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition VPlan.h:478
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPBasicBlock * getParent()
Definition VPlan.h:416
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:483
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2812
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2666
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:3973
const VPBlockBase * getEntry() const
Definition VPlan.h:4009
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4026
const VPBlockBase * getExiting() const
Definition VPlan.h:4021
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4034
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2856
bool isSingleScalar() const
Definition VPlan.h:2901
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2925
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3675
VPSingleDef is a base class for recipes modeling a sequence of one or more output IR instructions that define ...
Definition VPlan.h:522
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:587
virtual VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
operand_iterator op_end()
Definition VPlanValue.h:265
operand_iterator op_begin()
Definition VPlanValue.h:263
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:186
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1415
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1419
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1842
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1886
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3570
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1483
A recipe for handling GEP instructions.
Definition VPlan.h:1770
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2043
PHINode * getPHINode() const
Definition VPlan.h:2085
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2071
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2088
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2118
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2199
A recipe for widening vector intrinsics.
Definition VPlan.h:1541
A common base class for widening memory operations.
Definition VPlan.h:3154
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3216
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3209
A recipe for widened phis.
Definition VPlan.h:2254
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1440
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4076
bool hasVF(ElementCount VF) const
Definition VPlan.h:4285
LLVMContext & getContext() const
Definition VPlan.h:4273
VPBasicBlock * getEntry()
Definition VPlan.h:4175
VPRegionBlock * createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="", bool IsReplicator=false)
Create a new VPRegionBlock with Entry, Exiting and Name.
Definition VPlan.h:4416
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4265
bool hasScalableVF() const
Definition VPlan.h:4286
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4271
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4268
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4237
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4342
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4258
unsigned getUF() const
Definition VPlan.h:4305
bool hasUF(unsigned UF) const
Definition VPlan.h:4303
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4227
void setVF(ElementCount VF)
Definition VPlan.h:4279
bool isUnrolled() const
Returns true if the VPlan has already been unrolled, i.e.
Definition VPlan.h:4318
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1049
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4251
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4200
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4406
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4348
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4327
bool hasScalarVFOnly() const
Definition VPlan.h:4296
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4218
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4357
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition VPlan.h:4381
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4223
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4354
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4180
void setUF(unsigned UF)
Definition VPlan.h:4310
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4458
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:257
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
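A sketch of the ElementCount queries listed above, using an illustrative scalable vectorization factor of <vscale x 4>.
#include "llvm/Support/TypeSize.h"
using namespace llvm;
// Illustrative only.
static void elementCountSketch() {
  ElementCount VF = ElementCount::getScalable(4);  // <vscale x 4>
  bool Scalable = VF.isScalable();                 // true
  unsigned Min = VF.getKnownMinValue();            // 4
  ElementCount VF2 = VF.multiplyCoefficientBy(2);  // <vscale x 8>
  (void)Scalable; (void)Min; (void)VF2;
}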
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
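A sketch combining several of the IR matchers listed above; the helper name and the matched shape are illustrative, and m_Value (used for binding) comes from the same PatternMatch header.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Illustrative only: does I compute (X * C) + X, with the add and mul
// operands accepted in either order?
static bool isMulOfSameValuePlusSelf(Instruction *I) {
  Value *X = nullptr;
  return match(I, m_c_Add(m_c_Mul(m_Value(X), m_ConstantInt()),
                          m_Deferred(X)));
}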
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount, Op0_t, Op1_t > m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::BranchOnCond, Op0_t > m_BranchOnCond(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
Definition VPlanUtils.h:44
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool isHeaderMask(const VPValue *V, VPlan &Plan)
Return true if V is a header mask in Plan.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:733
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
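A sketch of the early-increment idiom used throughout this file, shown here on IR instructions; the helper itself is illustrative.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Illustrative only: erase trivially dead instructions while iterating.
// The adaptor advances past the current element before the body runs, so
// eraseFromParent does not invalidate the loop.
static void dropTriviallyDead(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (I.use_empty() && !I.mayHaveSideEffects() && !I.isTerminator())
      I.eraseFromParent();
}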
auto cast_or_null(const Y &Val)
Definition Casting.h:715
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:754
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1624
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:325
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1956
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2090
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:831
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
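A sketch of the hashing helpers above; the key shape is illustrative (a CSE-style map might hash an opcode together with its operand pointers).
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
using namespace llvm;
// Illustrative only.
static hash_code hashOpcodeAndOperands(unsigned Opcode,
                                       ArrayRef<const void *> Operands) {
  return hash_combine(Opcode,
                      hash_combine_range(Operands.begin(), Operands.end()));
}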
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2296
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3276
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3236
A recipe for widening select instructions.
Definition VPlan.h:1724
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3358
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3316
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.