1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
43static cl::opt<bool> EnableWideActiveLaneMask(
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlanPtr &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
54 Plan->getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
95 Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 VPIRMetadata(*Store), Ingredient.getDebugLoc());
101 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
102 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
103 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
104 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
105 if (VectorID == Intrinsic::not_intrinsic)
106 return false;
107 NewRecipe = new VPWidenIntrinsicRecipe(
108 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
109 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
110 CI->getDebugLoc());
111 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
112 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
113 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
114 NewRecipe = new VPWidenCastRecipe(
115 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
116 } else {
117 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
118 }
119 }
120
121 NewRecipe->insertBefore(&Ingredient);
122 if (NewRecipe->getNumDefinedValues() == 1)
123 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
124 else
125 assert(NewRecipe->getNumDefinedValues() == 0 &&
126 "Only recipes with zero or one defined values expected");
127 Ingredient.eraseFromParent();
128 }
129 }
130 return true;
131}
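// Illustrative sketch of the conversion above (hypothetical VPlan notation,
// names invented): a plain VPInstruction cloned from the IR, e.g.
//   EMIT ir<%l> = load ir<%gep>
// becomes a widened memory recipe
//   WIDEN ir<%l> = load ir<%gep>
// while calls become VPWidenIntrinsicRecipes (or the whole conversion is
// aborted if no vector intrinsic exists for the call), and any remaining
// instruction falls back to a generic VPWidenRecipe.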
132
133static bool sinkScalarOperands(VPlan &Plan) {
134 auto Iter = vp_depth_first_deep(Plan.getEntry());
135 bool Changed = false;
136 // First, collect the operands of all recipes in replicate blocks as seeds for
137 // sinking.
139 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
140 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
141 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
142 continue;
143 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
144 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
145 continue;
146 for (auto &Recipe : *VPBB) {
147 for (VPValue *Op : Recipe.operands())
148 if (auto *Def =
149 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
150 WorkList.insert({VPBB, Def});
151 }
152 }
153
154 bool ScalarVFOnly = Plan.hasScalarVFOnly();
155 // Try to sink each replicate or scalar IV steps recipe in the worklist.
156 for (unsigned I = 0; I != WorkList.size(); ++I) {
157 VPBasicBlock *SinkTo;
158 VPSingleDefRecipe *SinkCandidate;
159 std::tie(SinkTo, SinkCandidate) = WorkList[I];
160 if (SinkCandidate->getParent() == SinkTo ||
161 SinkCandidate->mayHaveSideEffects() ||
162 SinkCandidate->mayReadOrWriteMemory())
163 continue;
164 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
165 if (!ScalarVFOnly && RepR->isSingleScalar())
166 continue;
167 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
168 continue;
169
170 bool NeedsDuplicating = false;
171 // All recipe users of the sink candidate must be in the same block SinkTo,
172 // or all users outside of SinkTo must be uniform-after-vectorization
173 // (i.e., only the first lane is used). In the latter case, we need to
174 // duplicate SinkCandidate.
175 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
176 SinkCandidate](VPUser *U) {
177 auto *UI = cast<VPRecipeBase>(U);
178 if (UI->getParent() == SinkTo)
179 return true;
180 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
181 // We only know how to duplicate VPReplicateRecipes and
182 // VPScalarIVStepsRecipes for now.
183 return NeedsDuplicating &&
184 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
185 };
186 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
187 continue;
188
189 if (NeedsDuplicating) {
190 if (ScalarVFOnly)
191 continue;
192 VPSingleDefRecipe *Clone;
193 if (auto *SinkCandidateRepR =
194 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
195 // TODO: Handle converting to uniform recipes as separate transform,
196 // then cloning should be sufficient here.
197 Instruction *I = SinkCandidate->getUnderlyingInstr();
198 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
199 nullptr /*Mask*/, *SinkCandidateRepR);
200 // TODO: add ".cloned" suffix to name of Clone's VPValue.
201 } else {
202 Clone = SinkCandidate->clone();
203 }
204
205 Clone->insertBefore(SinkCandidate);
206 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
207 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
208 });
209 }
210 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
211 for (VPValue *Op : SinkCandidate->operands())
212 if (auto *Def =
213 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
214 WorkList.insert({SinkTo, Def});
215 Changed = true;
216 }
217 return Changed;
218}
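// Illustrative sketch (hypothetical layout): if a VPScalarIVStepsRecipe is
// defined before a replicate region but only used inside the region's "then"
// block, sinkScalarOperands moves it into that block. If it also has users
// outside the region that only demand the first lane, a clone is left at the
// original position, those outside users are redirected to the clone, and the
// original recipe is sunk.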
219
220/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
221/// the mask.
222static VPValue *getPredicatedMask(VPRegionBlock *R) {
223 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
224 if (!EntryBB || EntryBB->size() != 1 ||
225 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
226 return nullptr;
227
228 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
229}
230
231/// If \p R is a triangle region, return the 'then' block of the triangle.
232static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
233 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
234 if (EntryBB->getNumSuccessors() != 2)
235 return nullptr;
236
237 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
238 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
239 if (!Succ0 || !Succ1)
240 return nullptr;
241
242 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
243 return nullptr;
244 if (Succ0->getSingleSuccessor() == Succ1)
245 return Succ0;
246 if (Succ1->getSingleSuccessor() == Succ0)
247 return Succ1;
248 return nullptr;
249}
250
251// Merge replicate regions into their successor regions, if a replicate region
252// is connected to a successor replicate region with the same predicate by a
253// single, empty VPBasicBlock.
254static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
255 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
256
257 // Collect replicate regions followed by an empty block, followed by another
258 // replicate region with a matching mask, to process up front. This is to
259 // avoid iterator invalidation issues while merging regions.
261 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
262 vp_depth_first_deep(Plan.getEntry()))) {
263 if (!Region1->isReplicator())
264 continue;
265 auto *MiddleBasicBlock =
266 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
267 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
268 continue;
269
270 auto *Region2 =
271 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
272 if (!Region2 || !Region2->isReplicator())
273 continue;
274
275 VPValue *Mask1 = getPredicatedMask(Region1);
276 VPValue *Mask2 = getPredicatedMask(Region2);
277 if (!Mask1 || Mask1 != Mask2)
278 continue;
279
280 assert(Mask1 && Mask2 && "both regions must have conditions");
281 WorkList.push_back(Region1);
282 }
283
284 // Move recipes from Region1 to its successor region, if both are triangles.
285 for (VPRegionBlock *Region1 : WorkList) {
286 if (TransformedRegions.contains(Region1))
287 continue;
288 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
289 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
290
291 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
292 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
293 if (!Then1 || !Then2)
294 continue;
295
296 // Note: No fusion-preventing memory dependencies are expected in either
297 // region. Such dependencies should be rejected during earlier dependence
298 // checks, which guarantee accesses can be re-ordered for vectorization.
299 //
300 // Move recipes to the successor region.
301 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
302 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
303
304 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
305 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
306
307 // Move VPPredInstPHIRecipes from the merge block to the successor region's
308 // merge block. Update all users inside the successor region to use the
309 // original values.
310 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
311 VPValue *PredInst1 =
312 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
313 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
314 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
315 return cast<VPRecipeBase>(&U)->getParent() == Then2;
316 });
317
318 // Remove phi recipes that are unused after merging the regions.
319 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
320 Phi1ToMove.eraseFromParent();
321 continue;
322 }
323 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
324 }
325
326 // Remove the dead recipes in Region1's entry block.
327 for (VPRecipeBase &R :
328 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
329 R.eraseFromParent();
330
331 // Finally, remove the first region.
332 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
333 VPBlockUtils::disconnectBlocks(Pred, Region1);
334 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
335 }
336 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
337 TransformedRegions.insert(Region1);
338 }
339
340 return !TransformedRegions.empty();
341}
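// Illustrative sketch (hypothetical regions): two triangle regions pred.load
// and pred.store guarded by the same mask and separated by an empty block are
// combined by moving the recipes of pred.load's "then" and merge blocks into
// pred.store, so only one BRANCH-ON-MASK remains for the shared predicate.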
342
344 VPlan &Plan) {
345 Instruction *Instr = PredRecipe->getUnderlyingInstr();
346 // Build the triangular if-then region.
347 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
348 assert(Instr->getParent() && "Predicated instruction not in any basic block");
349 auto *BlockInMask = PredRecipe->getMask();
350 auto *MaskDef = BlockInMask->getDefiningRecipe();
351 auto *BOMRecipe = new VPBranchOnMaskRecipe(
352 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
353 auto *Entry =
354 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
355
356 // Replace predicated replicate recipe with a replicate recipe without a
357 // mask but in the replicate region.
358 auto *RecipeWithoutMask = new VPReplicateRecipe(
359 PredRecipe->getUnderlyingInstr(),
360 make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
361 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
362 auto *Pred =
363 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
364
365 VPPredInstPHIRecipe *PHIRecipe = nullptr;
366 if (PredRecipe->getNumUsers() != 0) {
367 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
368 RecipeWithoutMask->getDebugLoc());
369 PredRecipe->replaceAllUsesWith(PHIRecipe);
370 PHIRecipe->setOperand(0, RecipeWithoutMask);
371 }
372 PredRecipe->eraseFromParent();
373 auto *Exiting =
374 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
376 Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
377
378 // Note: first set Entry as region entry and then connect successors starting
379 // from it in order, to propagate the "parent" of each VPBasicBlock.
380 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
381 VPBlockUtils::connectBlocks(Pred, Exiting);
382
383 return Region;
384}
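// Illustrative sketch of the replicate region built above (hypothetical
// names and printing): for a predicated sdiv the resulting triangle is
//   pred.sdiv.entry:    BRANCH-ON-MASK vp<%mask>
//   pred.sdiv.if:       REPLICATE ir<%d> = sdiv ir<%a>, ir<%b>
//   pred.sdiv.continue: PHI-PREDICATED-INSTRUCTION vp<%phi> = ir<%d>
// where the continue block only gets a phi recipe if the original predicated
// recipe had users.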
385
386static void addReplicateRegions(VPlan &Plan) {
388 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
389 vp_depth_first_deep(Plan.getEntry()))) {
390 for (VPRecipeBase &R : *VPBB)
391 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
392 if (RepR->isPredicated())
393 WorkList.push_back(RepR);
394 }
395 }
396
397 unsigned BBNum = 0;
398 for (VPReplicateRecipe *RepR : WorkList) {
399 VPBasicBlock *CurrentBlock = RepR->getParent();
400 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
401
402 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
404 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
405 // Record predicated instructions for above packing optimizations.
407 Region->setParent(CurrentBlock->getParent());
409
410 VPRegionBlock *ParentRegion = Region->getParent();
411 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
412 ParentRegion->setExiting(SplitBlock);
413 }
414}
415
416/// Remove redundant VPBasicBlocks by merging them into their predecessor if
417/// the predecessor has a single successor.
420 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
421 vp_depth_first_deep(Plan.getEntry()))) {
422 // Don't fold the blocks in the skeleton of the Plan into their single
423 // predecessors for now.
424 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
425 if (!VPBB->getParent())
426 continue;
427 auto *PredVPBB =
428 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
429 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
430 isa<VPIRBasicBlock>(PredVPBB))
431 continue;
432 WorkList.push_back(VPBB);
433 }
434
435 for (VPBasicBlock *VPBB : WorkList) {
436 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
437 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
438 R.moveBefore(*PredVPBB, PredVPBB->end());
439 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
440 auto *ParentRegion = VPBB->getParent();
441 if (ParentRegion && ParentRegion->getExiting() == VPBB)
442 ParentRegion->setExiting(PredVPBB);
443 for (auto *Succ : to_vector(VPBB->successors())) {
445 VPBlockUtils::connectBlocks(PredVPBB, Succ);
446 }
447 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
448 }
449 return !WorkList.empty();
450}
451
453 // Convert masked VPReplicateRecipes to if-then region blocks.
455
456 bool ShouldSimplify = true;
457 while (ShouldSimplify) {
458 ShouldSimplify = sinkScalarOperands(Plan);
459 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
460 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
461 }
462}
463
464/// Remove redundant casts of inductions.
465///
466/// Such redundant casts are casts of induction variables that can be ignored,
467/// because we already proved that the casted phi is equal to the uncasted phi
468/// in the vectorized loop. There is no need to vectorize the cast - the same
469/// value can be used for both the phi and casts in the vector loop.
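/// For illustration (hypothetical chain): if the descriptor recorded
///   %casted = sext %iv
/// for the widened IV %iv, the recipe wrapping %casted is found by walking the
/// chain from %iv, and all of its users are redirected to the widened IV; the
/// now-dead cast recipe is cleaned up later by dead-recipe removal.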
471 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
472 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
473 if (!IV || IV->getTruncInst())
474 continue;
475
476 // A sequence of IR Casts has potentially been recorded for IV, which
477 // *must be bypassed* when the IV is vectorized, because the vectorized IV
478 // will produce the desired casted value. This sequence forms a def-use
479 // chain and is provided in reverse order, ending with the cast that uses
480 // the IV phi. Search for the recipe of the last cast in the chain and
481 // replace it with the original IV. Note that only the final cast is
482 // expected to have users outside the cast-chain and the dead casts left
483 // over will be cleaned up later.
484 auto &Casts = IV->getInductionDescriptor().getCastInsts();
485 VPValue *FindMyCast = IV;
486 for (Instruction *IRCast : reverse(Casts)) {
487 VPSingleDefRecipe *FoundUserCast = nullptr;
488 for (auto *U : FindMyCast->users()) {
489 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
490 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
491 FoundUserCast = UserCast;
492 break;
493 }
494 }
495 FindMyCast = FoundUserCast;
496 }
497 FindMyCast->replaceAllUsesWith(IV);
498 }
499}
500
501/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
502/// recipe, if it exists.
504 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
505 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
506 for (VPUser *U : CanonicalIV->users()) {
507 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
508 if (WidenNewIV)
509 break;
510 }
511
512 if (!WidenNewIV)
513 return;
514
516 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
517 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
518
519 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
520 continue;
521
522 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
523 // everything WidenNewIV's users need. That is, WidenOriginalIV will
524 // generate a vector phi or all users of WidenNewIV demand the first lane
525 // only.
526 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
527 vputils::onlyFirstLaneUsed(WidenNewIV)) {
528 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
529 WidenNewIV->eraseFromParent();
530 return;
531 }
532 }
533}
534
535/// Returns true if \p R is dead and can be removed.
536static bool isDeadRecipe(VPRecipeBase &R) {
537 // Do remove conditional assume instructions as their conditions may be
538 // flattened.
539 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
540 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
541 match(RepR, m_Intrinsic<Intrinsic::assume>());
542 if (IsConditionalAssume)
543 return true;
544
545 if (R.mayHaveSideEffects())
546 return false;
547
548 // Recipe is dead if no user keeps the recipe alive.
549 return all_of(R.definedValues(),
550 [](VPValue *V) { return V->getNumUsers() == 0; });
551}
552
554 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
555 vp_post_order_deep(Plan.getEntry()))) {
556 // The recipes in the block are processed in reverse order, to catch chains
557 // of dead recipes.
558 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
559 if (isDeadRecipe(R)) {
560 R.eraseFromParent();
561 continue;
562 }
563
564 // Check if R is a dead VPPhi <-> update cycle and remove it.
565 auto *PhiR = dyn_cast<VPPhi>(&R);
566 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
567 continue;
568 VPValue *Incoming = PhiR->getOperand(1);
569 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
570 Incoming->getNumUsers() != 1)
571 continue;
572 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
573 PhiR->eraseFromParent();
574 Incoming->getDefiningRecipe()->eraseFromParent();
575 }
576 }
577}
578
581 Instruction::BinaryOps InductionOpcode,
582 FPMathOperator *FPBinOp, Instruction *TruncI,
583 VPValue *StartV, VPValue *Step, DebugLoc DL,
584 VPBuilder &Builder) {
586 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
587 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
588 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
589
590 // Truncate base induction if needed.
591 VPTypeAnalysis TypeInfo(Plan);
592 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
593 if (TruncI) {
594 Type *TruncTy = TruncI->getType();
595 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
596 "Not truncating.");
597 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
598 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
599 ResultTy = TruncTy;
600 }
601
602 // Truncate step if needed.
603 Type *StepTy = TypeInfo.inferScalarType(Step);
604 if (ResultTy != StepTy) {
605 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
606 "Not truncating.");
607 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
608 auto *VecPreheader =
609 cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
610 VPBuilder::InsertPointGuard Guard(Builder);
611 Builder.setInsertPoint(VecPreheader);
612 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
613 }
614 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
615 &Plan.getVF(), DL);
616}
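// Illustrative sketch (invented values and notation): for an induction
// starting at %start with step 4, the helper above conceptually emits
//   vp<%offset.idx> = DERIVED-IV %start + vp<%canonical-iv> * 4
//   vp<%steps>      = SCALAR-STEPS vp<%offset.idx>, 4, vp<%vf>
// inserting scalar truncates of the base IV and of the step when their types
// are wider than the requested result type.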
617
620 for (unsigned I = 0; I != Users.size(); ++I) {
621 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
622 if (isa<VPHeaderPHIRecipe>(Cur))
623 continue;
624 for (VPValue *V : Cur->definedValues())
625 Users.insert_range(V->users());
626 }
627 return Users.takeVector();
628}
629
630/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
631/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
632/// VPWidenPointerInductionRecipe will generate vectors only. If some users
633/// require vectors while others require scalars, the scalar uses need to extract
634/// the scalars from the generated vectors (Note that this is different to how
635/// int/fp inductions are handled). Legalize extract-from-ends using uniform
636/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
637/// the correct end value is available. Also optimize
638/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
639/// providing them scalar steps built on the canonical scalar IV and update the
640/// original IV's users. This is an optional optimization to reduce the need for
641/// vector extracts.
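/// For illustration (hypothetical recipes): a pointer induction %ptr.iv with
/// start %p and step 8 whose results are only used as scalars is rewritten to
///   vp<%steps>    = SCALAR-STEPS 0, 8
///   vp<%next.gep> = ptradd %p, vp<%steps>
/// and users of %ptr.iv are redirected to vp<%next.gep>.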
644 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
645 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
646 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
647 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
648 if (!PhiR)
649 continue;
650
651 // Try to narrow wide and replicating recipes to uniform recipes, based on
652 // VPlan analysis.
653 // TODO: Apply to all recipes in the future, to replace legacy uniformity
654 // analysis.
655 auto Users = collectUsersRecursively(PhiR);
656 for (VPUser *U : reverse(Users)) {
657 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
658 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
659 // Skip recipes that shouldn't be narrowed.
660 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
661 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
662 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
663 continue;
664
665 // Skip recipes that may have other lanes than their first used.
667 continue;
668
669 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
670 Def->operands(), /*IsUniform*/ true);
671 Clone->insertAfter(Def);
672 Def->replaceAllUsesWith(Clone);
673 }
674
675 // Replace wide pointer inductions which have only their scalars used by
676 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
677 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
678 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
679 continue;
680
681 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
682 VPValue *StartV =
683 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
684 VPValue *StepV = PtrIV->getOperand(1);
686 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
687 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
688
689 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
690 PtrIV->getDebugLoc(), "next.gep");
691
692 PtrIV->replaceAllUsesWith(PtrAdd);
693 continue;
694 }
695
696 // Replace widened induction with scalar steps for users that only use
697 // scalars.
698 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
699 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
700 return U->usesScalars(WideIV);
701 }))
702 continue;
703
704 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
706 Plan, ID.getKind(), ID.getInductionOpcode(),
707 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
708 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
709 WideIV->getDebugLoc(), Builder);
710
711 // Update scalar users of IV to use Step instead.
712 if (!HasOnlyVectorVFs)
713 WideIV->replaceAllUsesWith(Steps);
714 else
715 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
716 return U.usesScalars(WideIV);
717 });
718 }
719}
720
721/// Check if \p VPV is an untruncated wide induction, either before or after the
722/// increment. If so return the header IV (before the increment), otherwise
723/// return null.
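/// For illustration (hypothetical recipes): both the header phi
///   %iv      = WIDEN-INDUCTION %start, %step
/// and its increment %iv.next = add %iv, %step are considered optimizable and
/// yield %iv, whereas a truncated integer induction returns nullptr.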
724static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
725 ScalarEvolution &SE) {
726 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
727 if (WideIV) {
728 // VPV itself is a wide induction, separately compute the end value for exit
729 // users if it is not a truncated IV.
730 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
731 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
732 }
733
734 // Check if VPV is an optimizable induction increment.
735 VPRecipeBase *Def = VPV->getDefiningRecipe();
736 if (!Def || Def->getNumOperands() != 2)
737 return nullptr;
738 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
739 if (!WideIV)
740 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
741 if (!WideIV)
742 return nullptr;
743
744 auto IsWideIVInc = [&]() {
745 auto &ID = WideIV->getInductionDescriptor();
746
747 // Check if VPV increments the induction by the induction step.
748 VPValue *IVStep = WideIV->getStepValue();
749 switch (ID.getInductionOpcode()) {
750 case Instruction::Add:
751 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
752 case Instruction::FAdd:
753 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
754 m_Specific(IVStep)));
755 case Instruction::FSub:
756 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
757 m_Specific(IVStep)));
758 case Instruction::Sub: {
759 // IVStep will be the negated step of the subtraction. Check if
760 // Step == -1 * IVStep.
761 VPValue *Step;
762 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
763 return false;
764 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
765 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
766 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
767 !isa<SCEVCouldNotCompute>(StepSCEV) &&
768 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
769 }
770 default:
771 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
772 match(VPV, m_GetElementPtr(m_Specific(WideIV),
773 m_Specific(WideIV->getStepValue())));
774 }
775 llvm_unreachable("should have been covered by switch above");
776 };
777 return IsWideIVInc() ? WideIV : nullptr;
778}
779
780/// Attempts to optimize the induction variable exit values for users in the
781/// early exit block.
782static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
783 VPTypeAnalysis &TypeInfo,
784 VPBlockBase *PredVPBB,
785 VPValue *Op,
786 ScalarEvolution &SE) {
787 VPValue *Incoming, *Mask;
788 if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
789 m_VPInstruction<VPInstruction::FirstActiveLane>(
790 m_VPValue(Mask)),
792 return nullptr;
793
794 auto *WideIV = getOptimizableIVOf(Incoming, SE);
795 if (!WideIV)
796 return nullptr;
797
798 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
799 if (WideIntOrFp && WideIntOrFp->getTruncInst())
800 return nullptr;
801
802 // Calculate the final index.
803 VPValue *EndValue = Plan.getCanonicalIV();
804 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
805 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
806
807 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
808 VPValue *FirstActiveLane =
809 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
810 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
811 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
812 FirstActiveLaneType, DL);
813 EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
814
815 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
816 // Incoming is not WideIV itself, the exit is using the incremented value,
817 // and we need to add the step.
818 if (Incoming != WideIV) {
819 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
820 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
821 }
822
823 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
824 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
825 VPValue *Start = WideIV->getStartValue();
826 VPValue *Step = WideIV->getStepValue();
827 EndValue = B.createDerivedIV(
828 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
829 Start, EndValue, Step);
830 }
831
832 return EndValue;
833}
834
835/// Attempts to optimize the induction variable exit values for users in the
836/// exit block coming from the latch in the original scalar loop.
838 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
842 return nullptr;
843
844 auto *WideIV = getOptimizableIVOf(Incoming, SE);
845 if (!WideIV)
846 return nullptr;
847
848 VPValue *EndValue = EndValues.lookup(WideIV);
849 assert(EndValue && "end value must have been pre-computed");
850
851 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
852 // Incoming is not WideIV itself, the exit is using the incremented value,
853 // and we don't need to subtract the step.
854 if (Incoming != WideIV)
855 return EndValue;
856
857 // Otherwise, subtract the step from the EndValue.
858 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
859 VPValue *Step = WideIV->getStepValue();
860 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
861 if (ScalarTy->isIntegerTy())
862 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
863 if (ScalarTy->isPointerTy()) {
864 Type *StepTy = TypeInfo.inferScalarType(Step);
865 auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
866 return B.createPtrAdd(EndValue,
867 B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
868 "ind.escape");
869 }
870 if (ScalarTy->isFloatingPointTy()) {
871 const auto &ID = WideIV->getInductionDescriptor();
872 return B.createNaryOp(
873 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
874 ? Instruction::FSub
875 : Instruction::FAdd,
876 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
877 }
878 llvm_unreachable("all possible induction types must be handled");
879 return nullptr;
880}
881
883 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
884 ScalarEvolution &SE) {
885 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
886 VPTypeAnalysis TypeInfo(Plan);
887 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
888 for (VPRecipeBase &R : ExitVPBB->phis()) {
889 auto *ExitIRI = cast<VPIRPhi>(&R);
890
891 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
892 VPValue *Escape = nullptr;
893 if (PredVPBB == MiddleVPBB)
894 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
895 ExitIRI->getOperand(Idx),
896 EndValues, SE);
897 else
898 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
899 ExitIRI->getOperand(Idx), SE);
900 if (Escape)
901 ExitIRI->setOperand(Idx, Escape);
902 }
903 }
904 }
905}
906
907/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
908/// them with already existing recipes expanding the same SCEV expression.
911
912 for (VPRecipeBase &R :
914 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
915 if (!ExpR)
916 continue;
917
918 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
919 if (Inserted)
920 continue;
921 ExpR->replaceAllUsesWith(V->second);
922 ExpR->eraseFromParent();
923 }
924}
925
927 SmallVector<VPValue *> WorkList;
929 WorkList.push_back(V);
930
931 while (!WorkList.empty()) {
932 VPValue *Cur = WorkList.pop_back_val();
933 if (!Seen.insert(Cur).second)
934 continue;
936 if (!R)
937 continue;
938 if (!isDeadRecipe(*R))
939 continue;
940 WorkList.append(R->op_begin(), R->op_end());
941 R->eraseFromParent();
942 }
943}
944
945/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
946/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
947/// foldable live-ins.
948static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
950 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
952 for (VPValue *Op : Operands) {
953 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
954 return nullptr;
955 Ops.push_back(Op->getLiveInIRValue());
956 }
957
958 InstSimplifyFolder Folder(DL);
959 if (Instruction::isBinaryOp(Opcode))
960 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
961 Ops[1]);
962 if (Instruction::isCast(Opcode))
963 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
964 TypeInfo.inferScalarType(R.getVPSingleValue()));
965 switch (Opcode) {
967 return Folder.FoldSelect(Ops[0], Ops[1],
970 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
972 case Instruction::Select:
973 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
974 case Instruction::ICmp:
975 case Instruction::FCmp:
976 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
977 Ops[1]);
978 case Instruction::GetElementPtr: {
979 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
980 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
981 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
982 RFlags.getGEPNoWrapFlags());
983 }
986 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
987 Ops[1],
988 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
989 // An extract of a live-in is an extract of a broadcast, so return the
990 // broadcasted element.
991 case Instruction::ExtractElement:
992 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
993 return Ops[0];
994 }
995 return nullptr;
996}
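// Illustrative sketch (invented live-ins): for a recipe computing an add of
// two live-in constants 2 and 3, the folder above returns the IR constant 5;
// simplifyRecipe then replaces all uses of the recipe's result with a live-in
// wrapping that constant.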
997
998/// Try to simplify recipe \p R.
999static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
1000 VPlan *Plan = R.getParent()->getPlan();
1001
1002 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1003 if (!Def)
1004 return;
1005
1006 // Simplification of live-in IR values for SingleDef recipes using
1007 // InstSimplifyFolder.
1011 const DataLayout &DL =
1012 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1013 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1014 TypeInfo);
1015 if (V)
1016 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1017 return V;
1018 })
1019 .Default([](auto *) { return false; }))
1020 return;
1021
1022 // Fold PredPHI LiveIn -> LiveIn.
1023 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1024 VPValue *Op = PredPHI->getOperand(0);
1025 if (Op->isLiveIn())
1026 PredPHI->replaceAllUsesWith(Op);
1027 }
1028
1029 VPValue *A;
1030 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1031 Type *TruncTy = TypeInfo.inferScalarType(Def);
1032 Type *ATy = TypeInfo.inferScalarType(A);
1033 if (TruncTy == ATy) {
1034 Def->replaceAllUsesWith(A);
1035 } else {
1036 // Don't replace a scalarizing recipe with a widened cast.
1037 if (isa<VPReplicateRecipe>(Def))
1038 return;
1039 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1040
1041 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1042 ? Instruction::SExt
1043 : Instruction::ZExt;
1044 auto *VPC =
1045 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1046 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1047 // UnderlyingExt has distinct return type, used to retain legacy cost.
1048 VPC->setUnderlyingValue(UnderlyingExt);
1049 }
1050 VPC->insertBefore(&R);
1051 Def->replaceAllUsesWith(VPC);
1052 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1053 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1054 VPC->insertBefore(&R);
1055 Def->replaceAllUsesWith(VPC);
1056 }
1057 }
1058#ifndef NDEBUG
1059 // Verify that the cached type info for both A and its users is still
1060 // accurate by comparing it to freshly computed types.
1061 VPTypeAnalysis TypeInfo2(*Plan);
1062 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1063 for (VPUser *U : A->users()) {
1064 auto *R = cast<VPRecipeBase>(U);
1065 for (VPValue *VPV : R->definedValues())
1066 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1067 }
1068#endif
1069 }
1070
1071 // Simplify (X && Y) || (X && !Y) -> X.
1072 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1073 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1074 // recipes to be visited during simplification.
1075 VPValue *X, *Y, *Z;
1076 if (match(Def,
1079 Def->replaceAllUsesWith(X);
1080 Def->eraseFromParent();
1081 return;
1082 }
1083
1084 // OR x, 1 -> 1.
1085 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1086 Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
1087 : Def->getOperand(0));
1088 Def->eraseFromParent();
1089 return;
1090 }
1091
1092 // AND x, 0 -> 0
1093 if (match(&R, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1094 Def->replaceAllUsesWith(R.getOperand(0) == X ? R.getOperand(1)
1095 : R.getOperand(0));
1096 return;
1097 }
1098
1099 // (x && y) || (x && z) -> x && (y || z)
1100 VPBuilder Builder(Def);
1103 // Simplify only if one of the operands has one use to avoid creating an
1104 // extra recipe.
1105 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1106 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1107 return Def->replaceAllUsesWith(
1108 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1109
1110 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1111 return Def->replaceAllUsesWith(X);
1112
1113 // select !c, x, y -> select c, y, x
1114 VPValue *C;
1115 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1116 Def->setOperand(0, C);
1117 Def->setOperand(1, Y);
1118 Def->setOperand(2, X);
1119 return;
1120 }
1121
1122 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1123 // tail folding it is likely that x is a header mask and can be simplified
1124 // further.
1126 m_VPValue(Z))) &&
1127 X->hasMoreThanOneUniqueUser())
1128 return Def->replaceAllUsesWith(
1129 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1130
1131 if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
1132 return Def->replaceAllUsesWith(A);
1133
1134 if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0))))
1135 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1136 : R.getOperand(0));
1137
1138 if (match(Def, m_Not(m_VPValue(A)))) {
1139 if (match(A, m_Not(m_VPValue(A))))
1140 return Def->replaceAllUsesWith(A);
1141
1142 // Try to fold Not into compares by adjusting the predicate in-place.
1143 CmpPredicate Pred;
1144 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1145 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1146 if (all_of(Cmp->users(), [&Cmp](VPUser *U) {
1147 return match(U, m_CombineOr(m_Not(m_Specific(Cmp)),
1148 m_Select(m_Specific(Cmp), m_VPValue(),
1149 m_VPValue())));
1150 })) {
1151 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1152 for (VPUser *U : to_vector(Cmp->users())) {
1153 auto *R = cast<VPSingleDefRecipe>(U);
1154 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1155 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1156 R->setOperand(1, Y);
1157 R->setOperand(2, X);
1158 } else {
1159 // not (cmp pred) -> cmp inv_pred
1160 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1161 R->replaceAllUsesWith(Cmp);
1162 }
1163 }
1164 // If Cmp doesn't have a debug location, use the one from the negation,
1165 // to preserve the location.
1166 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1167 Cmp->setDebugLoc(R.getDebugLoc());
1168 }
1169 }
1170 }
1171
1172 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1173 if ((match(Def,
1175 match(Def,
1177 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1178 TypeInfo.inferScalarType(Def))
1179 return Def->replaceAllUsesWith(Def->getOperand(1));
1180
1181 if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(
1182 m_VPValue(X), m_SpecificInt(1)))) {
1183 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1184 if (TypeInfo.inferScalarType(X) != WideStepTy)
1185 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1186 Def->replaceAllUsesWith(X);
1187 return;
1188 }
1189
1190 // For i1 vp.merges produced by AnyOf reductions:
1191 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1192 if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
1193 m_VPValue(X), m_VPValue())) &&
1195 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1196 Def->setOperand(1, Def->getOperand(0));
1197 Def->setOperand(0, Y);
1198 return;
1199 }
1200
1201 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1202 if (Phi->getOperand(0) == Phi->getOperand(1))
1203 Def->replaceAllUsesWith(Phi->getOperand(0));
1204 return;
1205 }
1206
1207 // Look through ExtractLastElement (BuildVector ....).
1209 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1210 Def->replaceAllUsesWith(
1211 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1212 return;
1213 }
1214
1215 // Look through ExtractPenultimateElement (BuildVector ....).
1216 if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1217 m_BuildVector()))) {
1218 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1219 Def->replaceAllUsesWith(
1220 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1221 return;
1222 }
1223
1224 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1225 if (Phi->getNumOperands() == 1)
1226 Phi->replaceAllUsesWith(Phi->getOperand(0));
1227 return;
1228 }
1229
1230 // Some simplifications can only be applied after unrolling. Perform them
1231 // below.
1232 if (!Plan->isUnrolled())
1233 return;
1234
1235 // VPVectorPointer for part 0 can be replaced by their start pointer.
1236 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1237 if (VecPtr->isFirstPart()) {
1238 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1239 return;
1240 }
1241 }
1242
1243 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1244 // the first lane is demanded.
1245 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1246 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1247 Steps->replaceAllUsesWith(Steps->getOperand(0));
1248 return;
1249 }
1250 }
1251 // Simplify redundant ReductionStartVector recipes after unrolling.
1252 VPValue *StartV;
1253 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1254 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1255 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1256 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1257 return PhiR && PhiR->isInLoop();
1258 });
1259 return;
1260 }
1261
1263 Def->replaceAllUsesWith(A);
1264 return;
1265 }
1266
1267 VPInstruction *OpVPI;
1268 if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) &&
1269 OpVPI->isVectorToScalar()) {
1270 Def->replaceAllUsesWith(OpVPI);
1271 return;
1272 }
1273}
1274
1277 Plan.getEntry());
1278 VPTypeAnalysis TypeInfo(Plan);
1279 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1280 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1281 simplifyRecipe(R, TypeInfo);
1282 }
1283 }
1284}
1285
1287 if (Plan.hasScalarVFOnly())
1288 return;
1289
1290 // Try to narrow wide and replicating recipes to single scalar recipes,
1291 // based on VPlan analysis. Only process blocks in the loop region for now,
1292 // without traversing into nested regions, as recipes in replicate regions
1293 // cannot be converted yet.
1294 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1296 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1297 if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
1298 continue;
1299 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1300 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1301 continue;
1302
1303 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1304 // Skip recipes that aren't single scalars or don't have only their
1305 // scalar results used. In the latter case, we would introduce extra
1306 // broadcasts.
1307 if (!vputils::isSingleScalar(RepOrWidenR) ||
1308 !vputils::onlyScalarValuesUsed(RepOrWidenR))
1309 continue;
1310
1311 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1312 RepOrWidenR->operands(),
1313 true /*IsSingleScalar*/);
1314 Clone->insertBefore(RepOrWidenR);
1315 RepOrWidenR->replaceAllUsesWith(Clone);
1316 }
1317 }
1318}
1319
1320/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1321/// to make sure the masks are simplified.
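/// For illustration (hypothetical blend): BLEND %a, %b/%m, %c/false first
/// drops the incoming value %c whose mask is known false; if only one unique
/// value remains, the blend is replaced by it, otherwise it is rebuilt in
/// normalized form with the chosen start value first and the remaining values
/// blended in under their masks.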
1322static void simplifyBlends(VPlan &Plan) {
1323 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1325 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1326 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1327 if (!Blend)
1328 continue;
1329
1330 // Try to remove redundant blend recipes.
1331 SmallPtrSet<VPValue *, 4> UniqueValues;
1332 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1333 UniqueValues.insert(Blend->getIncomingValue(0));
1334 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1335 if (!match(Blend->getMask(I), m_False()))
1336 UniqueValues.insert(Blend->getIncomingValue(I));
1337
1338 if (UniqueValues.size() == 1) {
1339 Blend->replaceAllUsesWith(*UniqueValues.begin());
1340 Blend->eraseFromParent();
1341 continue;
1342 }
1343
1344 if (Blend->isNormalized())
1345 continue;
1346
1347 // Normalize the blend so its first incoming value is used as the initial
1348 // value with the others blended into it.
1349
1350 unsigned StartIndex = 0;
1351 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1352 // If a value's mask is used only by the blend, it can be removed as dead code.
1353 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1354 // that's used by multiple blends where it can be removed from them all.
1355 VPValue *Mask = Blend->getMask(I);
1356 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1357 StartIndex = I;
1358 break;
1359 }
1360 }
1361
1362 SmallVector<VPValue *, 4> OperandsWithMask;
1363 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1364
1365 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1366 if (I == StartIndex)
1367 continue;
1368 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1369 OperandsWithMask.push_back(Blend->getMask(I));
1370 }
1371
1372 auto *NewBlend =
1373 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1374 OperandsWithMask, Blend->getDebugLoc());
1375 NewBlend->insertBefore(&R);
1376
1377 VPValue *DeadMask = Blend->getMask(StartIndex);
1378 Blend->replaceAllUsesWith(NewBlend);
1379 Blend->eraseFromParent();
1381
1382 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1383 VPValue *NewMask;
1384 if (NewBlend->getNumOperands() == 3 &&
1385 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1386 VPValue *Inc0 = NewBlend->getOperand(0);
1387 VPValue *Inc1 = NewBlend->getOperand(1);
1388 VPValue *OldMask = NewBlend->getOperand(2);
1389 NewBlend->setOperand(0, Inc1);
1390 NewBlend->setOperand(1, Inc0);
1391 NewBlend->setOperand(2, NewMask);
1392 if (OldMask->getNumUsers() == 0)
1393 cast<VPInstruction>(OldMask)->eraseFromParent();
1394 }
1395 }
1396 }
1397}
1398
1399/// Optimize the width of vector induction variables in \p Plan based on a known
1400/// constant Trip Count, \p BestVF and \p BestUF.
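/// A worked example (values invented): for a known trip count of 100 with
/// VF = 4 and UF = 2, the trip count is aligned up to 104, whose maximum value
/// 103 needs 7 active bits; rounding up to a power of two (and to at least 8)
/// gives 8 bits, so the wide canonical IV and the backedge-taken count used by
/// the header-mask compare are narrowed to i8.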
1402 ElementCount BestVF,
1403 unsigned BestUF) {
1404 // Only proceed if we have not completely removed the vector region.
1405 if (!Plan.getVectorLoopRegion())
1406 return false;
1407
1408 if (!Plan.getTripCount()->isLiveIn())
1409 return false;
1410 auto *TC = dyn_cast_if_present<ConstantInt>(
1412 if (!TC || !BestVF.isFixed())
1413 return false;
1414
1415 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1416 // and UF. Returns at least 8.
1417 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1418 APInt AlignedTC =
1421 APInt MaxVal = AlignedTC - 1;
1422 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1423 };
1424 unsigned NewBitWidth =
1425 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1426
1427 LLVMContext &Ctx = Plan.getContext();
1428 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1429
1430 bool MadeChange = false;
1431
1432 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1433 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1434 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1435
1436 // Currently only handle canonical IVs as it is trivial to replace the start
1437 // and stop values, and we currently only perform the optimization when the
1438 // IV has a single use.
1439 if (!WideIV || !WideIV->isCanonical() ||
1440 WideIV->hasMoreThanOneUniqueUser() ||
1441 NewIVTy == WideIV->getScalarType())
1442 continue;
1443
1444 // Currently only handle cases where the single user is a header-mask
1445 // comparison with the backedge-taken-count.
1446 if (!match(*WideIV->user_begin(),
1447 m_ICmp(m_Specific(WideIV),
1450 continue;
1451
1452 // Update IV operands and comparison bound to use new narrower type.
1453 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1454 WideIV->setStartValue(NewStart);
1455 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1456 WideIV->setStepValue(NewStep);
1457
1458 auto *NewBTC = new VPWidenCastRecipe(
1459 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1460 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1461 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1462 Cmp->setOperand(1, NewBTC);
1463
1464 MadeChange = true;
1465 }
1466
1467 return MadeChange;
1468}
1469
1470/// Return true if \p Cond is known to be true for given \p BestVF and \p
1471/// BestUF.
1473 ElementCount BestVF, unsigned BestUF,
1474 ScalarEvolution &SE) {
1476 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1477 &SE](VPValue *C) {
1478 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1479 });
1480
1481 auto *CanIV = Plan.getCanonicalIV();
1483 m_Specific(CanIV->getBackedgeValue()),
1484 m_Specific(&Plan.getVectorTripCount()))))
1485 return false;
1486
1487 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1488 // count is not conveniently available as SCEV so far, so we compare directly
1489 // against the original trip count. This is stricter than necessary, as we
1490 // will only return true if the trip count == vector trip count.
1491 const SCEV *VectorTripCount =
1493 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1494 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1495 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1496 "Trip count SCEV must be computable");
1497 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1498 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1499 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1500}
1501
1502/// Try to replace multiple active lane masks used for control flow with
1503/// a single, wide active lane mask instruction followed by multiple
1504/// extract subvector intrinsics. This applies to the active lane mask
1505/// instructions both in the loop and in the preheader.
1506/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1507/// new extracts from the first active lane mask, which has its last
1508/// operand (multiplier) set to UF.
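/// A hypothetical example with VF = 4 and UF = 2: the per-part
/// get.active.lane.mask calls are replaced by a single 8-lane mask; part 0's
/// phi then takes llvm.vector.extract of that mask at offset 0 and part 1's
/// phi the extract at offset 4, for both the preheader and the loop incoming
/// values.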
1510 unsigned UF) {
1511 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1512 return false;
1513
1514 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1515 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1516 auto *Term = &ExitingVPBB->back();
1517
1518 using namespace llvm::VPlanPatternMatch;
1520 m_VPValue(), m_VPValue(), m_VPValue())))))
1521 return false;
1522
1523 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1524 LLVMContext &Ctx = Plan.getContext();
1525
1526 auto ExtractFromALM = [&](VPInstruction *ALM,
1527 SmallVectorImpl<VPValue *> &Extracts) {
1528 DebugLoc DL = ALM->getDebugLoc();
1529 for (unsigned Part = 0; Part < UF; ++Part) {
1531 Ops.append({ALM, Plan.getOrAddLiveIn(
1532 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1533 VF.getKnownMinValue() * Part))});
1534 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1536 Extracts[Part] = Ext;
1537 Ext->insertAfter(ALM);
1538 }
1539 };
1540
1541 // Create a list of each active lane mask phi, ordered by unroll part.
1543 for (VPRecipeBase &R : Header->phis()) {
1544 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1545 if (!Phi)
1546 continue;
1547 VPValue *Index = nullptr;
1548 match(Phi->getBackedgeValue(),
1550 assert(Index && "Expected index from ActiveLaneMask instruction");
1551
1552 auto *II = dyn_cast<VPInstruction>(Index);
1553 if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
1554 auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
1555 Phis[Part->getZExtValue()] = Phi;
1556 } else
1557 // Anything other than a CanonicalIVIncrementForPart is part 0
1558 Phis[0] = Phi;
1559 }
1560
1561 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1562 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1563
1564 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1565 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1566
1567 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1568 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1569 "Expected incoming values of Phi to be ActiveLaneMasks");
1570
1571 // When using wide lane masks, the return type of the get.active.lane.mask
1572 // intrinsic is VF x UF (last operand).
1573 VPValue *ALMMultiplier =
1574 Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
1575 EntryALM->setOperand(2, ALMMultiplier);
1576 LoopALM->setOperand(2, ALMMultiplier);
1577
1578 // Create UF x extract vectors and insert into preheader.
1579 SmallVector<VPValue *> EntryExtracts(UF);
1580 ExtractFromALM(EntryALM, EntryExtracts);
1581
1582 // Create UF x extract vectors and insert before the loop compare & branch,
1583 // updating the compare to use the first extract.
1584 SmallVector<VPValue *> LoopExtracts(UF);
1585 ExtractFromALM(LoopALM, LoopExtracts);
1586 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1587 Not->setOperand(0, LoopExtracts[0]);
1588
1589 // Update the incoming values of active lane mask phis.
1590 for (unsigned Part = 0; Part < UF; ++Part) {
1591 Phis[Part]->setStartValue(EntryExtracts[Part]);
1592 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1593 }
1594
1595 return true;
1596}
1597
1598/// Try to simplify the branch condition of \p Plan. This may restrict the
1599/// resulting plan to \p BestVF and \p BestUF.
1600static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1601 unsigned BestUF,
1602 PredicatedScalarEvolution &PSE) {
1603 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1604 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1605 auto *Term = &ExitingVPBB->back();
1606 VPValue *Cond;
1607 ScalarEvolution &SE = *PSE.getSE();
1608 if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1609 match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1610 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1611 // Try to simplify the branch condition if TC <= VF * UF when the latch
1612 // terminator is BranchOnCount or BranchOnCond where the input is
1613 // Not(ActiveLaneMask).
1614 const SCEV *TripCount =
1615 vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1616 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1617 "Trip count SCEV must be computable");
1618 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1619 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1620 if (TripCount->isZero() ||
1621 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1622 return false;
1623 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1624 // For BranchOnCond, check if we can prove the condition to be true using VF
1625 // and UF.
1626 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1627 return false;
1628 } else {
1629 return false;
1630 }
1631
1632 // The vector loop region only executes once. If possible, completely remove
1633 // the region, otherwise replace the terminator controlling the latch with
1634 // (BranchOnCond true).
1635 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1636 // support for other non-canonical widen induction recipes (e.g.,
1637 // VPWidenPointerInductionRecipe).
1638 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1639 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1640 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1641 return R->isCanonical();
1642 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1643 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1644 })) {
1645 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1646 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1647 VPBuilder Builder(Plan.getVectorPreheader());
1648 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1649 R->getScalarType());
1650 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1651 HeaderR.eraseFromParent();
1652 continue;
1653 }
1654 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1655 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1656 HeaderR.eraseFromParent();
1657 }
1658
1659 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1660 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1661 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1662 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1663
1664 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1665 B->setParent(nullptr);
1666
1667 VPBlockUtils::connectBlocks(Preheader, Header);
1668 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1670 } else {
1671 // The vector region contains header phis for which we cannot remove the
1672 // loop region yet.
1673 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1674 Term->getDebugLoc());
1675 ExitingVPBB->appendRecipe(BOC);
1676 }
1677
1678 Term->eraseFromParent();
1679
1680 return true;
1681}
1682
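/// Apply VF- and UF-specific optimizations to \p Plan once the best
/// vectorization factor \p BestVF and unroll factor \p BestUF have been
/// chosen: try to replace the lane masks with a single wide lane mask,
/// simplify the latch branch condition and shrink induction widths based on
/// the known VF and UF.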
1683void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1684 unsigned BestUF,
1685 PredicatedScalarEvolution &PSE) {
1686 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1687 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1688
1689 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1690 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1691 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1692
1693 if (MadeChange) {
1694 Plan.setVF(BestVF);
1695 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1696 }
1697 // TODO: Further simplifications are possible
1698 // 1. Replace inductions with constants.
1699 // 2. Replace vector loop region with VPBasicBlock.
1700}
1701
1702/// Sink users of \p FOR after the recipe defining the previous value \p
1703/// Previous of the recurrence. \returns true if all users of \p FOR could be
1704/// re-arranged as needed or false if it is not possible.
1705static bool
1706sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1707 VPRecipeBase *Previous,
1708 VPDominatorTree &VPDT) {
1709 // Collect recipes that need sinking.
1710 SmallPtrSet<VPRecipeBase *, 8> Seen;
1711 SmallVector<VPRecipeBase *> WorkList;
1712 Seen.insert(Previous);
1713 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1714 // The previous value must not depend on the users of the recurrence phi;
1715 // if it did, FOR would not be a fixed-order recurrence.
1716 if (SinkCandidate == Previous)
1717 return false;
1718
1719 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1720 !Seen.insert(SinkCandidate).second ||
1721 VPDT.properlyDominates(Previous, SinkCandidate))
1722 return true;
1723
1724 if (SinkCandidate->mayHaveSideEffects())
1725 return false;
1726
1727 WorkList.push_back(SinkCandidate);
1728 return true;
1729 };
1730
1731 // Recursively sink users of FOR after Previous.
1732 WorkList.push_back(FOR);
1733 for (unsigned I = 0; I != WorkList.size(); ++I) {
1734 VPRecipeBase *Current = WorkList[I];
1735 assert(Current->getNumDefinedValues() == 1 &&
1736 "only recipes with a single defined value expected");
1737
1738 for (VPUser *User : Current->getVPSingleValue()->users()) {
1739 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1740 return false;
1741 }
1742 }
1743
1744 // Keep recipes to sink ordered by dominance so earlier instructions are
1745 // processed first.
1746 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1747 return VPDT.properlyDominates(A, B);
1748 });
1749
1750 for (VPRecipeBase *SinkCandidate : WorkList) {
1751 if (SinkCandidate == FOR)
1752 continue;
1753
1754 SinkCandidate->moveAfter(Previous);
1755 Previous = SinkCandidate;
1756 }
1757 return true;
1758}
1759
1760/// Try to hoist \p Previous and its operands before all users of \p FOR.
1761static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1762 VPRecipeBase *Previous,
1763 VPDominatorTree &VPDT) {
1764 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1765 return false;
1766
1767 // Collect recipes that need hoisting.
1768 SmallVector<VPRecipeBase *> HoistCandidates;
1769 SmallPtrSet<VPRecipeBase *, 8> Visited;
1770 VPRecipeBase *HoistPoint = nullptr;
1771 // Find the closest hoist point by looking at all users of FOR and selecting
1772 // the recipe dominating all other users.
1773 for (VPUser *U : FOR->users()) {
1774 auto *R = cast<VPRecipeBase>(U);
1775 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1776 HoistPoint = R;
1777 }
1778 assert(all_of(FOR->users(),
1779 [&VPDT, HoistPoint](VPUser *U) {
1780 auto *R = cast<VPRecipeBase>(U);
1781 return HoistPoint == R ||
1782 VPDT.properlyDominates(HoistPoint, R);
1783 }) &&
1784 "HoistPoint must dominate all users of FOR");
1785
1786 auto NeedsHoisting = [HoistPoint, &VPDT,
1787 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1788 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1789 if (!HoistCandidate)
1790 return nullptr;
1791 VPRegionBlock *EnclosingLoopRegion =
1792 HoistCandidate->getParent()->getEnclosingLoopRegion();
1793 assert((!HoistCandidate->getParent()->getParent() ||
1794 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1795 "CFG in VPlan should still be flat, without replicate regions");
1796 // Hoist candidate was already visited, no need to hoist.
1797 if (!Visited.insert(HoistCandidate).second)
1798 return nullptr;
1799
1800 // Candidate is outside loop region or a header phi, dominates FOR users w/o
1801 // hoisting.
1802 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1803 return nullptr;
1804
1805 // If we reached a recipe that dominates HoistPoint, we don't need to
1806 // hoist the recipe.
1807 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1808 return nullptr;
1809 return HoistCandidate;
1810 };
1811 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1812 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1813 // associated dependencies.
1814 return !HoistCandidate->mayHaveSideEffects();
1815 };
1816
1817 if (!NeedsHoisting(Previous->getVPSingleValue()))
1818 return true;
1819
1820 // Recursively try to hoist Previous and its operands before all users of FOR.
1821 HoistCandidates.push_back(Previous);
1822
1823 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1824 VPRecipeBase *Current = HoistCandidates[I];
1825 assert(Current->getNumDefinedValues() == 1 &&
1826 "only recipes with a single defined value expected");
1827 if (!CanHoist(Current))
1828 return false;
1829
1830 for (VPValue *Op : Current->operands()) {
1831 // If we reach FOR, it means the original Previous depends on some other
1832 // recurrence that in turn depends on FOR. If that is the case, we would
1833 // also need to hoist recipes involving the other FOR, which may break
1834 // dependencies.
1835 if (Op == FOR)
1836 return false;
1837
1838 if (auto *R = NeedsHoisting(Op))
1839 HoistCandidates.push_back(R);
1840 }
1841 }
1842
1843 // Order recipes to hoist by dominance so earlier instructions are processed
1844 // first.
1845 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1846 return VPDT.properlyDominates(A, B);
1847 });
1848
1849 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1850 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1851 HoistPoint->getIterator());
1852 }
1853
1854 return true;
1855}
1856
1857bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1858 VPBuilder &LoopBuilder) {
1859 VPDominatorTree VPDT;
1860 VPDT.recalculate(Plan);
1861
1862 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1863 for (VPRecipeBase &R :
1864 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())
1865 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1866 RecurrencePhis.push_back(FOR);
1867
1868 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1869 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1870 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1871 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1872 // to terminate.
1873 while (auto *PrevPhi =
1874 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1875 assert(PrevPhi->getParent() == FOR->getParent());
1876 assert(SeenPhis.insert(PrevPhi).second);
1877 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1878 }
1879
1880 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1881 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1882 return false;
1883
1884 // Introduce a recipe to combine the incoming and previous values of a
1885 // fixed-order recurrence.
1886 VPBasicBlock *InsertBlock = Previous->getParent();
1887 if (isa<VPHeaderPHIRecipe>(Previous))
1888 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1889 else
1890 LoopBuilder.setInsertPoint(InsertBlock,
1891 std::next(Previous->getIterator()));
1892
1893 auto *RecurSplice =
1894 LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
1895 {FOR, FOR->getBackedgeValue()});
1896
1897 FOR->replaceAllUsesWith(RecurSplice);
1898 // Set the first operand of RecurSplice to FOR again, after replacing
1899 // all users.
1900 RecurSplice->setOperand(0, FOR);
1901 }
1902 return true;
1903}
1904
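/// Drop poison-generating flags from recipes feeding add/mul/sub reduction
/// phis in the loop header: vectorization reorders the reduction operations,
/// so intermediate results may wrap even when the original scalar reduction
/// did not.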
1906 for (VPRecipeBase &R :
1907 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1908 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
1909 if (!PhiR)
1910 continue;
1911 RecurKind RK = PhiR->getRecurrenceKind();
1912 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
1914 continue;
1915
1916 for (VPUser *U : collectUsersRecursively(PhiR))
1917 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
1918 RecWithFlags->dropPoisonGeneratingFlags();
1919 }
1920 }
1921}
1922
1923namespace {
1924struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1925 static bool isSentinel(const VPSingleDefRecipe *Def) {
1926 return Def == getEmptyKey() || Def == getTombstoneKey();
1927 }
1928
1929 /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1930 /// Returns an optional pair, where the first element indicates whether it is
1931 /// an intrinsic ID.
1932 static std::optional<std::pair<bool, unsigned>>
1933 getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1934 return TypeSwitch<const VPSingleDefRecipe *,
1935 std::optional<std::pair<bool, unsigned>>>(R)
1938 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1939 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1940 return std::make_pair(true, I->getVectorIntrinsicID());
1941 })
1942 .Default([](auto *) { return std::nullopt; });
1943 }
1944
1945 /// Returns true if recipe \p Def can be safely handled for CSE.
1946 static bool canHandle(const VPSingleDefRecipe *Def) {
1947 // We can extend the list of handled recipes in the future,
1948 // provided we account for the data embedded in them while checking for
1949 // equality or hashing.
1950 auto C = getOpcodeOrIntrinsicID(Def);
1951
1952 // The issue with (Insert|Extract)Value is that the index of the
1953 // insert/extract is not a proper operand in LLVM IR, and hence also not in
1954 // VPlan.
1955 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
1956 C->second == Instruction::ExtractValue)))
1957 return false;
1958
1959 // During CSE, we can only handle recipes that don't read from memory: if
1960 // they read from memory, there could be an intervening write to memory
1961 // before the next instance is CSE'd, leading to an incorrect result.
1962 return !Def->mayReadFromMemory();
1963 }
1964
1965 /// Hash the underlying data of \p Def.
1966 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
1967 const VPlan *Plan = Def->getParent()->getPlan();
1968 VPTypeAnalysis TypeInfo(*Plan);
1969 hash_code Result = hash_combine(
1970 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
1971 TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
1972 hash_combine_range(Def->operands()));
1973 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
1974 if (RFlags->hasPredicate())
1975 return hash_combine(Result, RFlags->getPredicate());
1976 return Result;
1977 }
1978
1979 /// Check equality of underlying data of \p L and \p R.
1980 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
1981 if (isSentinel(L) || isSentinel(R))
1982 return L == R;
1983 if (L->getVPDefID() != R->getVPDefID() ||
1984 getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
1985 vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
1986 !equal(L->operands(), R->operands()))
1987 return false;
1988 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
1989 if (LFlags->hasPredicate() &&
1990 LFlags->getPredicate() !=
1991 cast<VPRecipeWithIRFlags>(R)->getPredicate())
1992 return false;
1993 const VPlan *Plan = L->getParent()->getPlan();
1994 VPTypeAnalysis TypeInfo(*Plan);
1995 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
1996 }
1997};
1998} // end anonymous namespace
1999
2000/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2001/// Plan.
2002static void cse(VPlan &Plan) {
2003 VPDominatorTree VPDT(Plan);
2004 DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2005
2006 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2007 vp_depth_first_deep(Plan.getEntry()))) {
2008 for (VPRecipeBase &R : *VPBB) {
2009 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2010 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2011 continue;
2012 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2013 // V must dominate Def for a valid replacement.
2014 if (!VPDT.dominates(V->getParent(), VPBB))
2015 continue;
2016 // Drop poison-generating flags when reusing a value.
2017 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2018 RFlags->dropPoisonGeneratingFlags();
2019 Def->replaceAllUsesWith(V);
2020 continue;
2021 }
2022 CSEMap[Def] = Def;
2023 }
2024 }
2025}
2026
2027/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2028static void licm(VPlan &Plan) {
2029 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2030
2031 // Return true if we do not know how to (mechanically) hoist a given recipe
2032 // out of a loop region. Does not address legality concerns such as aliasing
2033 // or speculation safety.
2034 auto CannotHoistRecipe = [](VPRecipeBase &R) {
2035 // Allocas cannot be hoisted.
2036 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2037 return RepR && RepR->getOpcode() == Instruction::Alloca;
2038 };
2039
2040 // Hoist any loop invariant recipes from the vector loop region to the
2041 // preheader. Perform a shallow traversal of the vector loop region, to
2042 // exclude recipes in replicate regions.
2043 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2044 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2045 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2046 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2047 if (CannotHoistRecipe(R))
2048 continue;
2049 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
2050 // their memory location is not modified in the vector loop.
2051 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
2052 any_of(R.operands(), [](VPValue *Op) {
2053 return !Op->isDefinedOutsideLoopRegions();
2054 }))
2055 continue;
2056 R.moveBefore(*Preheader, Preheader->end());
2057 }
2058 }
2059}
2060
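/// Narrow widened recipes in \p Plan to the minimal bitwidths recorded in
/// \p MinBWs, truncating their operands and zero-extending their results back
/// to the original width where required.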
2061void VPlanTransforms::truncateToMinimalBitwidths(
2062 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2063 // Keep track of created truncates, so they can be re-used. Note that we
2064 // cannot use RAUW after creating a new truncate, as this could make
2065 // other uses have different types for their operands, making them invalidly
2066 // typed.
2067 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
2068 VPTypeAnalysis TypeInfo(Plan);
2069 VPBasicBlock *PH = Plan.getVectorPreheader();
2070 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2071 vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
2072 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2073 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
2074 VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
2075 &R))
2076 continue;
2077
2078 VPValue *ResultVPV = R.getVPSingleValue();
2079 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2080 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2081 if (!NewResSizeInBits)
2082 continue;
2083
2084 // If the value wasn't vectorized, we must maintain the original scalar
2085 // type. Skip those here. Also
2086 // skip casts which do not need to be handled explicitly here, as
2087 // redundant casts will be removed during recipe simplification.
2088 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
2089 continue;
2090
2091 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2092 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2093 assert(OldResTy->isIntegerTy() && "only integer types supported");
2094 (void)OldResSizeInBits;
2095
2096 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2097
2098 // Any wrapping introduced by shrinking this operation shouldn't be
2099 // considered undefined behavior. So, we can't unconditionally copy
2100 // arithmetic wrapping flags to VPW.
2101 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2102 VPW->dropPoisonGeneratingFlags();
2103
2104 if (OldResSizeInBits != NewResSizeInBits &&
2105 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2106 // Extend result to original width.
2107 auto *Ext =
2108 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2109 Ext->insertAfter(&R);
2110 ResultVPV->replaceAllUsesWith(Ext);
2111 Ext->setOperand(0, ResultVPV);
2112 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2113 } else {
2114 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2115 "Only ICmps should not need extending the result.");
2116 }
2117
2118 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2119 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R))
2120 continue;
2121
2122 // Shrink operands by introducing truncates as needed.
2123 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2124 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2125 auto *Op = R.getOperand(Idx);
2126 unsigned OpSizeInBits =
2127 TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
2128 if (OpSizeInBits == NewResSizeInBits)
2129 continue;
2130 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2131 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2132 VPWidenCastRecipe *NewOp =
2133 IterIsEmpty
2134 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
2135 : ProcessedIter->second;
2136 R.setOperand(Idx, NewOp);
2137 if (!IterIsEmpty)
2138 continue;
2139 ProcessedIter->second = NewOp;
2140 if (!Op->isLiveIn()) {
2141 NewOp->insertBefore(&R);
2142 } else {
2143 PH->appendRecipe(NewOp);
2144 }
2145 }
2146
2147 }
2148 }
2149}
2150
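/// Remove branch-on-cond recipes with a constant true or false condition,
/// disconnecting the edge to the never-taken successor.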
2151void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2152 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2153 vp_depth_first_shallow(Plan.getEntry()))) {
2154 VPValue *Cond;
2155 if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
2156 !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2157 continue;
2158
2159 unsigned RemovedIdx;
2160 if (match(Cond, m_True()))
2161 RemovedIdx = 1;
2162 else if (match(Cond, m_False()))
2163 RemovedIdx = 0;
2164 else
2165 continue;
2166
2167 VPBasicBlock *RemovedSucc =
2168 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2169 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2170 "There must be a single edge between VPBB and its successor");
2171 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2172 // these recipes.
2173 for (VPRecipeBase &R : RemovedSucc->phis())
2174 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2175
2176 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2177 // automatically on VPlan destruction if it becomes unreachable.
2178 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2179 VPBB->back().eraseFromParent();
2180 }
2181}
2182
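/// Apply a sequence of cleanup transformations to \p Plan, including recipe
/// and blend simplification and hoisting of loop-invariant recipes out of the
/// vector loop region.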
2183void VPlanTransforms::optimize(VPlan &Plan) {
2186
2187 runPass(simplifyRecipes, Plan);
2188 runPass(simplifyBlends, Plan);
2193 runPass(simplifyRecipes, Plan);
2196
2199 runPass(licm, Plan);
2200}
2201
2202// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2203// the loop terminator with a branch-on-cond recipe with the negated
2204// active-lane-mask as operand. Note that this turns the loop into an
2205// uncountable one. Only the existing terminator is replaced; all other existing
2206// recipes/users remain unchanged, except for poison-generating flags being
2207// dropped from the canonical IV increment. Return the created
2208// VPActiveLaneMaskPHIRecipe.
2209//
2210// The function uses the following definitions:
2211//
2212// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2213// calculate-trip-count-minus-VF (original TC) : original TC
2214// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2215// CanonicalIVPhi : CanonicalIVIncrement
2216// %StartV is the canonical induction start value.
2217//
2218// The function adds the following recipes:
2219//
2220// vector.ph:
2221// %TripCount = calculate-trip-count-minus-VF (original TC)
2222// [if DataWithControlFlowWithoutRuntimeCheck]
2223// %EntryInc = canonical-iv-increment-for-part %StartV
2224// %EntryALM = active-lane-mask %EntryInc, %TripCount
2225//
2226// vector.body:
2227// ...
2228// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2229// ...
2230// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2231// %ALM = active-lane-mask %InLoopInc, TripCount
2232// %Negated = Not %ALM
2233// branch-on-cond %Negated
2234//
2235static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2236 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
2237 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2238 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2239 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2240 VPValue *StartV = CanonicalIVPHI->getStartValue();
2241
2242 auto *CanonicalIVIncrement =
2243 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2244 // TODO: Check if dropping the flags is needed if
2245 // !DataAndControlFlowWithoutRuntimeCheck.
2246 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2247 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2248 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2249 // we have to take unrolling into account. Each part needs to start at
2250 // Part * VF
2251 auto *VecPreheader = Plan.getVectorPreheader();
2252 VPBuilder Builder(VecPreheader);
2253
2254 // Create the ActiveLaneMask instruction using the correct start values.
2255 VPValue *TC = Plan.getTripCount();
2256
2257 VPValue *TripCount, *IncrementValue;
2258 if (!DataAndControlFlowWithoutRuntimeCheck) {
2259 // When the loop is guarded by a runtime overflow check for the loop
2260 // induction variable increment by VF, we can increment the value before
2261 // the get.active.lane.mask and use the unmodified tripcount.
2262 IncrementValue = CanonicalIVIncrement;
2263 TripCount = TC;
2264 } else {
2265 // When avoiding a runtime check, the active.lane.mask inside the loop
2266 // uses a modified trip count and the induction variable increment is
2267 // done after the active.lane.mask intrinsic is called.
2268 IncrementValue = CanonicalIVPHI;
2269 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2270 {TC}, DL);
2271 }
2272 auto *EntryIncrement = Builder.createOverflowingOp(
2273 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2274 "index.part.next");
2275
2276 // Create the active lane mask instruction in the VPlan preheader.
2277 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2278 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2279 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2280 {EntryIncrement, TC, ALMMultiplier}, DL,
2281 "active.lane.mask.entry");
2282
2283 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2284 // preheader ActiveLaneMask instruction.
2285 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
2286 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2287
2288 // Create the active lane mask for the next iteration of the loop before the
2289 // original terminator.
2290 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2291 Builder.setInsertPoint(OriginalTerminator);
2292 auto *InLoopIncrement =
2293 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2294 {IncrementValue}, {false, false}, DL);
2295 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2296 {InLoopIncrement, TripCount, ALMMultiplier},
2297 DL, "active.lane.mask.next");
2298 LaneMaskPhi->addOperand(ALM);
2299
2300 // Replace the original terminator with BranchOnCond. We have to invert the
2301 // mask here because a true condition means jumping to the exit block.
2302 auto *NotMask = Builder.createNot(ALM, DL);
2303 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2304 OriginalTerminator->eraseFromParent();
2305 return LaneMaskPhi;
2306}
2307
2308/// Collect the header mask with the pattern:
2309/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2310/// TODO: Introduce explicit recipe for header-mask instead of searching
2311/// for the header-mask pattern manually.
2312static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2313 SmallVector<VPValue *> WideCanonicalIVs;
2314 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2315 IsaPred<VPWidenCanonicalIVRecipe>);
2316 assert(count_if(Plan.getCanonicalIV()->users(),
2317 IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
2318 "Must have at most one VPWideCanonicalIVRecipe");
2319 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2320 auto *WideCanonicalIV =
2321 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2322 WideCanonicalIVs.push_back(WideCanonicalIV);
2323 }
2324
2325 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2326 // version of the canonical induction.
2327 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2328 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2329 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2330 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2331 WideCanonicalIVs.push_back(WidenOriginalIV);
2332 }
2333
2334 // Walk users of wide canonical IVs and find the single compare of the form
2335 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2336 VPSingleDefRecipe *HeaderMask = nullptr;
2337 for (auto *Wide : WideCanonicalIVs) {
2338 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2339 auto *VPI = dyn_cast<VPInstruction>(U);
2340 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2341 continue;
2342
2343 assert(VPI->getOperand(0) == Wide &&
2344 "WidenCanonicalIV must be the first operand of the compare");
2345 assert(!HeaderMask && "Multiple header masks found?");
2346 HeaderMask = VPI;
2347 }
2348 }
2349 return HeaderMask;
2350}
2351
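/// Replace the header mask compare (ICMP_ULE, wide canonical IV,
/// backedge-taken-count) with an active-lane-mask, optionally also rewiring
/// the latch branch so that the lane mask controls the loop exit.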
2352void VPlanTransforms::addActiveLaneMask(
2353 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2354 bool DataAndControlFlowWithoutRuntimeCheck) {
2355 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2356 UseActiveLaneMaskForControlFlow) &&
2357 "DataAndControlFlowWithoutRuntimeCheck implies "
2358 "UseActiveLaneMaskForControlFlow");
2359
2360 auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
2361 IsaPred<VPWidenCanonicalIVRecipe>);
2362 assert(FoundWidenCanonicalIVUser &&
2363 "Must have widened canonical IV when tail folding!");
2364 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2365 auto *WideCanonicalIV =
2366 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2367 VPSingleDefRecipe *LaneMask;
2368 if (UseActiveLaneMaskForControlFlow) {
2369 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2370 Plan, DataAndControlFlowWithoutRuntimeCheck);
2371 } else {
2372 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2373 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2374 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
2375 LaneMask =
2376 B.createNaryOp(VPInstruction::ActiveLaneMask,
2377 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2378 nullptr, "active.lane.mask");
2379 }
2380
2381 // Walk users of WideCanonicalIV and replace the header mask of the form
2382 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2383 // removing the old one to ensure there is always only a single header mask.
2384 HeaderMask->replaceAllUsesWith(LaneMask);
2385 HeaderMask->eraseFromParent();
2386}
2387
2388/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2389/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2390/// recipe could be created.
2391/// \p HeaderMask Header Mask.
2392/// \p CurRecipe Recipe to be transformed.
2393/// \p TypeInfo VPlan-based type analysis.
2394/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2395/// \p EVL The explicit vector length parameter of vector-predication
2396/// intrinsics.
2397static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2398 VPRecipeBase &CurRecipe,
2399 VPTypeAnalysis &TypeInfo,
2400 VPValue &AllOneMask, VPValue &EVL) {
2401 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2402 // header mask.
2403 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2404 assert(OrigMask && "Unmasked recipe when folding tail");
2405 // HeaderMask will be handled using EVL.
2406 VPValue *Mask;
2407 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2408 return Mask;
2409 return HeaderMask == OrigMask ? nullptr : OrigMask;
2410 };
2411
2412 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2413 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2414 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2415 if (!EndPtr)
2416 return Addr;
2417 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2418 "VPVectorEndPointerRecipe with non-VF VF operand?");
2419 assert(
2420 all_of(EndPtr->users(),
2421 [](VPUser *U) {
2422 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2423 }) &&
2424 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2425 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2426 EVLAddr->insertBefore(&CurRecipe);
2427 EVLAddr->setOperand(1, &EVL);
2428 return EVLAddr;
2429 };
2430
2431 return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2432 .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
2433 VPValue *NewMask = GetNewMask(L->getMask());
2434 VPValue *NewAddr = GetNewAddr(L->getAddr());
2435 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2436 })
2437 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2438 VPValue *NewMask = GetNewMask(S->getMask());
2439 VPValue *NewAddr = GetNewAddr(S->getAddr());
2440 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2441 })
2442 .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
2443 VPValue *NewMask = GetNewMask(IR->getMask());
2444 return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
2445 })
2446 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2447 VPValue *NewMask = GetNewMask(Red->getCondOp());
2448 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2449 })
2450 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2451 VPValue *LHS, *RHS;
2452 // Transform select with a header mask condition
2453 // select(header_mask, LHS, RHS)
2454 // into vector predication merge.
2455 // vp.merge(all-true, LHS, RHS, EVL)
2456 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2457 m_VPValue(RHS))))
2458 return nullptr;
2459 // Use all true as the condition because this transformation is
2460 // limited to selects whose condition is a header mask.
2461 return new VPWidenIntrinsicRecipe(
2462 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2463 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2464 })
2465 .Default([&](VPRecipeBase *R) { return nullptr; });
2466}
2467
2468/// Replace recipes with their EVL variants.
2469static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2470 VPTypeAnalysis TypeInfo(Plan);
2471 VPValue *AllOneMask = Plan.getTrue();
2472 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2473 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2474
2475 assert(all_of(Plan.getVF().users(),
2478 "User of VF that we can't transform to EVL.");
2479 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2480 return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
2481 });
2482
2483 assert(all_of(Plan.getVFxUF().users(),
2484 [&Plan](VPUser *U) {
2485 return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()),
2486 m_Specific(&Plan.getVFxUF()))) ||
2487 isa<VPWidenPointerInductionRecipe>(U);
2488 }) &&
2489 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2490 "increment of the canonical induction.");
2491 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2492 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2493 // canonical induction must not be updated.
2494 return isa<VPWidenPointerInductionRecipe>(U);
2495 });
2496
2497 // Defer erasing recipes till the end so that we don't invalidate the
2498 // VPTypeAnalysis cache.
2499 SmallVector<VPRecipeBase *> ToErase;
2500
2501 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
2502 // contained.
2503 bool ContainsFORs =
2504 any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2505 if (ContainsFORs) {
2506 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2507 VPValue *MaxEVL = &Plan.getVF();
2508 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2509 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2510 MaxEVL = Builder.createScalarZExtOrTrunc(
2511 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2512 TypeInfo.inferScalarType(MaxEVL), DebugLoc());
2513
2514 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2515 VPValue *PrevEVL =
2516 Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
2517
2518 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2520 for (VPRecipeBase &R : *VPBB) {
2521 VPValue *V1, *V2;
2522 if (!match(&R,
2523 m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2524 m_VPValue(V1), m_VPValue(V2))))
2525 continue;
2526 VPValue *Imm = Plan.getOrAddLiveIn(
2527 ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
2528 VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
2529 Intrinsic::experimental_vp_splice,
2530 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2531 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2532 VPSplice->insertBefore(&R);
2533 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2534 ToErase.push_back(&R);
2535 }
2536 }
2537 }
2538
2539 VPValue *HeaderMask = findHeaderMask(Plan);
2540 if (!HeaderMask)
2541 return;
2542
2543 // Replace header masks with a mask equivalent to predicating by EVL:
2544 //
2545 // icmp ule widen-canonical-iv backedge-taken-count
2546 // ->
2547 // icmp ult step-vector, EVL
2548 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2549 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2550 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2551 VPValue *EVLMask = Builder.createICmp(
2552 CmpInst::ICMP_ULT,
2553 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2554 HeaderMask->replaceAllUsesWith(EVLMask);
2555 ToErase.push_back(HeaderMask->getDefiningRecipe());
2556
2557 // Try to optimize header mask recipes away to their EVL variants.
2558 // TODO: Split optimizeMaskToEVL out and move into
2559 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2560 // tryToBuildVPlanWithVPRecipes beforehand.
2561 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2562 auto *CurRecipe = cast<VPRecipeBase>(U);
2563 VPRecipeBase *EVLRecipe =
2564 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2565 if (!EVLRecipe)
2566 continue;
2567
2568 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2569 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2570 "New recipe must define the same number of values as the "
2571 "original.");
2572 EVLRecipe->insertBefore(CurRecipe);
2573 if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
2574 EVLRecipe)) {
2575 for (unsigned I = 0; I < NumDefVal; ++I) {
2576 VPValue *CurVPV = CurRecipe->getVPValue(I);
2577 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2578 }
2579 }
2580 ToErase.push_back(CurRecipe);
2581 }
2582 // Remove dead EVL mask.
2583 if (EVLMask->getNumUsers() == 0)
2584 ToErase.push_back(EVLMask->getDefiningRecipe());
2585
2586 for (VPRecipeBase *R : reverse(ToErase)) {
2587 SmallVector<VPValue *> PossiblyDead(R->operands());
2588 R->eraseFromParent();
2589 for (VPValue *Op : PossiblyDead)
2590 recursivelyDeleteDeadRecipes(Op);
2591 }
2592}
2593
2594/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2595/// replaces all uses except the canonical IV increment of
2596/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
2597/// is used only for loop iterations counting after this transformation.
2598///
2599/// The function uses the following definitions:
2600/// %StartV is the canonical induction start value.
2601///
2602/// The function adds the following recipes:
2603///
2604/// vector.ph:
2605/// ...
2606///
2607/// vector.body:
2608/// ...
2609/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2610/// [ %NextEVLIV, %vector.body ]
2611/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2612/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2613/// ...
2614/// %OpEVL = cast i32 %VPEVL to IVSize
2615/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2616/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2617/// ...
2618///
2619/// If MaxSafeElements is provided, the function adds the following recipes:
2620/// vector.ph:
2621/// ...
2622///
2623/// vector.body:
2624/// ...
2625/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2626/// [ %NextEVLIV, %vector.body ]
2627/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2628/// %cmp = cmp ult %AVL, MaxSafeElements
2629/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2630/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2631/// ...
2632/// %OpEVL = cast i32 %VPEVL to IVSize
2633/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2634/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2635/// ...
2636///
2637void VPlanTransforms::addExplicitVectorLength(
2638 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2639 VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2640
2641 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2642 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2643 VPValue *StartV = CanonicalIVPHI->getStartValue();
2644
2645 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2646 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
2647 EVLPhi->insertAfter(CanonicalIVPHI);
2648 VPBuilder Builder(Header, Header->getFirstNonPhi());
2649 // Create the AVL (application vector length), starting from TC -> 0 in steps
2650 // of EVL.
2651 VPPhi *AVLPhi = Builder.createScalarPhi(
2652 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2653 VPValue *AVL = AVLPhi;
2654
2655 if (MaxSafeElements) {
2656 // Support for MaxSafeDist for correct loop emission.
2657 VPValue *AVLSafe =
2658 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2659 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2660 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
2661 }
2662 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2663 DebugLoc());
2664
2665 auto *CanonicalIVIncrement =
2666 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2667 Builder.setInsertPoint(CanonicalIVIncrement);
2668 VPValue *OpVPEVL = VPEVL;
2669
2670 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2671 OpVPEVL = Builder.createScalarZExtOrTrunc(
2672 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2673
2674 auto *NextEVLIV = Builder.createOverflowingOp(
2675 Instruction::Add, {OpVPEVL, EVLPhi},
2676 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2677 CanonicalIVIncrement->hasNoSignedWrap()},
2678 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2679 EVLPhi->addOperand(NextEVLIV);
2680
2681 VPValue *NextAVL = Builder.createOverflowingOp(
2682 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2683 DebugLoc::getCompilerGenerated(), "avl.next");
2684 AVLPhi->addOperand(NextAVL);
2685
2686 transformRecipestoEVLRecipes(Plan, *VPEVL);
2687
2688 // Replace all uses of VPCanonicalIVPHIRecipe by
2689 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2690 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2691 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2692 // TODO: support unroll factor > 1.
2693 Plan.setUF(1);
2694}
2695
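/// Lower an EVL-based loop into its final form: turn the EVL-based IV phi
/// into a plain scalar phi, replace uses of the canonical IV increment with
/// the EVL increment, and replace the branch-on-count latch terminator with a
/// branch on the remaining AVL reaching zero.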
2696void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
2697 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2698 // There should be only one EVL PHI in the entire plan.
2699 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2700
2701 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2702 vp_depth_first_deep(Plan.getEntry())))
2703 for (VPRecipeBase &R : VPBB->phis())
2704 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2705 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2706 EVLPhi = PhiR;
2707 }
2708
2709 // Early return if no EVL PHI is found.
2710 if (!EVLPhi)
2711 return;
2712
2713 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2714 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2715 VPValue *AVL;
2716 [[maybe_unused]] bool FoundAVL =
2717 match(EVLIncrement,
2718 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2719 assert(FoundAVL && "Didn't find AVL?");
2720
2721 // The AVL may be capped to a safe distance.
2722 VPValue *SafeAVL;
2723 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2724 AVL = SafeAVL;
2725
2726 VPValue *AVLNext;
2727 [[maybe_unused]] bool FoundAVLNext =
2728 match(AVL, m_VPInstruction<Instruction::PHI>(
2729 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2730 assert(FoundAVLNext && "Didn't find AVL backedge?");
2731
2732 // Convert EVLPhi to concrete recipe.
2733 auto *ScalarR =
2734 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2735 EVLPhi->getDebugLoc(), "evl.based.iv");
2736 EVLPhi->replaceAllUsesWith(ScalarR);
2737 EVLPhi->eraseFromParent();
2738
2739 // Replace CanonicalIVInc with EVL-PHI increment.
2740 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2741 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2742 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2743 m_Specific(&Plan.getVFxUF()))) &&
2744 "Unexpected canonical iv");
2745 Backedge->replaceAllUsesWith(EVLIncrement);
2746
2747 // Remove unused phi and increment.
2748 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2749 CanonicalIVIncrement->eraseFromParent();
2750 CanonicalIV->eraseFromParent();
2751
2752 // Replace the use of VectorTripCount in the latch-exiting block.
2753 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2754 // After: (branch-on-cond eq AVLNext, 0)
2755
2756 VPBasicBlock *LatchExiting =
2757 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2758 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2759 // Skip single-iteration loop region
2760 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2761 return;
2762 assert(LatchExitingBr &&
2763 match(LatchExitingBr,
2764 m_BranchOnCount(m_VPValue(EVLIncrement),
2765 m_Specific(&Plan.getVectorTripCount()))) &&
2766 "Unexpected terminator in EVL loop");
2767
2768 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2769 VPBuilder Builder(LatchExitingBr);
2770 VPValue *Cmp =
2771 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2772 Plan.getOrAddLiveIn(ConstantInt::get(AVLTy, 0)));
2773 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2774 LatchExitingBr->eraseFromParent();
2775}
2776
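/// Replace symbolic strides that PSE guarantees to be constant with their
/// constant values, for users inside the vector loop region or its
/// preheader, including sext/zext live-ins of the stride.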
2777void VPlanTransforms::replaceSymbolicStrides(
2778 VPlan &Plan, PredicatedScalarEvolution &PSE,
2779 const DenseMap<Value *, const SCEV *> &StridesMap) {
2780 // Replace VPValues for known constant strides guaranteed by predicated scalar
2781 // evolution.
2782 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2783 auto *R = cast<VPRecipeBase>(&U);
2784 return R->getParent()->getParent() ||
2785 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2786 };
2787 for (const SCEV *Stride : StridesMap.values()) {
2788 using namespace SCEVPatternMatch;
2789 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2790 const APInt *StrideConst;
2791 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2792 // Only handle constant strides for now.
2793 continue;
2794
2795 auto *CI =
2796 Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
2797 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2798 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2799
2800 // The versioned value may not be used in the loop directly but through a
2801 // sext/zext. Add new live-ins in those cases.
2802 for (Value *U : StrideV->users()) {
2803 if (!isa<SExtInst, ZExtInst>(U))
2804 continue;
2805 VPValue *StrideVPV = Plan.getLiveIn(U);
2806 if (!StrideVPV)
2807 continue;
2808 unsigned BW = U->getType()->getScalarSizeInBits();
2809 APInt C =
2810 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2811 VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
2812 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2813 }
2814 }
2815}
2816
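/// Drop poison-generating flags from recipes in the backward slices of the
/// addresses of consecutive widened memory accesses and interleave groups
/// whose members execute in blocks that need predication.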
2817void VPlanTransforms::dropPoisonGeneratingRecipes(
2818 VPlan &Plan,
2819 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2820 // Collect recipes in the backward slice of `Root` that may generate a poison
2821 // value that is used after vectorization.
2822 SmallPtrSet<VPRecipeBase *, 16> Visited;
2823 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2824 SmallVector<VPRecipeBase *, 16> Worklist;
2825 Worklist.push_back(Root);
2826
2827 // Traverse the backward slice of Root through its use-def chain.
2828 while (!Worklist.empty()) {
2829 VPRecipeBase *CurRec = Worklist.pop_back_val();
2830
2831 if (!Visited.insert(CurRec).second)
2832 continue;
2833
2834 // Prune search if we find another recipe generating a widen memory
2835 // instruction. Widen memory instructions involved in address computation
2836 // will lead to gather/scatter instructions, which don't need to be
2837 // handled.
2838 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2839 VPHeaderPHIRecipe>(CurRec))
2840 continue;
2841
2842 // This recipe contributes to the address computation of a widen
2843 // load/store. If the underlying instruction has poison-generating flags,
2844 // drop them directly.
2845 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2846 VPValue *A, *B;
2847 // Dropping disjoint from an OR may yield incorrect results, as some
2848 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2849 // for dependence analysis). Instead, replace it with an equivalent Add.
2850 // This is possible as all users of the disjoint OR only access lanes
2851 // where the operands are disjoint or poison otherwise.
2852 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
2853 RecWithFlags->isDisjoint()) {
2854 VPBuilder Builder(RecWithFlags);
2855 VPInstruction *New = Builder.createOverflowingOp(
2856 Instruction::Add, {A, B}, {false, false},
2857 RecWithFlags->getDebugLoc());
2858 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2859 RecWithFlags->replaceAllUsesWith(New);
2860 RecWithFlags->eraseFromParent();
2861 CurRec = New;
2862 } else
2863 RecWithFlags->dropPoisonGeneratingFlags();
2864 } else {
2865 Instruction *Instr = dyn_cast_or_null<Instruction>(
2866 CurRec->getVPSingleValue()->getUnderlyingValue());
2867 (void)Instr;
2868 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2869 "found instruction with poison generating flags not covered by "
2870 "VPRecipeWithIRFlags");
2871 }
2872
2873 // Add new definitions to the worklist.
2874 for (VPValue *Operand : CurRec->operands())
2875 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2876 Worklist.push_back(OpDef);
2877 }
2878 });
2879
2880 // Traverse all the recipes in the VPlan and collect the poison-generating
2881 // recipes in the backward slice starting at the address of a VPWidenRecipe or
2882 // VPInterleaveRecipe.
2883 auto Iter = vp_depth_first_deep(Plan.getEntry());
2884 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
2885 for (VPRecipeBase &Recipe : *VPBB) {
2886 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
2887 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2888 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2889 if (AddrDef && WidenRec->isConsecutive() &&
2890 BlockNeedsPredication(UnderlyingInstr.getParent()))
2891 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2892 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
2893 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2894 if (AddrDef) {
2895 // Check if any member of the interleave group needs predication.
2896 const InterleaveGroup<Instruction> *InterGroup =
2897 InterleaveRec->getInterleaveGroup();
2898 bool NeedPredication = false;
2899 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2900 I < NumMembers; ++I) {
2901 Instruction *Member = InterGroup->getMember(I);
2902 if (Member)
2903 NeedPredication |= BlockNeedsPredication(Member->getParent());
2904 }
2905
2906 if (NeedPredication)
2907 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2908 }
2909 }
2910 }
2911 }
2912}
2913
2914void VPlanTransforms::createInterleaveGroups(
2915 VPlan &Plan,
2916 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
2917 &InterleaveGroups,
2918 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2919 if (InterleaveGroups.empty())
2920 return;
2921
2922 // Interleave memory: for each Interleave Group we marked earlier as relevant
2923 // for this VPlan, replace the Recipes widening its memory instructions with a
2924 // single VPInterleaveRecipe at its insertion point.
2925 VPDominatorTree VPDT;
2926 VPDT.recalculate(Plan);
2927 for (const auto *IG : InterleaveGroups) {
2928 auto *Start =
2929 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
2930 VPIRMetadata InterleaveMD(*Start);
2931 SmallVector<VPValue *, 4> StoredValues;
2932 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
2933 StoredValues.push_back(StoreR->getStoredValue());
2934 for (unsigned I = 1; I < IG->getFactor(); ++I) {
2935 Instruction *MemberI = IG->getMember(I);
2936 if (!MemberI)
2937 continue;
2938 VPWidenMemoryRecipe *MemoryR =
2939 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
2940 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
2941 StoredValues.push_back(StoreR->getStoredValue());
2942 InterleaveMD.intersect(*MemoryR);
2943 }
2944
2945 bool NeedsMaskForGaps =
2946 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
2947 (!StoredValues.empty() && !IG->isFull());
2948
2949 Instruction *IRInsertPos = IG->getInsertPos();
2950 auto *InsertPos =
2951 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
2952
2954 if (auto *Gep = dyn_cast<GetElementPtrInst>(
2955 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
2956 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
2957
2958 // Get or create the start address for the interleave group.
2959 VPValue *Addr = Start->getAddr();
2960 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2961 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
2962 // We cannot re-use the address of member zero because it does not
2963 // dominate the insert position. Instead, use the address of the insert
2964 // position and create a PtrAdd adjusting it to the address of member
2965 // zero.
2966 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2967 // InsertPos or sink loads above zero members to join it.
2968 assert(IG->getIndex(IRInsertPos) != 0 &&
2969 "index of insert position shouldn't be zero");
2970 auto &DL = IRInsertPos->getDataLayout();
2971 APInt Offset(32,
2972 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
2973 IG->getIndex(IRInsertPos),
2974 /*IsSigned=*/true);
2975 VPValue *OffsetVPV =
2976 Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
2977 VPBuilder B(InsertPos);
2978 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
2979 }
2980 // If the group is reverse, adjust the index to refer to the last vector
2981 // lane instead of the first. We adjust the index from the first vector
2982 // lane, rather than directly getting the pointer for lane VF - 1, because
2983 // the pointer operand of the interleaved access is supposed to be uniform.
2984 if (IG->isReverse()) {
2985 auto *ReversePtr = new VPVectorEndPointerRecipe(
2986 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
2987 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
2988 ReversePtr->insertBefore(InsertPos);
2989 Addr = ReversePtr;
2990 }
2991 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
2992 InsertPos->getMask(), NeedsMaskForGaps,
2993 InterleaveMD, InsertPos->getDebugLoc());
2994 VPIG->insertBefore(InsertPos);
2995
2996 unsigned J = 0;
2997 for (unsigned i = 0; i < IG->getFactor(); ++i)
2998 if (Instruction *Member = IG->getMember(i)) {
2999 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3000 if (!Member->getType()->isVoidTy()) {
3001 VPValue *OriginalV = MemberR->getVPSingleValue();
3002 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3003 J++;
3004 }
3005 MemberR->eraseFromParent();
3006 }
3007 }
3008}
3009
3010/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3011/// value, phi and backedge value. In the following example:
3012///
3013/// vector.ph:
3014/// Successor(s): vector loop
3015///
3016/// <x1> vector loop: {
3017/// vector.body:
3018/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3019/// ...
3020/// EMIT branch-on-count ...
3021/// No successors
3022/// }
3023///
3024/// WIDEN-INDUCTION will get expanded to:
3025///
3026/// vector.ph:
3027/// ...
3028/// vp<%induction.start> = ...
3029/// vp<%induction.increment> = ...
3030///
3031/// Successor(s): vector loop
3032///
3033/// <x1> vector loop: {
3034/// vector.body:
3035/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3036/// ...
3037/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3038/// EMIT branch-on-count ...
3039/// No successors
3040/// }
3041static void
3042expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3043 VPTypeAnalysis &TypeInfo) {
3044 VPlan *Plan = WidenIVR->getParent()->getPlan();
3045 VPValue *Start = WidenIVR->getStartValue();
3046 VPValue *Step = WidenIVR->getStepValue();
3047 VPValue *VF = WidenIVR->getVFValue();
3048 DebugLoc DL = WidenIVR->getDebugLoc();
3049
3050 // The value from the original loop to which we are mapping the new induction
3051 // variable.
3052 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3053
3054 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3055 Instruction::BinaryOps AddOp;
3056 Instruction::BinaryOps MulOp;
3057 // FIXME: The newly created binary instructions should contain nsw/nuw
3058 // flags, which can be found from the original scalar operations.
3059 VPIRFlags Flags;
3060 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3061 AddOp = Instruction::Add;
3062 MulOp = Instruction::Mul;
3063 } else {
3064 AddOp = ID.getInductionOpcode();
3065 MulOp = Instruction::FMul;
3066 Flags = ID.getInductionBinOp()->getFastMathFlags();
3067 }
3068
3069 // If the phi is truncated, truncate the start and step values.
3070 VPBuilder Builder(Plan->getVectorPreheader());
3071 Type *StepTy = TypeInfo.inferScalarType(Step);
3072 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3073 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3074 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3075 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3076 StepTy = Ty;
3077 }
3078
3079 // Construct the initial value of the vector IV in the vector loop preheader.
3080 Type *IVIntTy =
3081 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3082 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3083 if (StepTy->isFloatingPointTy())
3084 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3085
3086 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3087 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3088
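// Compute the per-lane start of the vector IV:
// <Start + 0*Step, Start + 1*Step, ..., Start + (VF-1)*Step>.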
3089 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3090 Init =
3091 Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
3092
3093 // Create the widened phi of the vector IV.
3094 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
3095 WidenIVR->getDebugLoc(), "vec.ind");
3096 WidePHI->addOperand(Init);
3097 WidePHI->insertBefore(WidenIVR);
3098
3099 // Create the backedge value for the vector IV.
3100 VPValue *Inc;
3101 VPValue *Prev;
3102 // If unrolled, use the increment and prev value from the operands.
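// Otherwise compute the increment as Step * VF, broadcast to a vector, and
// use the widened phi itself as the previous value.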
3103 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3104 Inc = SplatVF;
3105 Prev = WidenIVR->getLastUnrolledPartOperand();
3106 } else {
3107 if (VPRecipeBase *R = VF->getDefiningRecipe())
3108 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3109 // Multiply the vectorization factor by the step using integer or
3110 // floating-point arithmetic as appropriate.
3111 if (StepTy->isFloatingPointTy())
3112 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3113 DL);
3114 else
3115 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3116 TypeInfo.inferScalarType(VF), DL);
3117
3118 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3119 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3120 Prev = WidePHI;
3121 }
3122
3123 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3124 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3125 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3126 WidenIVR->getDebugLoc(), "vec.ind.next");
3127
3128 WidePHI->addOperand(Next);
3129
3130 WidenIVR->replaceAllUsesWith(WidePHI);
3131}
3132
3133/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3134/// initial value, phi and backedge value. In the following example:
3135///
3136/// <x1> vector loop: {
3137/// vector.body:
3138/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3139/// ...
3140/// EMIT branch-on-count ...
3141/// }
3142///
3143/// WIDEN-POINTER-INDUCTION will get expanded to:
3144///
3145/// <x1> vector loop: {
3146/// vector.body:
3147/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3148/// EMIT %mul = mul %stepvector, %step
3149/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3150/// ...
3151/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3152/// EMIT branch-on-count ...
3153/// }
3154 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3155 VPTypeAnalysis &TypeInfo) {
3156 VPlan *Plan = R->getParent()->getPlan();
3157 VPValue *Start = R->getStartValue();
3158 VPValue *Step = R->getStepValue();
3159 VPValue *VF = R->getVFValue();
3160
3161 assert(R->getInductionDescriptor().getKind() ==
3163 "Not a pointer induction according to InductionDescriptor!");
3164 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3165 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3166 "Recipe should have been replaced");
3167
3168 VPBuilder Builder(R);
3169 DebugLoc DL = R->getDebugLoc();
3170
3171 // Build a scalar pointer phi.
3172 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3173
3174 // Create actual address geps that use the pointer phi as base and a
3175 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3176 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3177 Type *StepTy = TypeInfo.inferScalarType(Step);
3178 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3179 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3180 VPValue *PtrAdd = Builder.createNaryOp(
3181 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3182 R->replaceAllUsesWith(PtrAdd);
3183
3184 // Create the backedge value for the scalar pointer phi.
3185 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3186 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3187 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3188 DL);
3189 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3190
3191 VPValue *InductionGEP =
3192 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3193 ScalarPtrPhi->addOperand(InductionGEP);
3194}
3195
3196 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3197 // Replace loop regions with explicit CFG.
3198 SmallVector<VPRegionBlock *> LoopRegions;
3199 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3200 vp_depth_first_deep(Plan.getEntry()))) {
3201 if (!R->isReplicator())
3202 LoopRegions.push_back(R);
3203 }
3204 for (VPRegionBlock *R : LoopRegions)
3205 R->dissolveToCFGLoop();
3206}
3207
3208 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3209 VPTypeAnalysis TypeInfo(Plan);
3210 SmallVector<VPRecipeBase *> ToRemove;
3211 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3212 vp_depth_first_deep(Plan.getEntry()))) {
3213 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3214 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3215 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3216 ToRemove.push_back(WidenIVR);
3217 continue;
3218 }
3219
3220 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3221 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3222 ToRemove.push_back(WidenIVR);
3223 continue;
3224 }
3225
3226 // Expand VPBlendRecipe into VPInstruction::Select.
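// The incoming values are folded into a chain of selects, so a later incoming
// value takes precedence whenever its mask is true.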
3227 VPBuilder Builder(&R);
3228 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3229 VPValue *Select = Blend->getIncomingValue(0);
3230 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3231 Select = Builder.createSelect(Blend->getMask(I),
3232 Blend->getIncomingValue(I), Select,
3233 R.getDebugLoc(), "predphi");
3234 Blend->replaceAllUsesWith(Select);
3235 ToRemove.push_back(Blend);
3236 }
3237
3238 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3239 Expr->decompose();
3240 ToRemove.push_back(Expr);
3241 }
3242
3243 VPValue *VectorStep;
3244 VPValue *ScalarStep;
3245 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3246 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3247 continue;
3248
3249 // Expand WideIVStep.
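// WideIVStep is lowered to VectorStep * ScalarStep in the IV type, casting or
// truncating both operands to IVTy first where needed.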
3250 auto *VPI = cast<VPInstruction>(&R);
3251 Type *IVTy = TypeInfo.inferScalarType(VPI);
3252 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3253 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3254 ? Instruction::UIToFP
3255 : Instruction::Trunc;
3256 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3257 }
3258
3259 [[maybe_unused]] auto *ConstStep =
3260 ScalarStep->isLiveIn()
3261 ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
3262 : nullptr;
3263 assert(!ConstStep || ConstStep->getValue() != 1);
3264 (void)ConstStep;
3265 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3266 ScalarStep =
3267 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3268 }
3269
3270 VPIRFlags Flags;
3271 if (IVTy->isFloatingPointTy())
3272 Flags = {VPI->getFastMathFlags()};
3273
3274 unsigned MulOpc =
3275 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3276 VPInstruction *Mul = Builder.createNaryOp(
3277 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3278 VectorStep = Mul;
3279 VPI->replaceAllUsesWith(VectorStep);
3280 ToRemove.push_back(VPI);
3281 }
3282 }
3283
3284 for (VPRecipeBase *R : ToRemove)
3285 R->eraseFromParent();
3286}
3287
3288 void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3289 VPBasicBlock *EarlyExitVPBB,
3290 VPlan &Plan,
3291 VPBasicBlock *HeaderVPBB,
3292 VPBasicBlock *LatchVPBB) {
3293 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3294 if (!EarlyExitVPBB->getSinglePredecessor() &&
3295 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3296 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3297 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3298 "unsupported early exit VPBB");
3299 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3300 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3301 // of the phis.
3302 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3303 cast<VPIRPhi>(&R)->swapOperands();
3304 }
3305
3306 VPBuilder Builder(LatchVPBB->getTerminator());
3307 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3308 assert(
3309 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
3310 "Terminator must be be BranchOnCond");
3311 VPValue *CondOfEarlyExitingVPBB =
3312 EarlyExitingVPBB->getTerminator()->getOperand(0);
3313 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3314 ? CondOfEarlyExitingVPBB
3315 : Builder.createNot(CondOfEarlyExitingVPBB);
3316
3317 // Split the middle block and have it conditionally branch to the early exit
3318 // block if CondToEarlyExit.
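// AnyOf reduces the per-lane early-exit condition to a single boolean that is
// true if any lane of CondToEarlyExit is true.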
3319 VPValue *IsEarlyExitTaken =
3320 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3321 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3322 VPBasicBlock *VectorEarlyExitVPBB =
3323 Plan.createVPBasicBlock("vector.early.exit");
3324 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3325 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3326 NewMiddle->swapSuccessors();
3327
3328 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3329
3330 // Update the exit phis in the early exit block.
3331 VPBuilder MiddleBuilder(NewMiddle);
3332 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3333 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3334 auto *ExitIRI = cast<VPIRPhi>(&R);
3335 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3336 // a single predecessor and 1 if it has two.
3337 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3338 if (ExitIRI->getNumOperands() != 1) {
3339 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3340 // predecessor. Extract its last lane.
3341 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3342 }
3343
3344 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3345 if (!IncomingFromEarlyExit->isLiveIn()) {
3346 // Update the incoming value from the early exit.
3347 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3348 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3349 "first.active.lane");
3350 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3351 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3352 nullptr, "early.exit.value");
3353 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3354 }
3355 }
3356 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3357
3358 // Replace the condition controlling the non-early exit from the vector loop
3359 // with one exiting if either the original condition of the vector latch is
3360 // true or the early exit has been taken.
3361 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3362 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3363 "Unexpected terminator");
3364 auto *IsLatchExitTaken =
3365 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3366 LatchExitingBranch->getOperand(1));
3367 auto *AnyExitTaken = Builder.createNaryOp(
3368 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3369 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3370 LatchExitingBranch->eraseFromParent();
3371}
3372
3373 /// This function tries to convert extended in-loop reductions to
3374/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3375/// valid. The created recipe must be decomposed to its constituent
3376/// recipes before execution.
3377static VPExpressionRecipe *
3378 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3379 VFRange &Range) {
3380 Type *RedTy = Ctx.Types.inferScalarType(Red);
3381 VPValue *VecOp = Red->getVecOp();
3382
3383 // Clamp the range if using extended-reduction is profitable.
3384 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
3385 Type *SrcTy) -> bool {
3386 return LoopVectorizationPlanner::getDecisionAndClampRange(
3387 [&](ElementCount VF) {
3388 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3389 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3390 InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3391 Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
3392 CostKind);
3393 InstructionCost ExtCost =
3394 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3395 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3396 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3397 },
3398 Range);
3399 };
3400
3401 VPValue *A;
3402 // Match reduce(ext(A)).
3403 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3404 IsExtendedRedValidAndClampRange(
3405 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3406 cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
3407 Instruction::CastOps::ZExt,
3408 Ctx.Types.inferScalarType(A)))
3409 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3410
3411 return nullptr;
3412}
3413
3414 /// This function tries to convert extended in-loop reductions to
3415/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3416/// and valid. The created VPExpressionRecipe must be decomposed to its
3417/// constituent recipes before execution. Patterns of the
3418/// VPExpressionRecipe:
3419/// reduce.add(mul(...)),
3420/// reduce.add(mul(ext(A), ext(B))),
3421/// reduce.add(ext(mul(ext(A), ext(B)))).
3422static VPExpressionRecipe *
3423 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3424 VPCostContext &Ctx, VFRange &Range) {
3425 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3426 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3427 return nullptr;
3428
3429 Type *RedTy = Ctx.Types.inferScalarType(Red);
3430
3431 // Clamp the range if using multiply-accumulate-reduction is profitable.
3432 auto IsMulAccValidAndClampRange =
3433 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
3434 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
3435 return LoopVectorizationPlanner::getDecisionAndClampRange(
3436 [&](ElementCount VF) {
3437 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3438 Type *SrcTy =
3439 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3440 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3441 InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
3442 isZExt, Opcode, RedTy, SrcVecTy, CostKind);
3443 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3444 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3445 InstructionCost ExtCost = 0;
3446 if (Ext0)
3447 ExtCost += Ext0->computeCost(VF, Ctx);
3448 if (Ext1)
3449 ExtCost += Ext1->computeCost(VF, Ctx);
3450 if (OuterExt)
3451 ExtCost += OuterExt->computeCost(VF, Ctx);
3452
3453 return MulAccCost.isValid() &&
3454 MulAccCost < ExtCost + MulCost + RedCost;
3455 },
3456 Range);
3457 };
3458
3459 VPValue *VecOp = Red->getVecOp();
3460 VPValue *A, *B;
3461 // Try to match reduce.add(mul(...)).
3462 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3463 auto *RecipeA =
3464 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3465 auto *RecipeB =
3466 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3467 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3468
3469 // Match reduce.add(mul(ext, ext)).
3470 if (RecipeA && RecipeB &&
3471 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
3472 match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3473 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3474 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
3475 Instruction::CastOps::ZExt,
3476 Mul, RecipeA, RecipeB, nullptr)) {
3477 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3478 }
3479 // Match reduce.add(mul).
3480 if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
3481 return new VPExpressionRecipe(Mul, Red);
3482 }
3483 // Match reduce.add(ext(mul(ext(A), ext(B)))).
3484 // All extend recipes must have the same opcode, or A == B, which can be
3485 // transformed to reduce.add(zext(mul(sext(A), sext(B)))).
3486 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
3487 m_ZExtOrSExt(m_VPValue()))))) {
3488 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3489 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3490 auto *Ext0 =
3491 cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
3492 auto *Ext1 =
3493 cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
3494 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3495 Ext0->getOpcode() == Ext1->getOpcode() &&
3496 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
3497 Instruction::CastOps::ZExt,
3498 Mul, Ext0, Ext1, Ext)) {
3499 auto *NewExt0 = new VPWidenCastRecipe(
3500 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3501 Ext0->getDebugLoc());
3502 NewExt0->insertBefore(Ext0);
3503
3504 VPWidenCastRecipe *NewExt1 = NewExt0;
3505 if (Ext0 != Ext1) {
3506 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3507 Ext->getResultType(), *Ext1,
3508 Ext1->getDebugLoc());
3509 NewExt1->insertBefore(Ext1);
3510 }
3511 Mul->setOperand(0, NewExt0);
3512 Mul->setOperand(1, NewExt1);
3513 Red->setOperand(1, Mul);
3514 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3515 }
3516 }
3517 return nullptr;
3518}
3519
3520/// This function tries to create abstract recipes from the reduction recipe for
3521/// following optimizations and cost estimation.
3522 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3523 VPCostContext &Ctx,
3524 VFRange &Range) {
3525 VPExpressionRecipe *AbstractR = nullptr;
3526 auto IP = std::next(Red->getIterator());
3527 auto *VPBB = Red->getParent();
3528 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3529 AbstractR = MulAcc;
3530 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3531 AbstractR = ExtRed;
3532 // Cannot create abstract inloop reduction recipes.
3533 if (!AbstractR)
3534 return;
3535
3536 AbstractR->insertBefore(*VPBB, IP);
3537 Red->replaceAllUsesWith(AbstractR);
3538}
3539
3540 void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
3541 VFRange &Range) {
3542 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3543 vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
3544 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3545 if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
3546 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
3547 }
3548 }
3549}
3550
3551 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3552 if (Plan.hasScalarVFOnly())
3553 return;
3554
3555#ifndef NDEBUG
3556 VPDominatorTree VPDT;
3557 VPDT.recalculate(Plan);
3558#endif
3559
3560 SmallVector<VPValue *> VPValues;
3563 append_range(VPValues, Plan.getLiveIns());
3564 for (VPRecipeBase &R : *Plan.getEntry())
3565 append_range(VPValues, R.definedValues());
3566
3567 auto *VectorPreheader = Plan.getVectorPreheader();
3568 for (VPValue *VPV : VPValues) {
3570 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3571 isa<Constant>(VPV->getLiveInIRValue())))
3572 continue;
3573
3574 // Add explicit broadcast at the insert point that dominates all users.
3575 VPBasicBlock *HoistBlock = VectorPreheader;
3576 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3577 for (VPUser *User : VPV->users()) {
3578 if (User->usesScalars(VPV))
3579 continue;
3580 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3581 HoistPoint = HoistBlock->begin();
3582 else
3583 assert(VPDT.dominates(VectorPreheader,
3584 cast<VPRecipeBase>(User)->getParent()) &&
3585 "All users must be in the vector preheader or dominated by it");
3586 }
3587
3588 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3589 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3590 VPV->replaceUsesWithIf(Broadcast,
3591 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3592 return Broadcast != &U && !U.usesScalars(VPV);
3593 });
3594 }
3595}
3596
3597 void VPlanTransforms::materializeConstantVectorTripCount(
3598 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3599 PredicatedScalarEvolution &PSE) {
3600 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3601 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3602
3603 VPValue *TC = Plan.getTripCount();
3604 // Skip cases for which the trip count may be non-trivial to materialize.
3605 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3606 // tail is required.
3607 if (!Plan.hasScalarTail() ||
3608 Plan.getMiddleBlock()->getSingleSuccessor() ==
3609 Plan.getScalarPreheader() ||
3610 !TC->isLiveIn())
3611 return;
3612
3613 // Materialize vector trip counts for constants early if it can simply
3614 // be computed as (Original TC / VF * UF) * VF * UF.
3615 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3616 // tail-folded loops.
3617 ScalarEvolution &SE = *PSE.getSE();
3618 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3619 if (!isa<SCEVConstant>(TCScev))
3620 return;
3621 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3622 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3623 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3624 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3625}
3626
3627 void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
3628 VPBasicBlock *VectorPH) {
3629 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
3630 if (BTC->getNumUsers() == 0)
3631 return;
3632
3633 VPBuilder Builder(VectorPH, VectorPH->begin());
3634 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3635 auto *TCMO = Builder.createNaryOp(
3636 Instruction::Sub,
3637 {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
3638 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3639 BTC->replaceAllUsesWith(TCMO);
3640}
3641
3642 void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
3643 if (Plan.hasScalarVFOnly())
3644 return;
3645
3646 VPTypeAnalysis TypeInfo(Plan);
3647 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3648 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3649 vp_depth_first_shallow(Plan.getEntry()));
3650 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3651 vp_depth_first_shallow(LoopRegion->getEntry()));
3652 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
3653 // excluding ones in replicate regions. Those are not materialized explicitly
3654 // yet. Those vector users are still handled in VPReplicateRegion::execute(),
3655 // via shouldPack().
3656 // TODO: materialize build vectors for replicating recipes in replicating
3657 // regions.
3658 // TODO: materialize build vectors for VPInstructions.
3659 for (VPBasicBlock *VPBB :
3660 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3661 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3662 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
3663 auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
3664 VPRegionBlock *ParentRegion =
3665 cast<VPRecipeBase>(U)->getParent()->getParent();
3666 return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
3667 };
3668 if (!RepR || RepR->isSingleScalar() ||
3669 none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
3670 continue;
3671
3672 Type *ScalarTy = TypeInfo.inferScalarType(RepR);
3673 unsigned Opcode = ScalarTy->isStructTy()
3676 auto *BuildVector = new VPInstruction(Opcode, {RepR});
3677 BuildVector->insertAfter(RepR);
3678
3679 RepR->replaceUsesWithIf(
3680 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3681 VPUser &U, unsigned) {
3682 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3683 });
3684 }
3685 }
3686}
3687
3688 void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
3689 VPBasicBlock *VectorPHVPBB,
3690 bool TailByMasking,
3691 bool RequiresScalarEpilogue) {
3692 VPValue &VectorTC = Plan.getVectorTripCount();
3693 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3694 // There's nothing to do if there are no users of the vector trip count or its
3695 // IR value has already been set.
3696 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3697 return;
3698
3699 VPValue *TC = Plan.getTripCount();
3700 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3701 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3702 VPValue *Step = &Plan.getVFxUF();
3703
3704 // If the tail is to be folded by masking, round the number of iterations N
3705 // up to a multiple of Step instead of rounding down. This is done by first
3706 // adding Step-1 and then rounding down. Note that it's ok if this addition
3707 // overflows: the vector induction variable will eventually wrap to zero given
3708 // that it starts at zero and its Step is a power of two; the loop will then
3709 // exit, with the last early-exit vector comparison also producing all-true.
3710 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3711 // is accounted for in emitIterationCountCheck that adds an overflow check.
3712 if (TailByMasking) {
3713 TC = Builder.createNaryOp(
3714 Instruction::Add,
3715 {TC, Builder.createNaryOp(
3716 Instruction::Sub,
3717 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
3718 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3719 }
3720
3721 // Now we need to generate the expression for the part of the loop that the
3722 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3723 // iterations are not required for correctness, or N - Step, otherwise. Step
3724 // is equal to the vectorization factor (number of SIMD elements) times the
3725 // unroll factor (number of SIMD instructions).
3726 VPValue *R =
3727 Builder.createNaryOp(Instruction::URem, {TC, Step},
3728 DebugLoc::getCompilerGenerated(), "n.mod.vf");
3729
3730 // There are cases where we *must* run at least one iteration in the remainder
3731 // loop. See the cost model for when this can happen. If the step evenly
3732 // divides the trip count, we set the remainder to be equal to the step. If
3733 // the step does not evenly divide the trip count, no adjustment is necessary
3734 // since there will already be scalar iterations. Note that the minimum
3735 // iterations check ensures that N >= Step.
3736 if (RequiresScalarEpilogue) {
3737 assert(!TailByMasking &&
3738 "requiring scalar epilogue is not supported with fail folding");
3739 VPValue *IsZero = Builder.createICmp(
3740 CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
3741 R = Builder.createSelect(IsZero, Step, R);
3742 }
3743
3744 VPValue *Res = Builder.createNaryOp(
3745 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
3746 VectorTC.replaceAllUsesWith(Res);
3747}
3748
3749 void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
3750 ElementCount VFEC) {
3751 VPBuilder Builder(VectorPH, VectorPH->begin());
3752 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3753 VPValue &VF = Plan.getVF();
3754 VPValue &VFxUF = Plan.getVFxUF();
3755 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
3756 // used.
3757 // TODO: Assert that they aren't used.
3758
3759 // If there are no users of the runtime VF, compute VFxUF by constant folding
3760 // the multiplication of VF and UF.
3761 if (VF.getNumUsers() == 0) {
3762 VPValue *RuntimeVFxUF =
3763 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
3764 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
3765 return;
3766 }
3767
3768 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
3769 // vscale) * UF.
3770 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
3772 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
3773 VF.replaceUsesWithIf(
3774 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
3775 }
3776 VF.replaceAllUsesWith(RuntimeVF);
3777
3778 VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
3779 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
3780 VFxUF.replaceAllUsesWith(MulByUF);
3781}
3782
3783 DenseMap<const SCEV *, Value *>
3784 VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
3785 const DataLayout &DL = SE.getDataLayout();
3786 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
3787
3788 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
3789 BasicBlock *EntryBB = Entry->getIRBasicBlock();
3790 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
3791 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
3792 if (isa<VPIRInstruction, VPIRPhi>(&R))
3793 continue;
3794 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3795 if (!ExpSCEV)
3796 break;
3797 const SCEV *Expr = ExpSCEV->getSCEV();
3798 Value *Res =
3799 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
3800 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
3801 VPValue *Exp = Plan.getOrAddLiveIn(Res);
3802 ExpSCEV->replaceAllUsesWith(Exp);
3803 if (Plan.getTripCount() == ExpSCEV)
3804 Plan.resetTripCount(Exp);
3805 ExpSCEV->eraseFromParent();
3806 }
3807 assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
3808 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
3809 "after any VPIRInstructions");
3810 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
3811 // to the VPIRBasicBlock.
3812 auto EI = Entry->begin();
3813 for (Instruction &I : drop_end(*EntryBB)) {
3814 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
3815 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
3816 EI++;
3817 continue;
3818 }
3819 VPIRInstruction::create(I)->insertBefore(*Entry, EI);
3820 }
3821
3822 return ExpandedSCEVs;
3823}
3824
3825/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
3826/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
3827/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
3828/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
3829/// an index-independent load if it feeds all wide ops at all indices (\p OpV
3830/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
3831/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
3832/// is defined at \p Idx of a load interleave group.
3833static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
3834 VPValue *OpV, unsigned Idx) {
3835 auto *DefR = OpV->getDefiningRecipe();
3836 if (!DefR)
3837 return WideMember0->getOperand(OpIdx) == OpV;
3838 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
3839 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
3840
3841 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
3842 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
3843 return false;
3844}
3845
3846/// Returns true if \p IR is a full interleave group with factor and number of
3847/// members both equal to \p VF. The interleave group must also access the full
3848/// vector width \p VectorRegWidth.
3849 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3850 unsigned VF, VPTypeAnalysis &TypeInfo,
3851 unsigned VectorRegWidth) {
3852 if (!InterleaveR)
3853 return false;
3854
3855 Type *GroupElementTy = nullptr;
3856 if (InterleaveR->getStoredValues().empty()) {
3857 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
3858 if (!all_of(InterleaveR->definedValues(),
3859 [&TypeInfo, GroupElementTy](VPValue *Op) {
3860 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3861 }))
3862 return false;
3863 } else {
3864 GroupElementTy =
3865 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
3866 if (!all_of(InterleaveR->getStoredValues(),
3867 [&TypeInfo, GroupElementTy](VPValue *Op) {
3868 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3869 }))
3870 return false;
3871 }
3872
3873 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3874 auto IG = InterleaveR->getInterleaveGroup();
3875 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3876 GroupSize == VectorRegWidth;
3877}
3878
3879/// Returns true if \p VPValue is a narrow VPValue.
3880static bool isAlreadyNarrow(VPValue *VPV) {
3881 if (VPV->isLiveIn())
3882 return true;
3883 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
3884 return RepR && RepR->isSingleScalar();
3885}
3886
3887 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3888 unsigned VectorRegWidth) {
3889 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3890 if (!VectorLoop)
3891 return;
3892
3893 VPTypeAnalysis TypeInfo(Plan);
3894
3895 unsigned VFMinVal = VF.getKnownMinValue();
3896 SmallVector<VPInterleaveRecipe *> StoreGroups;
3897 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3898 if (isa<VPCanonicalIVPHIRecipe>(&R) ||
3899 match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
3900 continue;
3901
3902 if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
3903 vputils::onlyFirstLaneUsed(cast<VPSingleDefRecipe>(&R)))
3904 continue;
3905
3906 // Bail out on recipes not supported at the moment:
3907 // * phi recipes other than the canonical induction
3908 // * recipes writing to memory except interleave groups
3909 // Only support plans with a canonical induction phi.
3910 if (R.isPhi())
3911 return;
3912
3913 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
3914 if (R.mayWriteToMemory() && !InterleaveR)
3915 return;
3916
3917 // Do not narrow interleave groups if there are VectorPointer recipes and
3918 // the plan was unrolled. The recipe implicitly uses VF from
3919 // VPTransformState.
3920 // TODO: Remove restriction once the VF for the VectorPointer offset is
3921 // modeled explicitly as operand.
3922 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
3923 return;
3924
3925 // All other ops are allowed, but we reject uses that cannot be converted
3926 // when checking all allowed consumers (store interleave groups) below.
3927 if (!InterleaveR)
3928 continue;
3929
3930 // Bail out on non-consecutive interleave groups.
3931 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
3932 VectorRegWidth))
3933 return;
3934
3935 // Skip read interleave groups.
3936 if (InterleaveR->getStoredValues().empty())
3937 continue;
3938
3939 // Narrow interleave groups, if all operands are already matching narrow
3940 // ops.
3941 auto *Member0 = InterleaveR->getStoredValues()[0];
3942 if (isAlreadyNarrow(Member0) &&
3943 all_of(InterleaveR->getStoredValues(),
3944 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
3945 StoreGroups.push_back(InterleaveR);
3946 continue;
3947 }
3948
3949 // For now, we only support full interleave groups storing load interleave
3950 // groups.
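// That is, the value stored at member index I must be the value defined at
// member index I of a full load interleave group.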
3951 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
3952 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
3953 if (!DefR)
3954 return false;
3955 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
3956 return IR && IR->getInterleaveGroup()->isFull() &&
3957 IR->getVPValue(Op.index()) == Op.value();
3958 })) {
3959 StoreGroups.push_back(InterleaveR);
3960 continue;
3961 }
3962
3963 // Check if all values feeding InterleaveR are matching wide recipes whose
3964 // operands can be narrowed.
3965 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
3966 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
3967 if (!WideMember0)
3968 return;
3969 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
3970 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
3971 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
3972 R->getNumOperands() > 2)
3973 return;
3974 if (any_of(enumerate(R->operands()),
3975 [WideMember0, Idx = I](const auto &P) {
3976 const auto &[OpIdx, OpV] = P;
3977 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
3978 }))
3979 return;
3980 }
3981 StoreGroups.push_back(InterleaveR);
3982 }
3983
3984 if (StoreGroups.empty())
3985 return;
3986
3987 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
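// NarrowOp maps each stored operand to its narrow form: a full load interleave
// group becomes a single wide load and a wide load becomes a uniform scalar
// load, since the narrowed plan only processes one original iteration.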
3988 auto NarrowOp = [](VPValue *V) -> VPValue * {
3989 auto *R = V->getDefiningRecipe();
3990 if (!R)
3991 return V;
3992 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
3993 // Narrow interleave group to wide load, as transformed VPlan will only
3994 // process one original iteration.
3995 auto *L = new VPWidenLoadRecipe(
3996 *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
3997 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
3998 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
3999 L->insertBefore(LoadGroup);
4000 return L;
4001 }
4002
4003 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4004 assert(RepR->isSingleScalar() &&
4005 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4006 "must be a single scalar load");
4007 return RepR;
4008 }
4009 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4010
4011 VPValue *PtrOp = WideLoad->getAddr();
4012 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4013 PtrOp = VecPtr->getOperand(0);
4014 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4015 // process one original iteration.
4016 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4017 /*IsUniform*/ true,
4018 /*Mask*/ nullptr, *WideLoad);
4019 N->insertBefore(WideLoad);
4020 return N;
4021 };
4022
4023 // Narrow operation tree rooted at store groups.
4024 for (auto *StoreGroup : StoreGroups) {
4025 VPValue *Res = nullptr;
4026 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4027 if (isAlreadyNarrow(Member0)) {
4028 Res = Member0;
4029 } else if (auto *WideMember0 =
4030 dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
4031 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4032 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4033 Res = WideMember0;
4034 } else {
4035 Res = NarrowOp(Member0);
4036 }
4037
4038 auto *S = new VPWidenStoreRecipe(
4039 *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
4040 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4041 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4042 S->insertBefore(StoreGroup);
4043 StoreGroup->eraseFromParent();
4044 }
4045
4046 // Adjust induction to reflect that the transformed plan only processes one
4047 // original iteration.
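// The canonical IV now steps by UF (times vscale for scalable VFs) instead of
// VF * UF.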
4048 auto *CanIV = Plan.getCanonicalIV();
4049 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4050 VPBuilder PHBuilder(Plan.getVectorPreheader());
4051
4052 VPValue *UF = Plan.getOrAddLiveIn(
4053 ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
4054 if (VF.isScalable()) {
4055 VPValue *VScale = PHBuilder.createElementCount(
4056 CanIV->getScalarType(), ElementCount::getScalable(1));
4057 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4058 Inc->setOperand(1, VScaleUF);
4059 Plan.getVF().replaceAllUsesWith(VScale);
4060 } else {
4061 Inc->setOperand(1, UF);
4062 Plan.getVF().replaceAllUsesWith(
4063 Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
4064 }
4065 removeDeadRecipes(Plan);
4066}
4067
4068/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4069/// BranchOnCond recipe.
4070 void VPlanTransforms::addBranchWeightToMiddleTerminator(
4071 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4072 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4073 auto *MiddleTerm =
4074 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4075 // Only add branch metadata if there is a (conditional) terminator.
4076 if (!MiddleTerm)
4077 return;
4078
4079 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4080 "must have a BranchOnCond");
4081 // Assume that `TripCount % VectorStep ` is equally distributed.
4082 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4083 if (VF.isScalable() && VScaleForTuning.has_value())
4084 VectorStep *= *VScaleForTuning;
4085 assert(VectorStep > 0 && "trip count should not be zero");
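// The resulting metadata weights the two successors 1 : (VectorStep - 1).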
4086 MDBuilder MDB(Plan.getContext());
4087 MDNode *BranchWeights =
4088 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4089 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4090}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
@ Default
Definition: DwarfDebug.cpp:86
uint64_t Addr
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
iv Induction Variable Users
Definition: IVUsers.cpp:48
licm
Definition: LICM.cpp:381
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static Value * tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Class for arbitrary precision integers.
Definition: APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition: DebugLoc.h:163
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:187
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:229
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:327
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:315
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition: Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:949
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
Value * FoldGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, GEPNoWrapFlags NW) const override
Value * FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS) const override
Value * FoldSelect(Value *C, Value *True, Value *False) const override
Value * FoldCast(Instruction::CastOps Op, Value *V, Type *DestTy) const override
Value * FoldCmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override
bool isCast() const
Definition: Instruction.h:321
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:86
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:524
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:594
uint32_t getNumMembers() const
Definition: VectorUtils.h:542
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1570
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition: MDBuilder.cpp:38
Metadata node.
Definition: Metadata.h:1077
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:99
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
unsigned getOpcode() const
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
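A minimal sketch of the SmallVector operations referenced above; the element values are placeholders.
#include "llvm/ADT/SmallVector.h"
#include <cassert>
#include <iterator>
using namespace llvm;
static void smallVectorDemo() {
  SmallVector<int, 4> V;          // inline storage for 4 elements, heap beyond
  V.push_back(1);
  const int More[] = {2, 3};
  V.append(std::begin(More), std::end(More));
  assert(!V.empty() && V.size() == 3);
}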
An instruction for storing to memory.
Definition: Instructions.h:296
Provides information about what library functions are available for the current target.
LLVM_ABI InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add/...
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
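A hedged sketch of TypeSwitch used to replace an if/dyn_cast chain with case-based dispatch; classifyMemAccess is an illustrative helper, not part of this file.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Classify an instruction; unmatched types fall through to Default.
static StringRef classifyMemAccess(Instruction *I) {
  return TypeSwitch<Instruction *, StringRef>(I)
      .Case<LoadInst>([](LoadInst *) { return StringRef("load"); })
      .Case<StoreInst>([](StoreInst *) { return StringRef("store"); })
      .Default([](Instruction *) { return StringRef("other"); });
}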
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:261
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
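A minimal sketch of the Type factory and query helpers listed above, assuming the caller provides an LLVMContext.
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;
static void typeQueryDemo(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Type *I1 = Type::getInt1Ty(Ctx);
  assert(I32->isIntegerTy() && I32->getScalarSizeInBits() == 32);
  assert(I1->isIntegerTy() && !I1->isFloatingPointTy() && !I1->isPointerTy());
}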
op_range operands()
Definition: User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition: VPlan.h:3458
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3745
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3820
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3772
iterator end()
Definition: VPlan.h:3782
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3780
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3833
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:236
VPRegionBlock * getEnclosingLoopRegion()
Definition: VPlan.cpp:625
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition: VPlan.cpp:597
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition: VPlan.cpp:670
const VPRecipeBase & back() const
Definition: VPlan.h:3794
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2390
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:81
VPRegionBlock * getParent()
Definition: VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:180
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:322
size_t getNumPredecessors() const
Definition: VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition: VPlan.h:204
VPlan * getPlan()
Definition: VPlan.cpp:155
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:160
VPBlockBase * getSingleHierarchicalPredecessor()
Definition: VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:198
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlanUtils.h:237
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition: VPlanUtils.h:156
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:175
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:194
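A hedged sketch of the edge utilities above; it assumes compilation inside the vectorizer where VPlan.h and VPlanUtils.h are visible, and spliceOnEdge is an illustrative helper. insertOnEdge performs the same rewiring in a single call while preserving predecessor/successor positions.
#include "VPlan.h"
#include "VPlanUtils.h"
using namespace llvm;
// Redirect the CFG edge From->To through NewBlock.
static void spliceOnEdge(VPBlockBase *From, VPBlockBase *To,
                         VPBlockBase *NewBlock) {
  VPBlockUtils::disconnectBlocks(From, To);
  VPBlockUtils::connectBlocks(From, NewBlock);
  VPBlockUtils::connectBlocks(NewBlock, To);
}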
A recipe for generating conditional branches on the bits of a mask.
Definition: VPlan.h:2915
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
VPScalarIVStepsRecipe * createScalarIVSteps(Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step, VPValue *VF, DebugLoc DL)
VPInstruction * createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
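A hedged sketch of VPBuilder using only the helpers listed above; it assumes compilation inside the vectorizer (the include naming the header that declares VPBuilder is an assumption), and the block and i1 mask operands are supplied by the caller.
#include "VPlan.h"
#include "VPlanHelpers.h" // assumption: VPBuilder is visible via the vectorizer's internal headers
using namespace llvm;
// Build "select(Cond1 && !Cond2, A, B)" at the end of VPBB.
static VPValue *emitMaskedSelect(VPBasicBlock *VPBB, VPValue *Cond1,
                                 VPValue *Cond2, VPValue *A, VPValue *B) {
  VPBuilder Builder;
  Builder.setInsertPoint(VPBB);  // append new recipes to the end of VPBB
  VPValue *Mask = Builder.createLogicalAnd(Cond1, Builder.createNot(Cond2));
  return Builder.createSelect(Mask, A, B);
}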
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:3401
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:3428
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition: VPlanValue.h:422
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:417
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:395
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:407
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition: VPlan.h:3489
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition: VPlan.h:2960
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:2012
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:2001
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3898
Class to record and manage LLVM IR flags.
Definition: VPlan.h:600
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition: VPlan.h:935
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:976
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition: VPlan.h:1053
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:982
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition: VPlan.h:1006
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition: VPlan.h:1003
@ CanonicalIVIncrementForPart
Definition: VPlan.h:996
@ CalculateTripCountMinusVF
Definition: VPlan.h:994
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition: VPlan.h:2525
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2546
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition: VPlan.h:2598
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2557
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition: VPlan.h:3072
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:394
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition: VPlan.h:477
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPBasicBlock * getParent()
Definition: VPlan.h:415
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:482
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition: VPlan.h:2793
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2647
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3933
const VPBlockBase * getEntry() const
Definition: VPlan.h:3969
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition: VPlan.h:3986
const VPBlockBase * getExiting() const
Definition: VPlan.h:3981
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition: VPlan.h:3994
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2837
bool isSingleScalar() const
Definition: VPlan.h:2882
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition: VPlan.h:2906
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition: VPlan.h:3635
VPSingleDef is a base class for recipes modeling a sequence of one or more output IR instructions that define ...
Definition: VPlan.h:521
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:586
virtual VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:43
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Definition: VPlanAnalysis.h:67
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:197
operand_range operands()
Definition: VPlanValue.h:265
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:241
operand_iterator op_end()
Definition: VPlanValue.h:263
operand_iterator op_begin()
Definition: VPlanValue.h:261
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:236
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:230
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:125
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:174
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition: VPlanValue.h:184
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1406
unsigned getNumUsers() const
Definition: VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:169
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1410
user_range users()
Definition: VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition: VPlan.h:1830
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition: VPlan.h:1874
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3530
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1480
A recipe for handling GEP instructions.
Definition: VPlan.h:1766
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:2029
PHINode * getPHINode() const
Definition: VPlan.h:2071
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:2057
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:2074
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:2104
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition: VPlan.h:2185
A recipe for widening vector intrinsics.
Definition: VPlan.h:1537
A common base class for widening memory operations.
Definition: VPlan.h:3114
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:3176
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:3169
A recipe for widened phis.
Definition: VPlan.h:2240
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1437
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:4036
bool hasVF(ElementCount VF) const
Definition: VPlan.h:4245
LLVMContext & getContext() const
Definition: VPlan.h:4233
VPBasicBlock * getEntry()
Definition: VPlan.h:4135
VPRegionBlock * createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="", bool IsReplicator=false)
Create a new VPRegionBlock with Entry, Exiting and Name.
Definition: VPlan.h:4376
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:4225
bool hasScalableVF() const
Definition: VPlan.h:4246
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:4231
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:4228
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:4197
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition: VPlan.h:4302
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:4218
unsigned getUF() const
Definition: VPlan.h:4265
bool hasUF(unsigned UF) const
Definition: VPlan.h:4263
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition: VPlan.h:4187
void setVF(ElementCount VF)
Definition: VPlan.h:4239
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition: VPlan.h:4278
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1040
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:4211
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:4160
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition: VPlan.h:4366
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:4287
bool hasScalarVFOnly() const
Definition: VPlan.h:4256
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:4178
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition: VPlan.h:4317
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:4341
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:4183
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition: VPlan.h:4314
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition: VPlan.h:4140
void setUF(unsigned UF)
Definition: VPlan.h:4270
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition: VPlan.h:4418
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:390
iterator_range< user_iterator > users()
Definition: Value.h:426
bool hasName() const
Definition: Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
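A minimal sketch of the Value queries above; the name "vec.tmp" and the helper nameAndCountUsers are illustrative only.
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
using namespace llvm;
// Give V a name if it has none, then count its users.
static unsigned nameAndCountUsers(Value *V) {
  if (!V->hasName())
    V->setName("vec.tmp"); // hypothetical name, for illustration only
  unsigned NumUsers = 0;
  for (const User *U : V->users()) {
    (void)U;
    ++NumUsers;
  }
  return NumUsers;
}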
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:259
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:175
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
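A minimal sketch of the ElementCount queries above: a scalable count models "vscale x N" lanes, with only the minimum known at compile time.
#include "llvm/Support/TypeSize.h"
#include <cassert>
using namespace llvm;
static void elementCountDemo() {
  ElementCount Fixed = ElementCount::getFixed(8);       // exactly 8 lanes
  ElementCount Scalable = ElementCount::getScalable(4); // vscale x 4 lanes
  assert(Fixed.isFixed() && Scalable.isScalable());
  assert(Scalable.getKnownMinValue() == 4);
  assert(Scalable.multiplyCoefficientBy(2) == ElementCount::getScalable(8));
}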
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:76
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition: APInt.cpp:2763
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:980
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
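A hedged sketch of the llvm::PatternMatch combinators listed above; m_Value is another matcher from the same header (not shown in the listing), and isAddOfTwo is an illustrative helper.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Returns true if V is "X + 2" with the operands in either order, binding X.
static bool isAddOfTwo(Value *V, Value *&X) {
  return match(V, m_c_Add(m_Value(X), m_SpecificInt(2)));
}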
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount, Op0_t, Op1_t > m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fixed.
VPInstruction_match< VPInstruction::BranchOnCond, Op0_t > m_BranchOnCond(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
Definition: VPlanUtils.h:44
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:32
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:79
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
Definition: VPlanUtils.cpp:17
bool isHeaderMask(const VPValue *V, VPlan &Plan)
Return true if V is a header mask in Plan.
Definition: VPlanUtils.cpp:55
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
Definition: VPlanUtils.cpp:27
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
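A minimal sketch of the range-based predicate wrappers above; the contents of Widths are placeholders.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;
static void rangePredicateDemo() {
  SmallVector<int, 4> Widths = {8, 16, 32};
  assert(all_of(Widths, [](int W) { return W % 8 == 0; }));
  assert(none_of(Widths, [](int W) { return W > 64; }));
}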
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
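A minimal sketch of make_early_inc_range: the iterator is advanced before the loop body runs, so erasing the current element is safe. A std::set stands in for an intrusive recipe list here.
#include "llvm/ADT/STLExtras.h"
#include <set>
using namespace llvm;
// Erase the even elements while iterating over the set.
static void eraseEvens(std::set<int> &S) {
  for (int X : make_early_inc_range(S))
    if (X % 2 == 0)
      S.erase(X); // removes only the element the loop already moved past
}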
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:243
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
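A minimal sketch of PowerOf2Ceil with placeholder values.
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;
static void powerOf2CeilDemo() {
  assert(PowerOf2Ceil(5) == 8);   // rounds up to the next power of two
  assert(PowerOf2Ceil(16) == 16); // powers of two are returned unchanged
}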
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition: VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:77
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Definition: SmallVector.h:1300
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:345
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:595
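A hedged sketch of hash_combine and hash_combine_range, e.g. to key a CSE-style map on an opcode plus operand identifiers; hashKey and the ArrayRef of ids are illustrative assumptions.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
using namespace llvm;
// Fold an opcode and a sequence of operand ids into a single hash_code.
static hash_code hashKey(unsigned Opcode, ArrayRef<unsigned> OperandIds) {
  return hash_combine(
      Opcode, hash_combine_range(OperandIds.begin(), OperandIds.end()));
}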
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2107
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition: Casting.h:836
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:469
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:54
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlanHelpers.h:71
Struct to hold various analysis needed for cost computations.
Definition: VPlanHelpers.h:344
VPTypeAnalysis Types
Definition: VPlanHelpers.h:347
const TargetTransformInfo & TTI
Definition: VPlanHelpers.h:345
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2283
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition: VPlan.h:3236
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:3196
A recipe for widening select instructions.
Definition: VPlan.h:1720
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition: VPlan.h:3318
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:3276
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
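To close the listing, a hedged sketch of how a caller might sequence a few of the transforms above. The ordering is illustrative only (not the vectorizer's actual pipeline), the VPlanTransforms:: qualification is assumed, and the includes assume compilation inside the vectorizer.
#include "VPlan.h"
#include "VPlanTransforms.h"
using namespace llvm;
// Run a small, illustrative cleanup sequence over an already-built plan.
static void runIllustrativeCleanups(VPlan &Plan) {
  VPlanTransforms::simplifyRecipes(Plan);
  VPlanTransforms::removeBranchOnConst(Plan);
  VPlanTransforms::removeDeadRecipes(Plan);
  VPlanTransforms::cse(Plan);
}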