1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
33#include "llvm/IR/Intrinsics.h"
34#include "llvm/IR/MDBuilder.h"
38
39using namespace llvm;
40using namespace VPlanPatternMatch;
41
42bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
43 VPlanPtr &Plan,
44 function_ref<const InductionDescriptor *(PHINode *)>
45 GetIntOrFpInductionDescriptor,
46 const TargetLibraryInfo &TLI) {
47
48 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
49 Plan->getVectorLoopRegion());
50 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
51 // Skip blocks outside region
52 if (!VPBB->getParent())
53 break;
54 VPRecipeBase *Term = VPBB->getTerminator();
55 auto EndIter = Term ? Term->getIterator() : VPBB->end();
56 // Introduce each ingredient into VPlan.
57 for (VPRecipeBase &Ingredient :
58 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
59
60 VPValue *VPV = Ingredient.getVPSingleValue();
61 if (!VPV->getUnderlyingValue())
62 continue;
63
64 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
65
66 VPRecipeBase *NewRecipe = nullptr;
67 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
68 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
69 const auto *II = GetIntOrFpInductionDescriptor(Phi);
70 if (!II) {
71 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
72 for (VPValue *Op : PhiR->operands())
73 NewRecipe->addOperand(Op);
74 } else {
75 VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
76 VPValue *Step =
78 NewRecipe = new VPWidenIntOrFpInductionRecipe(
79 Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
80 }
81 } else {
82 assert(isa<VPInstruction>(&Ingredient) &&
83 "only VPInstructions expected here");
84 assert(!isa<PHINode>(Inst) && "phis should be handled above");
85 // Create VPWidenMemoryRecipe for loads and stores.
86 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
87 NewRecipe = new VPWidenLoadRecipe(
88 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
89 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
90 Ingredient.getDebugLoc());
91 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
92 NewRecipe = new VPWidenStoreRecipe(
93 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
94 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
95 VPIRMetadata(*Store), Ingredient.getDebugLoc());
96 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
97 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
98 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
99 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
100 if (VectorID == Intrinsic::not_intrinsic)
101 return false;
102 NewRecipe = new VPWidenIntrinsicRecipe(
103 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
104 {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
105 CI->getDebugLoc());
106 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
107 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
108 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
109 NewRecipe = new VPWidenCastRecipe(
110 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
111 } else {
112 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
113 }
114 }
115
116 NewRecipe->insertBefore(&Ingredient);
117 if (NewRecipe->getNumDefinedValues() == 1)
118 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
119 else
120 assert(NewRecipe->getNumDefinedValues() == 0 &&
121 "Only recpies with zero or one defined values expected");
122 Ingredient.eraseFromParent();
123 }
124 }
125 return true;
126}
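// For illustration, a rough sketch of how a caller might drive this transform
// (the lambda and the Legal object are hypothetical stand-ins for whatever
// provides induction descriptors):
//
//   bool Converted = VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
//       Plan,
//       [&](PHINode *P) -> const InductionDescriptor * {
//         return Legal->getIntOrFpInductionDescriptor(P);
//       },
//       TLI);
//   // A 'false' result means some call had no vector intrinsic equivalent.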
127
128static bool sinkScalarOperands(VPlan &Plan) {
129 auto Iter = vp_depth_first_deep(Plan.getEntry());
130 bool Changed = false;
131 // First, collect the operands of all recipes in replicate blocks as seeds for
132 // sinking.
133 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
134 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
135 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
136 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
137 continue;
138 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
139 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
140 continue;
141 for (auto &Recipe : *VPBB) {
142 for (VPValue *Op : Recipe.operands())
143 if (auto *Def =
144 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
145 WorkList.insert({VPBB, Def});
146 }
147 }
148
149 bool ScalarVFOnly = Plan.hasScalarVFOnly();
150 // Try to sink each replicate or scalar IV steps recipe in the worklist.
151 for (unsigned I = 0; I != WorkList.size(); ++I) {
152 VPBasicBlock *SinkTo;
153 VPSingleDefRecipe *SinkCandidate;
154 std::tie(SinkTo, SinkCandidate) = WorkList[I];
155 if (SinkCandidate->getParent() == SinkTo ||
156 SinkCandidate->mayHaveSideEffects() ||
157 SinkCandidate->mayReadOrWriteMemory())
158 continue;
159 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
160 if (!ScalarVFOnly && RepR->isSingleScalar())
161 continue;
162 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
163 continue;
164
165 bool NeedsDuplicating = false;
166 // All recipe users of the sink candidate must be in the same block SinkTo,
167 // or all users outside of SinkTo must be uniform-after-vectorization
168 // (i.e., only the first lane is used). In the latter case, we need to
169 // duplicate SinkCandidate.
170 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
171 SinkCandidate](VPUser *U) {
172 auto *UI = cast<VPRecipeBase>(U);
173 if (UI->getParent() == SinkTo)
174 return true;
175 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
176 // We only know how to duplicate VPReplicateRecipes and
177 // VPScalarIVStepsRecipes for now.
178 return NeedsDuplicating &&
179 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
180 };
181 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
182 continue;
183
184 if (NeedsDuplicating) {
185 if (ScalarVFOnly)
186 continue;
187 VPSingleDefRecipe *Clone;
188 if (auto *SinkCandidateRepR =
189 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
190 // TODO: Handle converting to uniform recipes as separate transform,
191 // then cloning should be sufficient here.
192 Instruction *I = SinkCandidate->getUnderlyingInstr();
193 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
194 nullptr /*Mask*/, *SinkCandidateRepR);
195 // TODO: add ".cloned" suffix to name of Clone's VPValue.
196 } else {
197 Clone = SinkCandidate->clone();
198 }
199
200 Clone->insertBefore(SinkCandidate);
201 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
202 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
203 });
204 }
205 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
206 for (VPValue *Op : SinkCandidate->operands())
207 if (auto *Def =
208 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
209 WorkList.insert({SinkTo, Def});
210 Changed = true;
211 }
212 return Changed;
213}
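// For illustration, a simplified sketch of the effect of sinkScalarOperands
// (block and value names are made up):
//
//   Before:                       After:
//     vector.body:                  vector.body:
//       %d = add %x, %y               ...
//     pred.store.if:                pred.store.if:
//       store %d, %addr               %d = add %x, %y
//                                     store %d, %addr
//
// The add is only used inside the predicated block, so it is sunk there and
// executed only for active lanes.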
214
215/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
216/// the mask.
217static VPValue *getPredicatedMask(VPRegionBlock *R) {
218 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
219 if (!EntryBB || EntryBB->size() != 1 ||
220 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
221 return nullptr;
222
223 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
224}
225
226/// If \p R is a triangle region, return the 'then' block of the triangle.
227static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
228 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
229 if (EntryBB->getNumSuccessors() != 2)
230 return nullptr;
231
232 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
233 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
234 if (!Succ0 || !Succ1)
235 return nullptr;
236
237 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
238 return nullptr;
239 if (Succ0->getSingleSuccessor() == Succ1)
240 return Succ0;
241 if (Succ1->getSingleSuccessor() == Succ0)
242 return Succ1;
243 return nullptr;
244}
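// For illustration, the triangle shape this helper recognizes (a sketch,
// block names are made up):
//
//        entry (BranchOnMask %m)
//         /        \
//      then         |
//         \        /
//          exiting
//
// Either successor of the entry may be the 'then' block; the other successor
// must be the single successor the 'then' block falls through to.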
245
246// Merge replicate regions in their successor region, if a replicate region
247// is connected to a successor replicate region with the same predicate by a
248// single, empty VPBasicBlock.
249static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
250 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
251
252 // Collect replicate regions followed by an empty block, followed by another
253 // replicate region with matching masks to process front. This is to avoid
254 // iterator invalidation issues while merging regions.
255 SmallVector<VPRegionBlock *, 8> WorkList;
256 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
257 vp_depth_first_deep(Plan.getEntry()))) {
258 if (!Region1->isReplicator())
259 continue;
260 auto *MiddleBasicBlock =
261 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
262 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
263 continue;
264
265 auto *Region2 =
266 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
267 if (!Region2 || !Region2->isReplicator())
268 continue;
269
270 VPValue *Mask1 = getPredicatedMask(Region1);
271 VPValue *Mask2 = getPredicatedMask(Region2);
272 if (!Mask1 || Mask1 != Mask2)
273 continue;
274
275 assert(Mask1 && Mask2 && "both regions must have conditions");
276 WorkList.push_back(Region1);
277 }
278
279 // Move recipes from Region1 to its successor region, if both are triangles.
280 for (VPRegionBlock *Region1 : WorkList) {
281 if (TransformedRegions.contains(Region1))
282 continue;
283 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
284 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
285
286 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
287 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
288 if (!Then1 || !Then2)
289 continue;
290
291 // Note: No fusion-preventing memory dependencies are expected in either
292 // region. Such dependencies should be rejected during earlier dependence
293 // checks, which guarantee accesses can be re-ordered for vectorization.
294 //
295 // Move recipes to the successor region.
296 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
297 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
298
299 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
300 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
301
302 // Move VPPredInstPHIRecipes from the merge block to the successor region's
303 // merge block. Update all users inside the successor region to use the
304 // original values.
305 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
306 VPValue *PredInst1 =
307 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
308 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
309 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
310 return cast<VPRecipeBase>(&U)->getParent() == Then2;
311 });
312
313 // Remove phi recipes that are unused after merging the regions.
314 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
315 Phi1ToMove.eraseFromParent();
316 continue;
317 }
318 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
319 }
320
321 // Remove the dead recipes in Region1's entry block.
322 for (VPRecipeBase &R :
323 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
324 R.eraseFromParent();
325
326 // Finally, remove the first region.
327 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
328 VPBlockUtils::disconnectBlocks(Pred, Region1);
329 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
330 }
331 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
332 TransformedRegions.insert(Region1);
333 }
334
335 return !TransformedRegions.empty();
336}
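// For illustration, a rough sketch of the merge (region and block names are
// made up): two replicate regions guarded by the same mask %m and separated
// only by an empty block,
//
//   pred.load  { entry(BranchOnMask %m) -> if -> continue } -> empty ->
//   pred.store { entry(BranchOnMask %m) -> if -> continue }
//
// become a single region after the recipes from the first region's 'if' and
// 'continue' blocks are moved into the corresponding blocks of the second.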
337
338static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
339 VPlan &Plan) {
340 Instruction *Instr = PredRecipe->getUnderlyingInstr();
341 // Build the triangular if-then region.
342 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
343 assert(Instr->getParent() && "Predicated instruction not in any basic block");
344 auto *BlockInMask = PredRecipe->getMask();
345 auto *MaskDef = BlockInMask->getDefiningRecipe();
346 auto *BOMRecipe = new VPBranchOnMaskRecipe(
347 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
348 auto *Entry =
349 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
350
351 // Replace predicated replicate recipe with a replicate recipe without a
352 // mask but in the replicate region.
353 auto *RecipeWithoutMask = new VPReplicateRecipe(
354 PredRecipe->getUnderlyingInstr(),
355 make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
356 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
357 auto *Pred =
358 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
359
360 VPPredInstPHIRecipe *PHIRecipe = nullptr;
361 if (PredRecipe->getNumUsers() != 0) {
362 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
363 RecipeWithoutMask->getDebugLoc());
364 PredRecipe->replaceAllUsesWith(PHIRecipe);
365 PHIRecipe->setOperand(0, RecipeWithoutMask);
366 }
367 PredRecipe->eraseFromParent();
368 auto *Exiting =
369 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
370 VPRegionBlock *Region =
371 Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
372
373 // Note: first set Entry as region entry and then connect successors starting
374 // from it in order, to propagate the "parent" of each VPBasicBlock.
375 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
376 VPBlockUtils::connectBlocks(Pred, Exiting);
377
378 return Region;
379}
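// For illustration, the region built for a predicated replicate of, e.g., a
// udiv (a sketch; names follow the Twine patterns used above):
//
//   pred.udiv:  [replicate region]
//     pred.udiv.entry:    BranchOnMask %mask
//     pred.udiv.if:       %v = replicate udiv %a, %b    ; mask dropped
//     pred.udiv.continue: %phi = VPPredInstPHIRecipe %v ; only if %v has users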
380
381static void addReplicateRegions(VPlan &Plan) {
382 SmallVector<VPReplicateRecipe *> WorkList;
383 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
384 vp_depth_first_deep(Plan.getEntry()))) {
385 for (VPRecipeBase &R : *VPBB)
386 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
387 if (RepR->isPredicated())
388 WorkList.push_back(RepR);
389 }
390 }
391
392 unsigned BBNum = 0;
393 for (VPReplicateRecipe *RepR : WorkList) {
394 VPBasicBlock *CurrentBlock = RepR->getParent();
395 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
396
397 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
398 SplitBlock->setName(
399 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
400 // Record predicated instructions for the packing optimizations above.
401 VPBlockBase *Region = createReplicateRegion(RepR, Plan);
402 Region->setParent(CurrentBlock->getParent());
403 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
404
405 VPRegionBlock *ParentRegion = Region->getParent();
406 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
407 ParentRegion->setExiting(SplitBlock);
408 }
409}
410
411/// Remove redundant VPBasicBlocks by merging them into their predecessor if
412/// the predecessor has a single successor.
413static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
414 SmallVector<VPBasicBlock *> WorkList;
415 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
416 vp_depth_first_deep(Plan.getEntry()))) {
417 // Don't fold the blocks in the skeleton of the Plan into their single
418 // predecessors for now.
419 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
420 if (!VPBB->getParent())
421 continue;
422 auto *PredVPBB =
423 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
424 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
425 isa<VPIRBasicBlock>(PredVPBB))
426 continue;
427 WorkList.push_back(VPBB);
428 }
429
430 for (VPBasicBlock *VPBB : WorkList) {
431 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
432 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
433 R.moveBefore(*PredVPBB, PredVPBB->end());
434 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
435 auto *ParentRegion = VPBB->getParent();
436 if (ParentRegion && ParentRegion->getExiting() == VPBB)
437 ParentRegion->setExiting(PredVPBB);
438 for (auto *Succ : to_vector(VPBB->successors())) {
439 VPBlockUtils::disconnectBlocks(VPBB, Succ);
440 VPBlockUtils::connectBlocks(PredVPBB, Succ);
441 }
442 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
443 }
444 return !WorkList.empty();
445}
446
447void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
448 // Convert masked VPReplicateRecipes to if-then region blocks.
449 addReplicateRegions(Plan);
450
451 bool ShouldSimplify = true;
452 while (ShouldSimplify) {
453 ShouldSimplify = sinkScalarOperands(Plan);
454 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
455 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
456 }
457}
458
459/// Remove redundant casts of inductions.
460///
461/// Such redundant casts are casts of induction variables that can be ignored,
462/// because we already proved that the casted phi is equal to the uncasted phi
463/// in the vectorized loop. There is no need to vectorize the cast - the same
464/// value can be used for both the phi and casts in the vector loop.
465static void removeRedundantInductionCasts(VPlan &Plan) {
466 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
467 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
468 if (!IV || IV->getTruncInst())
469 continue;
470
471 // A sequence of IR Casts has potentially been recorded for IV, which
472 // *must be bypassed* when the IV is vectorized, because the vectorized IV
473 // will produce the desired casted value. This sequence forms a def-use
474 // chain and is provided in reverse order, ending with the cast that uses
475 // the IV phi. Search for the recipe of the last cast in the chain and
476 // replace it with the original IV. Note that only the final cast is
477 // expected to have users outside the cast-chain and the dead casts left
478 // over will be cleaned up later.
479 auto &Casts = IV->getInductionDescriptor().getCastInsts();
480 VPValue *FindMyCast = IV;
481 for (Instruction *IRCast : reverse(Casts)) {
482 VPSingleDefRecipe *FoundUserCast = nullptr;
483 for (auto *U : FindMyCast->users()) {
484 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
485 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
486 FoundUserCast = UserCast;
487 break;
488 }
489 }
490 FindMyCast = FoundUserCast;
491 }
492 FindMyCast->replaceAllUsesWith(IV);
493 }
494}
495
496/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
497/// recipe, if it exists.
498static void removeRedundantCanonicalIVs(VPlan &Plan) {
499 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
500 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
501 for (VPUser *U : CanonicalIV->users()) {
502 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
503 if (WidenNewIV)
504 break;
505 }
506
507 if (!WidenNewIV)
508 return;
509
510 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
511 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
512 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
513
514 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
515 continue;
516
517 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
518 // everything WidenNewIV's users need. That is, WidenOriginalIV will
519 // generate a vector phi or all users of WidenNewIV demand the first lane
520 // only.
521 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
522 vputils::onlyFirstLaneUsed(WidenNewIV)) {
523 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
524 WidenNewIV->eraseFromParent();
525 return;
526 }
527 }
528}
529
530/// Returns true if \p R is dead and can be removed.
531static bool isDeadRecipe(VPRecipeBase &R) {
532 // Do remove conditional assume instructions as their conditions may be
533 // flattened.
534 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
535 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
536 match(RepR, m_Intrinsic<Intrinsic::assume>());
537 if (IsConditionalAssume)
538 return true;
539
540 if (R.mayHaveSideEffects())
541 return false;
542
543 // Recipe is dead if no user keeps the recipe alive.
544 return all_of(R.definedValues(),
545 [](VPValue *V) { return V->getNumUsers() == 0; });
546}
547
548void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
549 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
550 vp_post_order_deep(Plan.getEntry()))) {
551 // The recipes in the block are processed in reverse order, to catch chains
552 // of dead recipes.
553 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
554 if (isDeadRecipe(R)) {
555 R.eraseFromParent();
556 continue;
557 }
558
559 // Check if R is a dead VPPhi <-> update cycle and remove it.
560 auto *PhiR = dyn_cast<VPPhi>(&R);
561 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
562 continue;
563 VPValue *Incoming = PhiR->getOperand(1);
564 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
565 Incoming->getNumUsers() != 1)
566 continue;
567 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
568 PhiR->eraseFromParent();
569 Incoming->getDefiningRecipe()->eraseFromParent();
570 }
571 }
572}
573
574static VPScalarIVStepsRecipe *
575createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
576 Instruction::BinaryOps InductionOpcode,
577 FPMathOperator *FPBinOp, Instruction *TruncI,
578 VPValue *StartV, VPValue *Step, DebugLoc DL,
579 VPBuilder &Builder) {
580 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
581 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
582 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
583 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
584
585 // Truncate base induction if needed.
586 VPTypeAnalysis TypeInfo(Plan);
587 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
588 if (TruncI) {
589 Type *TruncTy = TruncI->getType();
590 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
591 "Not truncating.");
592 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
593 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
594 ResultTy = TruncTy;
595 }
596
597 // Truncate step if needed.
598 Type *StepTy = TypeInfo.inferScalarType(Step);
599 if (ResultTy != StepTy) {
600 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
601 "Not truncating.");
602 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
603 auto *VecPreheader =
604 cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
605 VPBuilder::InsertPointGuard Guard(Builder);
606 Builder.setInsertPoint(VecPreheader);
607 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
608 }
609 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
610 &Plan.getVF(), DL);
611}
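// For illustration, a sketch of the recipes this helper builds for an
// induction with start %start and step %step (value names are made up):
//
//   vector.ph:
//     %step.trunc = trunc %step                       ; only if types differ
//   vector.body:
//     %offset.idx = derived-iv %start, %canonical-iv, %step
//     %steps      = scalar-steps %offset.idx, %step.trunc, %vf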
612
613static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
614 SetVector<VPUser *> Users(V->user_begin(), V->user_end());
615 for (unsigned I = 0; I != Users.size(); ++I) {
616 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
617 if (isa<VPHeaderPHIRecipe>(Cur))
618 continue;
619 for (VPValue *V : Cur->definedValues())
620 Users.insert_range(V->users());
621 }
622 return Users.takeVector();
623}
624
625/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
626/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
627/// VPWidenPointerInductionRecipe will generate vectors only. If some users
628/// require vectors while others require scalars, the scalar uses need to extract
629/// the scalars from the generated vectors (note that this is different from how
630/// int/fp inductions are handled). Legalize extract-from-ends using uniform
631/// VPReplicateRecipes of wide inductions to use regular VPReplicateRecipes, so
632/// the correct end value is available. Also optimize
633/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
634/// providing them scalar steps built on the canonical scalar IV and updating the
635/// original IV's users. This is an optional optimization to reduce the need for
636/// vector extracts.
637static void legalizeAndOptimizeInductions(VPlan &Plan) {
638 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
639 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
640 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
641 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
642 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
643 if (!PhiR)
644 continue;
645
646 // Try to narrow wide and replicating recipes to uniform recipes, based on
647 // VPlan analysis.
648 // TODO: Apply to all recipes in the future, to replace legacy uniformity
649 // analysis.
650 auto Users = collectUsersRecursively(PhiR);
651 for (VPUser *U : reverse(Users)) {
652 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
653 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
654 // Skip recipes that shouldn't be narrowed.
655 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
656 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
657 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
658 continue;
659
660 // Skip recipes that may have lanes other than their first used.
661 if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
662 continue;
663
664 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
665 Def->operands(), /*IsUniform*/ true);
666 Clone->insertAfter(Def);
667 Def->replaceAllUsesWith(Clone);
668 }
669
670 // Replace wide pointer inductions which have only their scalars used by
671 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
672 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
673 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
674 continue;
675
676 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
677 VPValue *StartV =
678 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
679 VPValue *StepV = PtrIV->getOperand(1);
680 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
681 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
682 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
683
684 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
685 PtrIV->getDebugLoc(), "next.gep");
686
687 PtrIV->replaceAllUsesWith(PtrAdd);
688 continue;
689 }
690
691 // Replace widened induction with scalar steps for users that only use
692 // scalars.
693 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
694 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
695 return U->usesScalars(WideIV);
696 }))
697 continue;
698
699 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
700 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
701 Plan, ID.getKind(), ID.getInductionOpcode(),
702 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
703 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
704 WideIV->getDebugLoc(), Builder);
705
706 // Update scalar users of IV to use Step instead.
707 if (!HasOnlyVectorVFs)
708 WideIV->replaceAllUsesWith(Steps);
709 else
710 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
711 return U.usesScalars(WideIV);
712 });
713 }
714}
715
716/// Check if \p VPV is an untruncated wide induction, either before or after the
717/// increment. If so return the header IV (before the increment), otherwise
718/// return null.
719static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
720 ScalarEvolution &SE) {
721 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
722 if (WideIV) {
723 // VPV itself is a wide induction, separately compute the end value for exit
724 // users if it is not a truncated IV.
725 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
726 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
727 }
728
729 // Check if VPV is an optimizable induction increment.
730 VPRecipeBase *Def = VPV->getDefiningRecipe();
731 if (!Def || Def->getNumOperands() != 2)
732 return nullptr;
733 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
734 if (!WideIV)
735 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
736 if (!WideIV)
737 return nullptr;
738
739 auto IsWideIVInc = [&]() {
740 auto &ID = WideIV->getInductionDescriptor();
741
742 // Check if VPV increments the induction by the induction step.
743 VPValue *IVStep = WideIV->getStepValue();
744 switch (ID.getInductionOpcode()) {
745 case Instruction::Add:
746 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
747 case Instruction::FAdd:
748 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
749 m_Specific(IVStep)));
750 case Instruction::FSub:
751 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
752 m_Specific(IVStep)));
753 case Instruction::Sub: {
754 // IVStep will be the negated step of the subtraction. Check if Step == -1
755 // * IVStep.
756 VPValue *Step;
757 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
758 return false;
759 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
760 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
761 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
762 !isa<SCEVCouldNotCompute>(StepSCEV) &&
763 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
764 }
765 default:
766 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
767 match(VPV, m_GetElementPtr(m_Specific(WideIV),
768 m_Specific(WideIV->getStepValue())));
769 }
770 llvm_unreachable("should have been covered by switch above");
771 };
772 return IsWideIVInc() ? WideIV : nullptr;
773}
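// For illustration, increments recognized by IsWideIVInc above (a sketch):
//
//   %iv.next = add %wide.iv, %step            ; integer induction
//   %iv.next = fadd %wide.iv, %step           ; FP induction (FAdd/FSub)
//   %iv.next = sub %wide.iv, %s               ; when SCEV(%step) == -SCEV(%s)
//   %iv.next = getelementptr %wide.iv, %step  ; pointer induction
//
// In each case the function returns the header phi %wide.iv rather than the
// increment itself.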
774
775/// Attempts to optimize the induction variable exit values for users in the
776/// early exit block.
777static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
778 VPTypeAnalysis &TypeInfo,
779 VPBlockBase *PredVPBB,
780 VPValue *Op,
781 ScalarEvolution &SE) {
782 VPValue *Incoming, *Mask;
783 if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
784 m_VPInstruction<VPInstruction::FirstActiveLane>(
785 m_VPValue(Mask)),
786 m_VPValue(Incoming))))
787 return nullptr;
788
789 auto *WideIV = getOptimizableIVOf(Incoming, SE);
790 if (!WideIV)
791 return nullptr;
792
793 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
794 if (WideIntOrFp && WideIntOrFp->getTruncInst())
795 return nullptr;
796
797 // Calculate the final index.
798 VPValue *EndValue = Plan.getCanonicalIV();
799 auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
800 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
801
802 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
803 VPValue *FirstActiveLane =
804 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
805 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
806 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
807 FirstActiveLaneType, DL);
808 EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
809
810 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
811 // changed it means the exit is using the incremented value, so we need to
812 // add the step.
813 if (Incoming != WideIV) {
814 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
815 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
816 }
817
818 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
819 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
820 VPValue *Start = WideIV->getStartValue();
821 VPValue *Step = WideIV->getStepValue();
822 EndValue = B.createDerivedIV(
823 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
824 Start, EndValue, Step);
825 }
826
827 return EndValue;
828}
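// For illustration, the end value computed above for an early exit (a sketch;
// names are made up):
//
//   %lane = first-active-lane %mask
//   %end  = %canonical.iv + %lane               ; iteration at which we exited
//   %end  = %end + 1                            ; only if the user read iv.next
//   %end  = derived-iv %start, %end, %step      ; only for non-canonical IVs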
829
830/// Attempts to optimize the induction variable exit values for users in the
831/// exit block coming from the latch in the original scalar loop.
832static VPValue *optimizeLatchExitInductionUser(
833 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
834 DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
835 VPValue *Incoming;
836 if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
837 return nullptr;
838
839 auto *WideIV = getOptimizableIVOf(Incoming, SE);
840 if (!WideIV)
841 return nullptr;
842
843 VPValue *EndValue = EndValues.lookup(WideIV);
844 assert(EndValue && "end value must have been pre-computed");
845
846 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
847 // changed it means the exit is using the incremented value, so we don't
848 // need to subtract the step.
849 if (Incoming != WideIV)
850 return EndValue;
851
852 // Otherwise, subtract the step from the EndValue.
853 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
854 VPValue *Step = WideIV->getStepValue();
855 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
856 if (ScalarTy->isIntegerTy())
857 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
858 if (ScalarTy->isPointerTy()) {
859 Type *StepTy = TypeInfo.inferScalarType(Step);
860 auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
861 return B.createPtrAdd(EndValue,
862 B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
863 "ind.escape");
864 }
865 if (ScalarTy->isFloatingPointTy()) {
866 const auto &ID = WideIV->getInductionDescriptor();
867 return B.createNaryOp(
868 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
869 ? Instruction::FSub
870 : Instruction::FAdd,
871 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
872 }
873 llvm_unreachable("all possible induction types must be handled");
874 return nullptr;
875}
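// For illustration, the escape value produced above when the exit reads the
// pre-increment IV (a sketch): given the pre-computed end value %end and step
// %step,
//
//   integer IV:  %ind.escape = sub %end, %step
//   pointer IV:  %ind.escape = ptradd %end, (0 - %step)
//   FP IV:       %ind.escape = fsub/fadd %end, %step   ; opposite of the IV op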
876
877void VPlanTransforms::optimizeInductionExitUsers(
878 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
879 ScalarEvolution &SE) {
880 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
881 VPTypeAnalysis TypeInfo(Plan);
882 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
883 for (VPRecipeBase &R : ExitVPBB->phis()) {
884 auto *ExitIRI = cast<VPIRPhi>(&R);
885
886 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
887 VPValue *Escape = nullptr;
888 if (PredVPBB == MiddleVPBB)
889 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
890 ExitIRI->getOperand(Idx),
891 EndValues, SE);
892 else
893 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
894 ExitIRI->getOperand(Idx), SE);
895 if (Escape)
896 ExitIRI->setOperand(Idx, Escape);
897 }
898 }
899 }
900}
901
902/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
903/// them with already existing recipes expanding the same SCEV expression.
904static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
905 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
906
907 for (VPRecipeBase &R :
908 make_early_inc_range(*Plan.getEntry())) {
909 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
910 if (!ExpR)
911 continue;
912
913 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
914 if (Inserted)
915 continue;
916 ExpR->replaceAllUsesWith(V->second);
917 ExpR->eraseFromParent();
918 }
919}
920
921static void recursivelyDeleteDeadRecipes(VPValue *V) {
922 SmallVector<VPValue *> WorkList;
923 SmallPtrSet<VPValue *, 8> Seen;
924 WorkList.push_back(V);
925
926 while (!WorkList.empty()) {
927 VPValue *Cur = WorkList.pop_back_val();
928 if (!Seen.insert(Cur).second)
929 continue;
930 VPRecipeBase *R = Cur->getDefiningRecipe();
931 if (!R)
932 continue;
933 if (!isDeadRecipe(*R))
934 continue;
935 WorkList.append(R->op_begin(), R->op_end());
936 R->eraseFromParent();
937 }
938}
939
940/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
941/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
942/// foldable live-ins.
943static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
944 ArrayRef<VPValue *> Operands,
945 const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
946 SmallVector<Value *, 4> Ops;
947 for (VPValue *Op : Operands) {
948 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
949 return nullptr;
950 Ops.push_back(Op->getLiveInIRValue());
951 }
952
953 InstSimplifyFolder Folder(DL);
954 if (Instruction::isBinaryOp(Opcode))
955 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
956 Ops[1]);
957 if (Instruction::isCast(Opcode))
958 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
959 TypeInfo.inferScalarType(R.getVPSingleValue()));
960 switch (Opcode) {
962 return Folder.FoldSelect(Ops[0], Ops[1],
965 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
967 case Instruction::Select:
968 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
969 case Instruction::ICmp:
970 case Instruction::FCmp:
971 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
972 Ops[1]);
973 case Instruction::GetElementPtr: {
974 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
975 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
976 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
977 RFlags.getGEPNoWrapFlags());
978 }
981 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
982 Ops[1],
983 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
984 // An extract of a live-in is an extract of a broadcast, so return the
985 // broadcasted element.
986 case Instruction::ExtractElement:
987 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
988 return Ops[0];
989 }
990 return nullptr;
991}
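// For illustration, a hypothetical fold this enables: if both operands of a
// widened add are live-in constants,
//
//   %c = add (live-in i64 2), (live-in i64 3)
//
// the InstSimplifyFolder returns the IR constant 5, and the recipe's users are
// rewired to a new live-in VPValue for that constant (see simplifyRecipe).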
992
993/// Try to simplify recipe \p R.
994static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
995 VPlan *Plan = R.getParent()->getPlan();
996
997 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
998 if (!Def)
999 return;
1000
1001 // Simplification of live-in IR values for SingleDef recipes using
1002 // InstSimplifyFolder.
1003 if (TypeSwitch<VPRecipeBase *, bool>(&R)
1004 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1005 VPReplicateRecipe>([&](auto *I) {
1006 const DataLayout &DL =
1007 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1008 Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
1009 TypeInfo);
1010 if (V)
1011 I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
1012 return V;
1013 })
1014 .Default([](auto *) { return false; }))
1015 return;
1016
1017 // Fold PredPHI LiveIn -> LiveIn.
1018 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1019 VPValue *Op = PredPHI->getOperand(0);
1020 if (Op->isLiveIn())
1021 PredPHI->replaceAllUsesWith(Op);
1022 }
1023
1024 VPValue *A;
1025 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1026 Type *TruncTy = TypeInfo.inferScalarType(Def);
1027 Type *ATy = TypeInfo.inferScalarType(A);
1028 if (TruncTy == ATy) {
1029 Def->replaceAllUsesWith(A);
1030 } else {
1031 // Don't replace a scalarizing recipe with a widened cast.
1032 if (isa<VPReplicateRecipe>(Def))
1033 return;
1034 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1035
1036 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1037 ? Instruction::SExt
1038 : Instruction::ZExt;
1039 auto *VPC =
1040 new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
1041 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1042 // UnderlyingExt has distinct return type, used to retain legacy cost.
1043 VPC->setUnderlyingValue(UnderlyingExt);
1044 }
1045 VPC->insertBefore(&R);
1046 Def->replaceAllUsesWith(VPC);
1047 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1048 auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
1049 VPC->insertBefore(&R);
1050 Def->replaceAllUsesWith(VPC);
1051 }
1052 }
1053#ifndef NDEBUG
1054 // Verify that the cached type info for both A and its users is still
1055 // accurate by comparing it to freshly computed types.
1056 VPTypeAnalysis TypeInfo2(*Plan);
1057 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1058 for (VPUser *U : A->users()) {
1059 auto *R = cast<VPRecipeBase>(U);
1060 for (VPValue *VPV : R->definedValues())
1061 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1062 }
1063#endif
1064 }
1065
1066 // Simplify (X && Y) || (X && !Y) -> X.
1067 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1068 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1069 // recipes to be visited during simplification.
1070 VPValue *X, *Y;
1071 if (match(Def,
1072 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1073 m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1074 Def->replaceAllUsesWith(X);
1075 Def->eraseFromParent();
1076 return;
1077 }
1078
1079 // OR x, 1 -> 1.
1080 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1081 Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
1082 : Def->getOperand(0));
1083 Def->eraseFromParent();
1084 return;
1085 }
1086
1087 // AND x, 0 -> 0
1088 if (match(&R, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1089 Def->replaceAllUsesWith(R.getOperand(0) == X ? R.getOperand(1)
1090 : R.getOperand(0));
1091 return;
1092 }
1093
1094 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1095 return Def->replaceAllUsesWith(X);
1096
1097 // select !c, x, y -> select c, y, x
1098 VPValue *C;
1099 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1100 Def->setOperand(0, C);
1101 Def->setOperand(1, Y);
1102 Def->setOperand(2, X);
1103 return;
1104 }
1105
1106 if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
1107 return Def->replaceAllUsesWith(A);
1108
1109 if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0))))
1110 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1111 : R.getOperand(0));
1112
1113 if (match(Def, m_Not(m_VPValue(A)))) {
1114 if (match(A, m_Not(m_VPValue(A))))
1115 return Def->replaceAllUsesWith(A);
1116
1117 // Try to fold Not into compares by adjusting the predicate in-place.
1118 CmpPredicate Pred;
1119 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1120 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1121 if (all_of(Cmp->users(), [&Cmp](VPUser *U) {
1122 return match(U, m_CombineOr(m_Not(m_Specific(Cmp)),
1123 m_Select(m_Specific(Cmp), m_VPValue(),
1124 m_VPValue())));
1125 })) {
1126 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1127 for (VPUser *U : to_vector(Cmp->users())) {
1128 auto *R = cast<VPSingleDefRecipe>(U);
1129 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1130 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1131 R->setOperand(1, Y);
1132 R->setOperand(2, X);
1133 } else {
1134 // not (cmp pred) -> cmp inv_pred
1135 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1136 R->replaceAllUsesWith(Cmp);
1137 }
1138 }
1139 // If Cmp doesn't have a debug location, use the one from the negation,
1140 // to preserve the location.
1141 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1142 Cmp->setDebugLoc(R.getDebugLoc());
1143 }
1144 }
1145 }
1146
1147 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1148 if ((match(Def,
1150 match(Def,
1152 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1153 TypeInfo.inferScalarType(Def))
1154 return Def->replaceAllUsesWith(Def->getOperand(1));
1155
1156 if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(
1157 m_VPValue(X), m_SpecificInt(1)))) {
1158 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1159 if (TypeInfo.inferScalarType(X) != WideStepTy)
1160 X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
1161 Def->replaceAllUsesWith(X);
1162 return;
1163 }
1164
1165 // For i1 vp.merges produced by AnyOf reductions:
1166 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1167 if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
1168 m_VPValue(X), m_VPValue())) &&
1169 match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
1170 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1171 Def->setOperand(1, Def->getOperand(0));
1172 Def->setOperand(0, Y);
1173 return;
1174 }
1175
1176 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1177 if (Phi->getOperand(0) == Phi->getOperand(1))
1178 Def->replaceAllUsesWith(Phi->getOperand(0));
1179 return;
1180 }
1181
1182 // Look through ExtractLastElement (BuildVector ....).
1183 if (match(&R, m_ExtractLastElement(m_BuildVector()))) {
1184 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1185 Def->replaceAllUsesWith(
1186 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1187 return;
1188 }
1189
1190 // Look through ExtractPenultimateElement (BuildVector ....).
1191 if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1192 m_BuildVector()))) {
1193 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1194 Def->replaceAllUsesWith(
1195 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1196 return;
1197 }
1198
1199 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1200 if (Phi->getNumOperands() == 1)
1201 Phi->replaceAllUsesWith(Phi->getOperand(0));
1202 return;
1203 }
1204
1205 // Some simplifications can only be applied after unrolling. Perform them
1206 // below.
1207 if (!Plan->isUnrolled())
1208 return;
1209
1210 // VPVectorPointer for part 0 can be replaced by its start pointer.
1211 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1212 if (VecPtr->isFirstPart()) {
1213 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1214 return;
1215 }
1216 }
1217
1218 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1219 // the first lane is demanded.
1220 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1221 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1222 Steps->replaceAllUsesWith(Steps->getOperand(0));
1223 return;
1224 }
1225 }
1226 // Simplify redundant ReductionStartVector recipes after unrolling.
1227 VPValue *StartV;
1228 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1229 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1230 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1231 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1232 return PhiR && PhiR->isInLoop();
1233 });
1234 return;
1235 }
1236
1238 Def->replaceAllUsesWith(A);
1239 return;
1240 }
1241
1242 VPInstruction *OpVPI;
1243 if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) &&
1244 OpVPI->isVectorToScalar()) {
1245 Def->replaceAllUsesWith(OpVPI);
1246 return;
1247 }
1248}
1249
1250void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1251 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1252 Plan.getEntry());
1253 VPTypeAnalysis TypeInfo(Plan);
1254 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1255 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1256 simplifyRecipe(R, TypeInfo);
1257 }
1258 }
1259}
1260
1261void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1262 if (Plan.hasScalarVFOnly())
1263 return;
1264
1265 // Try to narrow wide and replicating recipes to single scalar recipes,
1266 // based on VPlan analysis. Only process blocks in the loop region for now,
1267 // without traversing into nested regions, as recipes in replicate regions
1268 // cannot be converted yet.
1269 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1270 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1271 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1272 if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
1273 continue;
1274 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1275 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1276 continue;
1277
1278 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1279 // Skip recipes that aren't single scalars or don't have only their
1280 // scalar results used. In the latter case, we would introduce extra
1281 // broadcasts.
1282 if (!vputils::isSingleScalar(RepOrWidenR) ||
1283 !vputils::onlyScalarValuesUsed(RepOrWidenR))
1284 continue;
1285
1286 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1287 RepOrWidenR->operands(),
1288 true /*IsSingleScalar*/);
1289 Clone->insertBefore(RepOrWidenR);
1290 RepOrWidenR->replaceAllUsesWith(Clone);
1291 }
1292 }
1293}
1294
1295/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1296/// to make sure the masks are simplified.
1297static void simplifyBlends(VPlan &Plan) {
1298 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1300 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1301 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1302 if (!Blend)
1303 continue;
1304
1305 // Try to remove redundant blend recipes.
1306 SmallPtrSet<VPValue *, 4> UniqueValues;
1307 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1308 UniqueValues.insert(Blend->getIncomingValue(0));
1309 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1310 if (!match(Blend->getMask(I), m_False()))
1311 UniqueValues.insert(Blend->getIncomingValue(I));
1312
1313 if (UniqueValues.size() == 1) {
1314 Blend->replaceAllUsesWith(*UniqueValues.begin());
1315 Blend->eraseFromParent();
1316 continue;
1317 }
1318
1319 if (Blend->isNormalized())
1320 continue;
1321
1322 // Normalize the blend so its first incoming value is used as the initial
1323 // value with the others blended into it.
1324
1325 unsigned StartIndex = 0;
1326 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1327 // If a value's mask is used only by the blend, then it can be dead-coded.
1328 // TODO: Find the most expensive mask that can be dead-coded, or a mask
1329 // that's used by multiple blends where it can be removed from them all.
1330 VPValue *Mask = Blend->getMask(I);
1331 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1332 StartIndex = I;
1333 break;
1334 }
1335 }
1336
1337 SmallVector<VPValue *, 4> OperandsWithMask;
1338 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1339
1340 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1341 if (I == StartIndex)
1342 continue;
1343 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1344 OperandsWithMask.push_back(Blend->getMask(I));
1345 }
1346
1347 auto *NewBlend =
1348 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1349 OperandsWithMask, Blend->getDebugLoc());
1350 NewBlend->insertBefore(&R);
1351
1352 VPValue *DeadMask = Blend->getMask(StartIndex);
1353 Blend->replaceAllUsesWith(NewBlend);
1354 Blend->eraseFromParent();
1355 recursivelyDeleteDeadRecipes(DeadMask);
1356
1357 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1358 VPValue *NewMask;
1359 if (NewBlend->getNumOperands() == 3 &&
1360 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1361 VPValue *Inc0 = NewBlend->getOperand(0);
1362 VPValue *Inc1 = NewBlend->getOperand(1);
1363 VPValue *OldMask = NewBlend->getOperand(2);
1364 NewBlend->setOperand(0, Inc1);
1365 NewBlend->setOperand(1, Inc0);
1366 NewBlend->setOperand(2, NewMask);
1367 if (OldMask->getNumUsers() == 0)
1368 cast<VPInstruction>(OldMask)->eraseFromParent();
1369 }
1370 }
1371 }
1372}
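// For illustration, a sketch of blend normalization (operands made up):
//
//   blend %a/%m0, %b/%m1, %c/%m2
//     -> blend %a, %b, %m1, %c, %m2     ; %a becomes the unmasked fallback
//
// and of the final swap: blend %a, %b, not(%m) -> blend %b, %a, %m.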
1373
1374/// Optimize the width of vector induction variables in \p Plan based on a known
1375/// constant Trip Count, \p BestVF and \p BestUF.
1376static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1377 ElementCount BestVF,
1378 unsigned BestUF) {
1379 // Only proceed if we have not completely removed the vector region.
1380 if (!Plan.getVectorLoopRegion())
1381 return false;
1382
1383 if (!Plan.getTripCount()->isLiveIn())
1384 return false;
1385 auto *TC = dyn_cast_if_present<ConstantInt>(
1386 Plan.getTripCount()->getLiveInIRValue());
1387 if (!TC || !BestVF.isFixed())
1388 return false;
1389
1390 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1391 // and UF. Returns at least 8.
1392 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1393 APInt AlignedTC =
1394 Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
1395 APInt::Rounding::UP);
1396 APInt MaxVal = AlignedTC - 1;
1397 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1398 };
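 // For example (a worked case): with TC = 1000 and VF * UF = 8, the aligned
 // trip count is 1000 (already a multiple of 8), MaxVal = 999 needs 10 active
 // bits, and PowerOf2Ceil(10) = 16, so the IV is narrowed to i16. A small trip
 // count such as TC = 20 with VF * UF = 16 still yields the minimum of 8 bits.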
1399 unsigned NewBitWidth =
1400 ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1401
1402 LLVMContext &Ctx = Plan.getContext();
1403 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1404
1405 bool MadeChange = false;
1406
1407 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1408 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1409 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1410
1411 // Currently only handle canonical IVs as it is trivial to replace the start
1412 // and stop values, and we currently only perform the optimization when the
1413 // IV has a single use.
1414 if (!WideIV || !WideIV->isCanonical() ||
1415 WideIV->hasMoreThanOneUniqueUser() ||
1416 NewIVTy == WideIV->getScalarType())
1417 continue;
1418
1419 // Currently only handle cases where the single user is a header-mask
1420 // comparison with the backedge-taken-count.
1421 if (!match(*WideIV->user_begin(),
1422 m_ICmp(m_Specific(WideIV),
1425 continue;
1426
1427 // Update IV operands and comparison bound to use new narrower type.
1428 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1429 WideIV->setStartValue(NewStart);
1430 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1431 WideIV->setStepValue(NewStep);
1432
1433 auto *NewBTC = new VPWidenCastRecipe(
1434 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1435 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1436 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1437 Cmp->setOperand(1, NewBTC);
1438
1439 MadeChange = true;
1440 }
1441
1442 return MadeChange;
1443}
1444
1445/// Return true if \p Cond is known to be true for given \p BestVF and \p
1446/// BestUF.
1447static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1448 ElementCount BestVF, unsigned BestUF,
1449 ScalarEvolution &SE) {
1450 if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))
1451 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1452 &SE](VPValue *C) {
1453 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1454 });
1455
1456 auto *CanIV = Plan.getCanonicalIV();
1458 m_Specific(CanIV->getBackedgeValue()),
1459 m_Specific(&Plan.getVectorTripCount()))))
1460 return false;
1461
1462 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1463 // count is not conveniently available as SCEV so far, so we compare directly
1464 // against the original trip count. This is stricter than necessary, as we
1465 // will only return true if the trip count == vector trip count.
1466 const SCEV *VectorTripCount =
1467 vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
1468 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1469 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1470 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1471 "Trip count SCEV must be computable");
1472 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1473 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1474 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1475}
1476
1477/// Try to simplify the branch condition of \p Plan. This may restrict the
1478/// resulting plan to \p BestVF and \p BestUF.
1479static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1480 unsigned BestUF,
1481 PredicatedScalarEvolution &PSE) {
1482 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1483 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1484 auto *Term = &ExitingVPBB->back();
1485 VPValue *Cond;
1486 ScalarEvolution &SE = *PSE.getSE();
1487 if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1488 match(Term, m_BranchOnCond(
1489 m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue(), m_VPValue()))))) {
1490 // Try to simplify the branch condition if TC <= VF * UF when the latch
1491 // terminator is BranchOnCount or BranchOnCond where the input is
1492 // Not(ActiveLaneMask).
1493 const SCEV *TripCount =
1494 vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1495 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1496 "Trip count SCEV must be computable");
1497 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1498 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1499 if (TripCount->isZero() ||
1500 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1501 return false;
1502 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1503 // For BranchOnCond, check if we can prove the condition to be true using VF
1504 // and UF.
1505 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1506 return false;
1507 } else {
1508 return false;
1509 }
1510
1511 // The vector loop region only executes once. If possible, completely remove
1512 // the region, otherwise replace the terminator controlling the latch with
1513 // (BranchOnCond true).
1514 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1515 // support for other non-canonical widen induction recipes (e.g.,
1516 // VPWidenPointerInductionRecipe).
1517 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1518 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1519 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1520 return R->isCanonical();
1521 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1522 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1523 })) {
1524 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1525 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1526 VPBuilder Builder(Plan.getVectorPreheader());
1527 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1528 R->getScalarType());
1529 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1530 HeaderR.eraseFromParent();
1531 continue;
1532 }
1533 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1534 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1535 HeaderR.eraseFromParent();
1536 }
1537
1538 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1539 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1540 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1541 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1542
1543 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1544 B->setParent(nullptr);
1545
1546 VPBlockUtils::connectBlocks(Preheader, Header);
1547 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1549 } else {
1550 // The vector region contains header phis for which we cannot remove the
1551 // loop region yet.
1552 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1553 Term->getDebugLoc());
1554 ExitingVPBB->appendRecipe(BOC);
1555 }
1556
1557 Term->eraseFromParent();
1558
1559 return true;
1560}
1561
1562void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1563 unsigned BestUF,
1564 PredicatedScalarEvolution &PSE) {
1565 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1566 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1567
1568 bool MadeChange =
1569 simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1570 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1571
1572 if (MadeChange) {
1573 Plan.setVF(BestVF);
1574 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1575 }
1576 // TODO: Further simplifications are possible
1577 // 1. Replace inductions with constants.
1578 // 2. Replace vector loop region with VPBasicBlock.
1579}
1580
1581/// Sink users of \p FOR after the recipe defining the previous value \p
1582/// Previous of the recurrence. \returns true if all users of \p FOR could be
1583/// re-arranged as needed or false if it is not possible.
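/// For illustration (simplified VPlan notation, names are placeholders), with
/// a recurrence phi %for whose previous value is defined by %prev:
///   %for = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %use = add %for, 1       ; user of %for
///   %prev = load %ptr        ; defines the previous value
/// sinking moves %use after %prev, so that the recurrence splice inserted
/// after %prev later on dominates all former users of %for:
///   %for = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %prev = load %ptr
///   %use = add %for, 1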
1584static bool
1586 VPRecipeBase *Previous,
1587 VPDominatorTree &VPDT) {
1588 // Collect recipes that need sinking.
1591 Seen.insert(Previous);
1592 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1593 // The previous value must not depend on the users of the recurrence phi;
1594 // if it did, FOR would not be a fixed-order recurrence.
1595 if (SinkCandidate == Previous)
1596 return false;
1597
1598 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1599 !Seen.insert(SinkCandidate).second ||
1600 VPDT.properlyDominates(Previous, SinkCandidate))
1601 return true;
1602
1603 if (SinkCandidate->mayHaveSideEffects())
1604 return false;
1605
1606 WorkList.push_back(SinkCandidate);
1607 return true;
1608 };
1609
1610 // Recursively sink users of FOR after Previous.
1611 WorkList.push_back(FOR);
1612 for (unsigned I = 0; I != WorkList.size(); ++I) {
1613 VPRecipeBase *Current = WorkList[I];
1614 assert(Current->getNumDefinedValues() == 1 &&
1615 "only recipes with a single defined value expected");
1616
1617 for (VPUser *User : Current->getVPSingleValue()->users()) {
1618 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1619 return false;
1620 }
1621 }
1622
1623 // Keep recipes to sink ordered by dominance so earlier instructions are
1624 // processed first.
1625 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1626 return VPDT.properlyDominates(A, B);
1627 });
1628
1629 for (VPRecipeBase *SinkCandidate : WorkList) {
1630 if (SinkCandidate == FOR)
1631 continue;
1632
1633 SinkCandidate->moveAfter(Previous);
1634 Previous = SinkCandidate;
1635 }
1636 return true;
1637}
1638
1639/// Try to hoist \p Previous and its operands before all users of \p FOR.
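/// For illustration (simplified notation, names are placeholders): if %prev
/// neither depends on %for nor has side effects, e.g.
///   %use = add %for, 1       ; earliest user of %for
///   %prev = mul %x, 2
/// then %prev (and any of its in-loop operands) is moved before %use instead
/// of sinking %use after %prev.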
1641 VPRecipeBase *Previous,
1642 VPDominatorTree &VPDT) {
1643 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1644 return false;
1645
1646 // Collect recipes that need hoisting.
1647 SmallVector<VPRecipeBase *> HoistCandidates;
1649 VPRecipeBase *HoistPoint = nullptr;
1650 // Find the closest hoist point by looking at all users of FOR and selecting
1651 // the recipe dominating all other users.
1652 for (VPUser *U : FOR->users()) {
1653 auto *R = cast<VPRecipeBase>(U);
1654 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1655 HoistPoint = R;
1656 }
1657 assert(all_of(FOR->users(),
1658 [&VPDT, HoistPoint](VPUser *U) {
1659 auto *R = cast<VPRecipeBase>(U);
1660 return HoistPoint == R ||
1661 VPDT.properlyDominates(HoistPoint, R);
1662 }) &&
1663 "HoistPoint must dominate all users of FOR");
1664
1665 auto NeedsHoisting = [HoistPoint, &VPDT,
1666 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1667 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1668 if (!HoistCandidate)
1669 return nullptr;
1670 VPRegionBlock *EnclosingLoopRegion =
1671 HoistCandidate->getParent()->getEnclosingLoopRegion();
1672 assert((!HoistCandidate->getParent()->getParent() ||
1673 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1674 "CFG in VPlan should still be flat, without replicate regions");
1675 // Hoist candidate was already visited, no need to hoist.
1676 if (!Visited.insert(HoistCandidate).second)
1677 return nullptr;
1678
1679 // The candidate is outside the loop region or is a header phi; it already
1680 // dominates FOR's users without hoisting.
1681 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1682 return nullptr;
1683
1684 // If we reached a recipe that dominates HoistPoint, we don't need to
1685 // hoist the recipe.
1686 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1687 return nullptr;
1688 return HoistCandidate;
1689 };
1690 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1691 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1692 // associated dependencies.
1693 return !HoistCandidate->mayHaveSideEffects();
1694 };
1695
1696 if (!NeedsHoisting(Previous->getVPSingleValue()))
1697 return true;
1698
1699 // Recursively try to hoist Previous and its operands before all users of FOR.
1700 HoistCandidates.push_back(Previous);
1701
1702 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1703 VPRecipeBase *Current = HoistCandidates[I];
1704 assert(Current->getNumDefinedValues() == 1 &&
1705 "only recipes with a single defined value expected");
1706 if (!CanHoist(Current))
1707 return false;
1708
1709 for (VPValue *Op : Current->operands()) {
1710 // If we reach FOR, it means the original Previous depends on some other
1711 // recurrence that in turn depends on FOR. If that is the case, we would
1712 // also need to hoist recipes involving the other FOR, which may break
1713 // dependencies.
1714 if (Op == FOR)
1715 return false;
1716
1717 if (auto *R = NeedsHoisting(Op))
1718 HoistCandidates.push_back(R);
1719 }
1720 }
1721
1722 // Order recipes to hoist by dominance so earlier instructions are processed
1723 // first.
1724 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1725 return VPDT.properlyDominates(A, B);
1726 });
1727
1728 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1729 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1730 HoistPoint->getIterator());
1731 }
1732
1733 return true;
1734}
1735
1737 VPBuilder &LoopBuilder) {
1738 VPDominatorTree VPDT;
1739 VPDT.recalculate(Plan);
1740
1742 for (VPRecipeBase &R :
1744 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1745 RecurrencePhis.push_back(FOR);
1746
1747 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1749 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1750 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1751 // to terminate.
1752 while (auto *PrevPhi =
1753 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1754 assert(PrevPhi->getParent() == FOR->getParent());
1755 assert(SeenPhis.insert(PrevPhi).second);
1756 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1757 }
1758
1759 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1760 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1761 return false;
1762
1763 // Introduce a recipe to combine the incoming and previous values of a
1764 // fixed-order recurrence.
1765 VPBasicBlock *InsertBlock = Previous->getParent();
1766 if (isa<VPHeaderPHIRecipe>(Previous))
1767 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1768 else
1769 LoopBuilder.setInsertPoint(InsertBlock,
1770 std::next(Previous->getIterator()));
1771
1772 auto *RecurSplice =
1774 {FOR, FOR->getBackedgeValue()});
1775
1776 FOR->replaceAllUsesWith(RecurSplice);
1777 // Set the first operand of RecurSplice to FOR again, after replacing
1778 // all users.
1779 RecurSplice->setOperand(0, FOR);
1780 }
1781 return true;
1782}
1783
1785 for (VPRecipeBase &R :
1787 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
1788 if (!PhiR)
1789 continue;
1790 RecurKind RK = PhiR->getRecurrenceKind();
1791 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
1793 continue;
1794
1795 for (VPUser *U : collectUsersRecursively(PhiR))
1796 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
1797 RecWithFlags->dropPoisonGeneratingFlags();
1798 }
1799 }
1800}
1801
1802/// Move loop-invariant recipes out of the vector loop region in \p Plan.
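/// For illustration (simplified VPlan notation, names are placeholders): a
/// recipe such as
///   WIDEN ir<%inv> = mul ir<%a>, ir<%b>
/// whose operands are all defined outside the loop region, and which neither
/// reads nor writes memory, is moved from vector.body into vector.ph.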
1803static void licm(VPlan &Plan) {
1804 VPBasicBlock *Preheader = Plan.getVectorPreheader();
1805
1806 // Return true if we do not know how to (mechanically) hoist a given recipe
1807 // out of a loop region. Does not address legality concerns such as aliasing
1808 // or speculation safety.
1809 auto CannotHoistRecipe = [](VPRecipeBase &R) {
1810 // Allocas cannot be hoisted.
1811 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1812 return RepR && RepR->getOpcode() == Instruction::Alloca;
1813 };
1814
1815 // Hoist any loop-invariant recipes from the vector loop region to the
1816 // preheader. Perform a shallow traversal of the vector loop region, to
1817 // exclude recipes in replicate regions.
1818 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1819 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1820 vp_depth_first_shallow(LoopRegion->getEntry()))) {
1821 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1822 if (CannotHoistRecipe(R))
1823 continue;
1824 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
1825 // their memory location is not modified in the vector loop.
1826 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1827 any_of(R.operands(), [](VPValue *Op) {
1828 return !Op->isDefinedOutsideLoopRegions();
1829 }))
1830 continue;
1831 R.moveBefore(*Preheader, Preheader->end());
1832 }
1833 }
1834}
1835
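// Illustrative example for the minimal-bitwidth shrinking below (widths and
// names are placeholders): if MinBWs records that an i32 add only needs 8
// bits,
//   WIDEN ir<%a> = add ir<%x>, ir<%y>          ; i32
// is rewritten to operate on truncated i8 operands, with the result
// zero-extended back to i32 for the existing users:
//   WIDEN vp<%x.t> = trunc ir<%x> to i8
//   WIDEN vp<%y.t> = trunc ir<%y> to i8
//   WIDEN ir<%a> = add vp<%x.t>, vp<%y.t>      ; i8
//   WIDEN vp<%a.ext> = zext ir<%a> to i32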
1837 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
1838 // Keep track of created truncates, so they can be re-used. Note that we
1839 // cannot use RAUW after creating a new truncate, as this could make other
1840 // uses have different types for their operands, making them invalidly
1841 // typed.
1843 VPTypeAnalysis TypeInfo(Plan);
1844 VPBasicBlock *PH = Plan.getVectorPreheader();
1845 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1847 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1850 &R))
1851 continue;
1852
1853 VPValue *ResultVPV = R.getVPSingleValue();
1854 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
1855 unsigned NewResSizeInBits = MinBWs.lookup(UI);
1856 if (!NewResSizeInBits)
1857 continue;
1858
1859 // If the value wasn't vectorized, we must maintain the original scalar
1860 // type. Skip those here. Also skip casts, which do not need to be handled
1861 // explicitly here, as redundant casts will be removed during recipe
1862 // simplification.
1863 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))
1864 continue;
1865
1866 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
1867 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1868 assert(OldResTy->isIntegerTy() && "only integer types supported");
1869 (void)OldResSizeInBits;
1870
1871 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
1872
1873 // Any wrapping introduced by shrinking this operation shouldn't be
1874 // considered undefined behavior. So, we can't unconditionally copy
1875 // arithmetic wrapping flags to VPW.
1876 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
1877 VPW->dropPoisonGeneratingFlags();
1878
1879 if (OldResSizeInBits != NewResSizeInBits &&
1880 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
1881 // Extend result to original width.
1882 auto *Ext =
1883 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1884 Ext->insertAfter(&R);
1885 ResultVPV->replaceAllUsesWith(Ext);
1886 Ext->setOperand(0, ResultVPV);
1887 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1888 } else {
1889 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
1890 "Only ICmps should not need extending the result.");
1891 }
1892
1893 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1894 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R))
1895 continue;
1896
1897 // Shrink operands by introducing truncates as needed.
1898 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
1899 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1900 auto *Op = R.getOperand(Idx);
1901 unsigned OpSizeInBits =
1903 if (OpSizeInBits == NewResSizeInBits)
1904 continue;
1905 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1906 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
1907 VPWidenCastRecipe *NewOp =
1908 IterIsEmpty
1909 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1910 : ProcessedIter->second;
1911 R.setOperand(Idx, NewOp);
1912 if (!IterIsEmpty)
1913 continue;
1914 ProcessedIter->second = NewOp;
1915 if (!Op->isLiveIn()) {
1916 NewOp->insertBefore(&R);
1917 } else {
1918 PH->appendRecipe(NewOp);
1919 }
1920 }
1921
1922 }
1923 }
1924}
1925
1927 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1929 VPValue *Cond;
1930 if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
1931 !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
1932 continue;
1933
1934 unsigned RemovedIdx;
1935 if (match(Cond, m_True()))
1936 RemovedIdx = 1;
1937 else if (match(Cond, m_False()))
1938 RemovedIdx = 0;
1939 else
1940 continue;
1941
1942 VPBasicBlock *RemovedSucc =
1943 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
1944 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
1945 "There must be a single edge between VPBB and its successor");
1946 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
1947 // these recipes.
1948 for (VPRecipeBase &R : RemovedSucc->phis())
1949 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
1950
1951 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
1952 // automatically on VPlan destruction if it becomes unreachable.
1953 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
1954 VPBB->back().eraseFromParent();
1955 }
1956}
1957
1961
1962 runPass(simplifyRecipes, Plan);
1963 runPass(simplifyBlends, Plan);
1968 runPass(simplifyRecipes, Plan);
1971
1974 runPass(licm, Plan);
1975}
1976
1977// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1978// the loop terminator with a branch-on-cond recipe with the negated
1979// active-lane-mask as operand. Note that this turns the loop into an
1980 // uncountable one. Only the existing terminator is replaced; all other existing
1981// recipes/users remain unchanged, except for poison-generating flags being
1982// dropped from the canonical IV increment. Return the created
1983// VPActiveLaneMaskPHIRecipe.
1984//
1985// The function uses the following definitions:
1986//
1987// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
1988// calculate-trip-count-minus-VF (original TC) : original TC
1989// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
1990// CanonicalIVPhi : CanonicalIVIncrement
1991// %StartV is the canonical induction start value.
1992//
1993// The function adds the following recipes:
1994//
1995// vector.ph:
1996// %TripCount = calculate-trip-count-minus-VF (original TC)
1997// [if DataWithControlFlowWithoutRuntimeCheck]
1998// %EntryInc = canonical-iv-increment-for-part %StartV
1999// %EntryALM = active-lane-mask %EntryInc, %TripCount
2000//
2001// vector.body:
2002// ...
2003// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2004// ...
2005// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2006 // %ALM = active-lane-mask %InLoopInc, %TripCount
2007// %Negated = Not %ALM
2008// branch-on-cond %Negated
2009//
2012 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2013 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2014 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2015 VPValue *StartV = CanonicalIVPHI->getStartValue();
2016
2017 auto *CanonicalIVIncrement =
2018 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2019 // TODO: Check if dropping the flags is needed if
2020 // !DataAndControlFlowWithoutRuntimeCheck.
2021 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2022 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2023 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2024 // we have to take unrolling into account. Each part needs to start at
2025 // Part * VF
2026 auto *VecPreheader = Plan.getVectorPreheader();
2027 VPBuilder Builder(VecPreheader);
2028
2029 // Create the ActiveLaneMask instruction using the correct start values.
2030 VPValue *TC = Plan.getTripCount();
2031
2032 VPValue *TripCount, *IncrementValue;
2034 // When the loop is guarded by a runtime overflow check for the loop
2035 // induction variable increment by VF, we can increment the value before
2036 // the get.active.lane.mask and use the unmodified trip count.
2037 IncrementValue = CanonicalIVIncrement;
2038 TripCount = TC;
2039 } else {
2040 // When avoiding a runtime check, the active.lane.mask inside the loop
2041 // uses a modified trip count and the induction variable increment is
2042 // done after the active.lane.mask intrinsic is called.
2043 IncrementValue = CanonicalIVPHI;
2045 {TC}, DL);
2046 }
2047 auto *EntryIncrement = Builder.createOverflowingOp(
2048 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2049 "index.part.next");
2050
2051 // Create the active lane mask instruction in the VPlan preheader.
2052 auto *EntryALM =
2053 Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
2054 DL, "active.lane.mask.entry");
2055
2056 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2057 // preheader ActiveLaneMask instruction.
2058 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
2059 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2060
2061 // Create the active lane mask for the next iteration of the loop before the
2062 // original terminator.
2063 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2064 Builder.setInsertPoint(OriginalTerminator);
2065 auto *InLoopIncrement =
2067 {IncrementValue}, {false, false}, DL);
2068 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2069 {InLoopIncrement, TripCount}, DL,
2070 "active.lane.mask.next");
2071 LaneMaskPhi->addOperand(ALM);
2072
2073 // Replace the original terminator with BranchOnCond. We have to invert the
2074 // mask here because a true condition means jumping to the exit block.
2075 auto *NotMask = Builder.createNot(ALM, DL);
2076 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2077 OriginalTerminator->eraseFromParent();
2078 return LaneMaskPhi;
2079}
2080
2081/// Collect the header mask with the pattern:
2082/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2083/// TODO: Introduce explicit recipe for header-mask instead of searching
2084/// for the header-mask pattern manually.
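/// For illustration (simplified dump notation, names are placeholders), the
/// pattern searched for looks like:
///   EMIT vp<%wide.iv> = WIDEN-CANONICAL-INDUCTION vp<%canonical.iv>
///   EMIT vp<%mask> = icmp ule vp<%wide.iv>, vp<%backedge.taken.count>
/// where vp<%mask> is the header mask returned by this function.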
2086 SmallVector<VPValue *> WideCanonicalIVs;
2087 auto *FoundWidenCanonicalIVUser =
2088 find_if(Plan.getCanonicalIV()->users(),
2089 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
2091 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
2092 1 &&
2093 "Must have at most one VPWideCanonicalIVRecipe");
2094 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2095 auto *WideCanonicalIV =
2096 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2097 WideCanonicalIVs.push_back(WideCanonicalIV);
2098 }
2099
2100 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2101 // version of the canonical induction.
2102 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2103 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2104 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2105 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2106 WideCanonicalIVs.push_back(WidenOriginalIV);
2107 }
2108
2109 // Walk users of wide canonical IVs and find the single compare of the form
2110 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2111 VPSingleDefRecipe *HeaderMask = nullptr;
2112 for (auto *Wide : WideCanonicalIVs) {
2113 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2114 auto *VPI = dyn_cast<VPInstruction>(U);
2115 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2116 continue;
2117
2118 assert(VPI->getOperand(0) == Wide &&
2119 "WidenCanonicalIV must be the first operand of the compare");
2120 assert(!HeaderMask && "Multiple header masks found?");
2121 HeaderMask = VPI;
2122 }
2123 }
2124 return HeaderMask;
2125}
2126
2128 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2131 UseActiveLaneMaskForControlFlow) &&
2132 "DataAndControlFlowWithoutRuntimeCheck implies "
2133 "UseActiveLaneMaskForControlFlow");
2134
2135 auto *FoundWidenCanonicalIVUser =
2136 find_if(Plan.getCanonicalIV()->users(),
2137 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
2138 assert(FoundWidenCanonicalIVUser &&
2139 "Must have widened canonical IV when tail folding!");
2140 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2141 auto *WideCanonicalIV =
2142 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2143 VPSingleDefRecipe *LaneMask;
2144 if (UseActiveLaneMaskForControlFlow) {
2147 } else {
2148 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2149 LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
2150 {WideCanonicalIV, Plan.getTripCount()}, nullptr,
2151 "active.lane.mask");
2152 }
2153
2154 // Walk users of WideCanonicalIV and replace the header mask of the form
2155 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2156 // removing the old one to ensure there is always only a single header mask.
2157 HeaderMask->replaceAllUsesWith(LaneMask);
2158 HeaderMask->eraseFromParent();
2159}
2160
2161/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2162/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2163/// recipe could be created.
2164/// \p HeaderMask Header Mask.
2165/// \p CurRecipe Recipe to be transformed.
2166/// \p TypeInfo VPlan-based type analysis.
2167/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2168/// \p EVL The explicit vector length parameter of vector-predication
2169/// intrinsics.
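/// For illustration (simplified notation, names are placeholders): a widened
/// load masked only by the header mask, e.g.
///   WIDEN ir<%l> = load vp<%addr>, vp<%header.mask>
/// becomes an EVL-based load that is predicated by vp<%evl> instead of the
/// header mask,
///   WIDEN ir<%l> = vp.load vp<%addr>, vp<%evl>
/// while any extra mask that was ANDed with the header mask is kept.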
2171 VPRecipeBase &CurRecipe,
2172 VPTypeAnalysis &TypeInfo,
2173 VPValue &AllOneMask, VPValue &EVL) {
2174 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2175 // header mask.
2176 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2177 assert(OrigMask && "Unmasked recipe when folding tail");
2178 // HeaderMask will be handled using EVL.
2179 VPValue *Mask;
2180 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2181 return Mask;
2182 return HeaderMask == OrigMask ? nullptr : OrigMask;
2183 };
2184
2185 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2186 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2187 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2188 if (!EndPtr)
2189 return Addr;
2190 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2191 "VPVectorEndPointerRecipe with non-VF VF operand?");
2192 assert(
2193 all_of(EndPtr->users(),
2194 [](VPUser *U) {
2195 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2196 }) &&
2197 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2198 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2199 EVLAddr->insertBefore(&CurRecipe);
2200 EVLAddr->setOperand(1, &EVL);
2201 return EVLAddr;
2202 };
2203
2206 VPValue *NewMask = GetNewMask(L->getMask());
2207 VPValue *NewAddr = GetNewAddr(L->getAddr());
2208 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2209 })
2210 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2211 VPValue *NewMask = GetNewMask(S->getMask());
2212 VPValue *NewAddr = GetNewAddr(S->getAddr());
2213 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2214 })
2215 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2216 VPValue *NewMask = GetNewMask(Red->getCondOp());
2217 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2218 })
2219 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2220 VPValue *LHS, *RHS;
2221 // Transform select with a header mask condition
2222 // select(header_mask, LHS, RHS)
2223 // into vector predication merge.
2224 // vp.merge(all-true, LHS, RHS, EVL)
2225 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2226 m_VPValue(RHS))))
2227 return nullptr;
2228 // Use all true as the condition because this transformation is
2229 // limited to selects whose condition is a header mask.
2230 return new VPWidenIntrinsicRecipe(
2231 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2232 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2233 })
2234 .Default([&](VPRecipeBase *R) { return nullptr; });
2235}
2236
2237/// Replace recipes with their EVL variants.
2239 VPTypeAnalysis TypeInfo(Plan);
2240 VPValue *AllOneMask = Plan.getTrue();
2241 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2242 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2243
2244 assert(all_of(Plan.getVF().users(),
2247 "User of VF that we can't transform to EVL.");
2248 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2249 return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
2250 });
2251
2252 assert(all_of(Plan.getVFxUF().users(),
2253 [&Plan](VPUser *U) {
2254 return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()),
2255 m_Specific(&Plan.getVFxUF()))) ||
2256 isa<VPWidenPointerInductionRecipe>(U);
2257 }) &&
2258 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2259 "increment of the canonical induction.");
2260 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2261 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2262 // canonical induction must not be updated.
2263 return isa<VPWidenPointerInductionRecipe>(U);
2264 });
2265
2266 // Defer erasing recipes till the end so that we don't invalidate the
2267 // VPTypeAnalysis cache.
2269
2270 // Create a scalar phi to track the previous EVL if the loop contains a
2271 // fixed-order recurrence.
2272 bool ContainsFORs =
2273 any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2274 if (ContainsFORs) {
2275 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2276 VPValue *MaxEVL = &Plan.getVF();
2277 // Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2278 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2279 MaxEVL = Builder.createScalarZExtOrTrunc(
2280 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2281 TypeInfo.inferScalarType(MaxEVL), DebugLoc());
2282
2283 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2284 VPValue *PrevEVL =
2285 Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
2286
2287 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2289 for (VPRecipeBase &R : *VPBB) {
2290 VPValue *V1, *V2;
2291 if (!match(&R,
2292 m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2293 m_VPValue(V1), m_VPValue(V2))))
2294 continue;
2295 VPValue *Imm = Plan.getOrAddLiveIn(
2298 Intrinsic::experimental_vp_splice,
2299 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2300 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2301 VPSplice->insertBefore(&R);
2302 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2303 ToErase.push_back(&R);
2304 }
2305 }
2306 }
2307
2308 VPValue *HeaderMask = findHeaderMask(Plan);
2309 if (!HeaderMask)
2310 return;
2311
2312 // Replace header masks with a mask equivalent to predicating by EVL:
2313 //
2314 // icmp ule widen-canonical-iv backedge-taken-count
2315 // ->
2316 // icmp ult step-vector, EVL
2317 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2318 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2319 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2320 VPValue *EVLMask = Builder.createICmp(
2322 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2323 HeaderMask->replaceAllUsesWith(EVLMask);
2324 ToErase.push_back(HeaderMask->getDefiningRecipe());
2325
2326 // Try to optimize header mask recipes away to their EVL variants.
2327 // TODO: Split optimizeMaskToEVL out and move into
2328 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2329 // tryToBuildVPlanWithVPRecipes beforehand.
2330 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2331 auto *CurRecipe = cast<VPRecipeBase>(U);
2332 VPRecipeBase *EVLRecipe =
2333 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2334 if (!EVLRecipe)
2335 continue;
2336
2337 [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2338 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2339 "New recipe must define the same number of values as the "
2340 "original.");
2341 assert(NumDefVal <= 1 &&
2342 "Only supports recipes with a single definition or without users.");
2343 EVLRecipe->insertBefore(CurRecipe);
2344 if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
2345 VPValue *CurVPV = CurRecipe->getVPSingleValue();
2346 CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
2347 }
2348 ToErase.push_back(CurRecipe);
2349 }
2350 // Remove dead EVL mask.
2351 if (EVLMask->getNumUsers() == 0)
2352 ToErase.push_back(EVLMask->getDefiningRecipe());
2353
2354 for (VPRecipeBase *R : reverse(ToErase)) {
2355 SmallVector<VPValue *> PossiblyDead(R->operands());
2356 R->eraseFromParent();
2357 for (VPValue *Op : PossiblyDead)
2359 }
2360}
2361
2362/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2363/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
2364/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation,
2365/// VPCanonicalIVPHIRecipe is used only for counting loop iterations.
2366///
2367/// The function uses the following definitions:
2368/// %StartV is the canonical induction start value.
2369///
2370/// The function adds the following recipes:
2371///
2372/// vector.ph:
2373/// ...
2374///
2375/// vector.body:
2376/// ...
2377/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2378/// [ %NextEVLIV, %vector.body ]
2379/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2380/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2381/// ...
2382/// %OpEVL = cast i32 %VPEVL to IVSize
2383/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2384/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2385/// ...
2386///
2387/// If MaxSafeElements is provided, the function adds the following recipes:
2388/// vector.ph:
2389/// ...
2390///
2391/// vector.body:
2392/// ...
2393/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2394/// [ %NextEVLIV, %vector.body ]
2395/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2396/// %cmp = cmp ult %AVL, MaxSafeElements
2397/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2398/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2399/// ...
2400/// %OpEVL = cast i32 %VPEVL to IVSize
2401/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2402/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2403/// ...
2404///
2406 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2408
2409 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2410 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2411 VPValue *StartV = CanonicalIVPHI->getStartValue();
2412
2413 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2414 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
2415 EVLPhi->insertAfter(CanonicalIVPHI);
2416 VPBuilder Builder(Header, Header->getFirstNonPhi());
2417 // Create the AVL (application vector length), starting from TC -> 0 in steps
2418 // of EVL.
2419 VPPhi *AVLPhi = Builder.createScalarPhi(
2420 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2421 VPValue *AVL = AVLPhi;
2422
2423 if (MaxSafeElements) {
2424 // Cap the AVL to MaxSafeElements to honor the maximum safe dependence
2425 // distance and keep the emitted loop correct.
2425 VPValue *AVLSafe =
2426 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2427 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2428 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
2429 }
2430 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2431 DebugLoc());
2432
2433 auto *CanonicalIVIncrement =
2434 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2435 Builder.setInsertPoint(CanonicalIVIncrement);
2436 VPValue *OpVPEVL = VPEVL;
2437
2438 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2439 OpVPEVL = Builder.createScalarZExtOrTrunc(
2440 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2441
2442 auto *NextEVLIV = Builder.createOverflowingOp(
2443 Instruction::Add, {OpVPEVL, EVLPhi},
2444 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2445 CanonicalIVIncrement->hasNoSignedWrap()},
2446 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2447 EVLPhi->addOperand(NextEVLIV);
2448
2449 VPValue *NextAVL = Builder.createOverflowingOp(
2450 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2451 DebugLoc::getCompilerGenerated(), "avl.next");
2452 AVLPhi->addOperand(NextAVL);
2453
2454 transformRecipestoEVLRecipes(Plan, *VPEVL);
2455
2456 // Replace all uses of VPCanonicalIVPHIRecipe by
2457 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2458 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2459 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2460 // TODO: support unroll factor > 1.
2461 Plan.setUF(1);
2462}
2463
2465 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2466 // There should be only one EVL PHI in the entire plan.
2467 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2468
2469 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2471 for (VPRecipeBase &R : VPBB->phis())
2472 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2473 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2474 EVLPhi = PhiR;
2475 }
2476
2477 // Early return if no EVL PHI is found.
2478 if (!EVLPhi)
2479 return;
2480
2481 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2482 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2483 VPValue *AVL;
2484 [[maybe_unused]] bool FoundAVL =
2485 match(EVLIncrement,
2486 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2487 assert(FoundAVL && "Didn't find AVL?");
2488
2489 // The AVL may be capped to a safe distance.
2490 VPValue *SafeAVL;
2491 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2492 AVL = SafeAVL;
2493
2494 VPValue *AVLNext;
2495 [[maybe_unused]] bool FoundAVLNext =
2496 match(AVL, m_VPInstruction<Instruction::PHI>(
2497 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2498 assert(FoundAVLNext && "Didn't find AVL backedge?");
2499
2500 // Convert EVLPhi to concrete recipe.
2501 auto *ScalarR =
2502 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2503 EVLPhi->getDebugLoc(), "evl.based.iv");
2504 EVLPhi->replaceAllUsesWith(ScalarR);
2505 EVLPhi->eraseFromParent();
2506
2507 // Replace CanonicalIVInc with EVL-PHI increment.
2508 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2509 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2510 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2511 m_Specific(&Plan.getVFxUF()))) &&
2512 "Unexpected canonical iv");
2513 Backedge->replaceAllUsesWith(EVLIncrement);
2514
2515 // Remove unused phi and increment.
2516 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2517 CanonicalIVIncrement->eraseFromParent();
2518 CanonicalIV->eraseFromParent();
2519
2520 // Replace the use of VectorTripCount in the latch-exiting block.
2521 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2522 // After: (branch-on-cond eq AVLNext, 0)
2523
2524 VPBasicBlock *LatchExiting =
2525 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2526 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2527 // Skip single-iteration loop region
2528 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2529 return;
2530 assert(LatchExitingBr &&
2531 match(LatchExitingBr,
2532 m_BranchOnCount(m_VPValue(EVLIncrement),
2533 m_Specific(&Plan.getVectorTripCount()))) &&
2534 "Unexpected terminator in EVL loop");
2535
2536 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2537 VPBuilder Builder(LatchExitingBr);
2538 VPValue *Cmp =
2539 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2542 LatchExitingBr->eraseFromParent();
2543}
2544
2546 VPlan &Plan,
2547 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2548 // Collect recipes in the backward slice of `Root` that may generate a poison
2549 // value that is used after vectorization.
2551 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2553 Worklist.push_back(Root);
2554
2555 // Traverse the backward slice of Root through its use-def chain.
2556 while (!Worklist.empty()) {
2557 VPRecipeBase *CurRec = Worklist.pop_back_val();
2558
2559 if (!Visited.insert(CurRec).second)
2560 continue;
2561
2562 // Prune search if we find another recipe generating a widen memory
2563 // instruction. Widen memory instructions involved in address computation
2564 // will lead to gather/scatter instructions, which don't need to be
2565 // handled.
2566 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2567 VPHeaderPHIRecipe>(CurRec))
2568 continue;
2569
2570 // This recipe contributes to the address computation of a widen
2571 // load/store. If the underlying instruction has poison-generating flags,
2572 // drop them directly.
2573 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2574 VPValue *A, *B;
2575 // Dropping disjoint from an OR may yield incorrect results, as some
2576 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2577 // for dependence analysis). Instead, replace it with an equivalent Add.
2578 // This is possible as all users of the disjoint OR only access lanes
2579 // where the operands are disjoint or poison otherwise.
2580 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
2581 RecWithFlags->isDisjoint()) {
2582 VPBuilder Builder(RecWithFlags);
2583 VPInstruction *New = Builder.createOverflowingOp(
2584 Instruction::Add, {A, B}, {false, false},
2585 RecWithFlags->getDebugLoc());
2586 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2587 RecWithFlags->replaceAllUsesWith(New);
2588 RecWithFlags->eraseFromParent();
2589 CurRec = New;
2590 } else
2591 RecWithFlags->dropPoisonGeneratingFlags();
2592 } else {
2593 Instruction *Instr = dyn_cast_or_null<Instruction>(
2594 CurRec->getVPSingleValue()->getUnderlyingValue());
2595 (void)Instr;
2596 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2597 "found instruction with poison generating flags not covered by "
2598 "VPRecipeWithIRFlags");
2599 }
2600
2601 // Add new definitions to the worklist.
2602 for (VPValue *Operand : CurRec->operands())
2603 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2604 Worklist.push_back(OpDef);
2605 }
2606 });
2607
2608 // Traverse all the recipes in the VPlan and collect the poison-generating
2609 // recipes in the backward slice starting at the address of a VPWidenRecipe or
2610 // VPInterleaveRecipe.
2611 auto Iter = vp_depth_first_deep(Plan.getEntry());
2612 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
2613 for (VPRecipeBase &Recipe : *VPBB) {
2614 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
2615 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2616 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2617 if (AddrDef && WidenRec->isConsecutive() &&
2618 BlockNeedsPredication(UnderlyingInstr.getParent()))
2619 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2620 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
2621 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2622 if (AddrDef) {
2623 // Check if any member of the interleave group needs predication.
2624 const InterleaveGroup<Instruction> *InterGroup =
2625 InterleaveRec->getInterleaveGroup();
2626 bool NeedPredication = false;
2627 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2628 I < NumMembers; ++I) {
2629 Instruction *Member = InterGroup->getMember(I);
2630 if (Member)
2631 NeedPredication |= BlockNeedsPredication(Member->getParent());
2632 }
2633
2634 if (NeedPredication)
2635 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2636 }
2637 }
2638 }
2639 }
2640}
2641
2643 VPlan &Plan,
2645 &InterleaveGroups,
2646 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2647 if (InterleaveGroups.empty())
2648 return;
2649
2650 // Interleave memory: for each Interleave Group we marked earlier as relevant
2651 // for this VPlan, replace the Recipes widening its memory instructions with a
2652 // single VPInterleaveRecipe at its insertion point.
2653 VPDominatorTree VPDT;
2654 VPDT.recalculate(Plan);
2655 for (const auto *IG : InterleaveGroups) {
2656 auto *Start =
2657 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
2658 VPIRMetadata InterleaveMD(*Start);
2659 SmallVector<VPValue *, 4> StoredValues;
2660 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
2661 StoredValues.push_back(StoreR->getStoredValue());
2662 for (unsigned I = 1; I < IG->getFactor(); ++I) {
2663 Instruction *MemberI = IG->getMember(I);
2664 if (!MemberI)
2665 continue;
2666 VPWidenMemoryRecipe *MemoryR =
2667 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
2668 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
2669 StoredValues.push_back(StoreR->getStoredValue());
2670 InterleaveMD.intersect(*MemoryR);
2671 }
2672
2673 bool NeedsMaskForGaps =
2674 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
2675 (!StoredValues.empty() && !IG->isFull());
2676
2677 Instruction *IRInsertPos = IG->getInsertPos();
2678 auto *InsertPos =
2679 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
2680
2682 if (auto *Gep = dyn_cast<GetElementPtrInst>(
2683 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
2684 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
2685
2686 // Get or create the start address for the interleave group.
2687 VPValue *Addr = Start->getAddr();
2688 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2689 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
2690 // We cannot re-use the address of member zero because it does not
2691 // dominate the insert position. Instead, use the address of the insert
2692 // position and create a PtrAdd adjusting it to the address of member
2693 // zero.
2694 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2695 // InsertPos or sink loads above zero members to join it.
2696 assert(IG->getIndex(IRInsertPos) != 0 &&
2697 "index of insert position shouldn't be zero");
2698 auto &DL = IRInsertPos->getDataLayout();
2699 APInt Offset(32,
2700 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
2701 IG->getIndex(IRInsertPos),
2702 /*IsSigned=*/true);
2703 VPValue *OffsetVPV =
2704 Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
2705 VPBuilder B(InsertPos);
2706 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
2707 }
2708 // If the group is reverse, adjust the index to refer to the last vector
2709 // lane instead of the first. We adjust the index from the first vector
2710 // lane, rather than directly getting the pointer for lane VF - 1, because
2711 // the pointer operand of the interleaved access is supposed to be uniform.
2712 if (IG->isReverse()) {
2713 auto *ReversePtr = new VPVectorEndPointerRecipe(
2714 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
2715 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
2716 ReversePtr->insertBefore(InsertPos);
2717 Addr = ReversePtr;
2718 }
2719 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
2720 InsertPos->getMask(), NeedsMaskForGaps,
2721 InterleaveMD, InsertPos->getDebugLoc());
2722 VPIG->insertBefore(InsertPos);
2723
2724 unsigned J = 0;
2725 for (unsigned i = 0; i < IG->getFactor(); ++i)
2726 if (Instruction *Member = IG->getMember(i)) {
2727 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
2728 if (!Member->getType()->isVoidTy()) {
2729 VPValue *OriginalV = MemberR->getVPSingleValue();
2730 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
2731 J++;
2732 }
2733 MemberR->eraseFromParent();
2734 }
2735 }
2736}
2737
2738/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
2739/// value, phi and backedge value. In the following example:
2740///
2741/// vector.ph:
2742/// Successor(s): vector loop
2743///
2744/// <x1> vector loop: {
2745/// vector.body:
2746/// WIDEN-INDUCTION %i = phi %start, %step, %vf
2747/// ...
2748/// EMIT branch-on-count ...
2749/// No successors
2750/// }
2751///
2752/// WIDEN-INDUCTION will get expanded to:
2753///
2754/// vector.ph:
2755/// ...
2756/// vp<%induction.start> = ...
2757/// vp<%induction.increment> = ...
2758///
2759/// Successor(s): vector loop
2760///
2761/// <x1> vector loop: {
2762/// vector.body:
2763/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
2764/// ...
2765/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
2766/// EMIT branch-on-count ...
2767/// No successors
2768/// }
2769static void
2771 VPTypeAnalysis &TypeInfo) {
2772 VPlan *Plan = WidenIVR->getParent()->getPlan();
2773 VPValue *Start = WidenIVR->getStartValue();
2774 VPValue *Step = WidenIVR->getStepValue();
2775 VPValue *VF = WidenIVR->getVFValue();
2776 DebugLoc DL = WidenIVR->getDebugLoc();
2777
2778 // The value from the original loop to which we are mapping the new induction
2779 // variable.
2780 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
2781
2782 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
2785 // FIXME: The newly created binary instructions should contain nsw/nuw
2786 // flags, which can be found from the original scalar operations.
2787 VPIRFlags Flags;
2788 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
2789 AddOp = Instruction::Add;
2790 MulOp = Instruction::Mul;
2791 } else {
2792 AddOp = ID.getInductionOpcode();
2793 MulOp = Instruction::FMul;
2794 Flags = ID.getInductionBinOp()->getFastMathFlags();
2795 }
2796
2797 // If the phi is truncated, truncate the start and step values.
2798 VPBuilder Builder(Plan->getVectorPreheader());
2799 Type *StepTy = TypeInfo.inferScalarType(Step);
2800 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
2801 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
2802 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
2803 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
2804 StepTy = Ty;
2805 }
2806
2807 // Construct the initial value of the vector IV in the vector loop preheader.
2808 Type *IVIntTy =
2810 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
2811 if (StepTy->isFloatingPointTy())
2812 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
2813
2814 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
2815 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
2816
2817 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
2818 Init =
2819 Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
2820
2821 // Create the widened phi of the vector IV.
2822 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
2823 WidenIVR->getDebugLoc(), "vec.ind");
2824 WidePHI->addOperand(Init);
2825 WidePHI->insertBefore(WidenIVR);
2826
2827 // Create the backedge value for the vector IV.
2828 VPValue *Inc;
2829 VPValue *Prev;
2830 // If unrolled, use the increment and prev value from the operands.
2831 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
2832 Inc = SplatVF;
2833 Prev = WidenIVR->getLastUnrolledPartOperand();
2834 } else {
2835 if (VPRecipeBase *R = VF->getDefiningRecipe())
2836 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
2837 // Multiply the vectorization factor by the step using integer or
2838 // floating-point arithmetic as appropriate.
2839 if (StepTy->isFloatingPointTy())
2840 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
2841 DL);
2842 else
2843 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
2844 TypeInfo.inferScalarType(VF), DL);
2845
2846 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
2847 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
2848 Prev = WidePHI;
2849 }
2850
2852 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
2853 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
2854 WidenIVR->getDebugLoc(), "vec.ind.next");
2855
2856 WidePHI->addOperand(Next);
2857
2858 WidenIVR->replaceAllUsesWith(WidePHI);
2859}
2860
2861/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
2862/// initial value, phi and backedge value. In the following example:
2863///
2864/// <x1> vector loop: {
2865/// vector.body:
2866/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
2867/// ...
2868/// EMIT branch-on-count ...
2869/// }
2870///
2871/// WIDEN-POINTER-INDUCTION will get expanded to:
2872///
2873/// <x1> vector loop: {
2874/// vector.body:
2875/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
2876/// EMIT %mul = mul %stepvector, %step
2877/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
2878/// ...
2879/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
2880/// EMIT branch-on-count ...
2881/// }
2883 VPTypeAnalysis &TypeInfo) {
2884 VPlan *Plan = R->getParent()->getPlan();
2885 VPValue *Start = R->getStartValue();
2886 VPValue *Step = R->getStepValue();
2887 VPValue *VF = R->getVFValue();
2888
2889 assert(R->getInductionDescriptor().getKind() ==
2891 "Not a pointer induction according to InductionDescriptor!");
2892 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
2893 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
2894 "Recipe should have been replaced");
2895
2896 VPBuilder Builder(R);
2897 DebugLoc DL = R->getDebugLoc();
2898
2899 // Build a scalar pointer phi.
2900 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
2901
2902 // Create actual address geps that use the pointer phi as base and a
2903 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
2904 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
2905 Type *StepTy = TypeInfo.inferScalarType(Step);
2906 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
2907 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
2908 VPValue *PtrAdd = Builder.createNaryOp(
2909 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
2910 R->replaceAllUsesWith(PtrAdd);
2911
2912 // Create the backedge value for the scalar pointer phi.
2914 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
2915 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
2916 DL);
2917 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
2918
2919 VPValue *InductionGEP =
2920 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
2921 ScalarPtrPhi->addOperand(InductionGEP);
2922}
2923
2925 // Replace loop regions with explicit CFG.
2926 SmallVector<VPRegionBlock *> LoopRegions;
2927 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
2928 vp_depth_first_deep(Plan.getEntry()))) {
2929 if (!R->isReplicator())
2930 LoopRegions.push_back(R);
2931 }
2932 for (VPRegionBlock *R : LoopRegions)
2933 R->dissolveToCFGLoop();
2934}
2935
2937 VPTypeAnalysis TypeInfo(Plan);
2939 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2940 vp_depth_first_deep(Plan.getEntry()))) {
2941 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2942 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
2943 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
2944 ToRemove.push_back(WidenIVR);
2945 continue;
2946 }
2947
2948 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
2949 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
2950 ToRemove.push_back(WidenIVR);
2951 continue;
2952 }
2953
2954 // Expand VPBlendRecipe into VPInstruction::Select.
2955 VPBuilder Builder(&R);
2956 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
2957 VPValue *Select = Blend->getIncomingValue(0);
2958 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2959 Select = Builder.createSelect(Blend->getMask(I),
2960 Blend->getIncomingValue(I), Select,
2961 R.getDebugLoc(), "predphi");
2962 Blend->replaceAllUsesWith(Select);
2963 ToRemove.push_back(Blend);
2964 }
2965
2966 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
2967 Expr->decompose();
2968 ToRemove.push_back(Expr);
2969 }
2970
2971 VPValue *VectorStep;
2972 VPValue *ScalarStep;
2973 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
2974 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
2975 continue;
2976
2977 // Expand WideIVStep.
2978 auto *VPI = cast<VPInstruction>(&R);
2979 Type *IVTy = TypeInfo.inferScalarType(VPI);
2980 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
2982 ? Instruction::UIToFP
2983 : Instruction::Trunc;
2984 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
2985 }
2986
2987 [[maybe_unused]] auto *ConstStep =
2988 ScalarStep->isLiveIn()
2989 ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
2990 : nullptr;
2991 assert(!ConstStep || ConstStep->getValue() != 1);
2992 (void)ConstStep;
2993 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
2994 ScalarStep =
2995 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
2996 }
2997
2998 VPIRFlags Flags;
2999 if (IVTy->isFloatingPointTy())
3000 Flags = {VPI->getFastMathFlags()};
3001
3002 unsigned MulOpc =
3003 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3004 VPInstruction *Mul = Builder.createNaryOp(
3005 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3006 VectorStep = Mul;
3007 VPI->replaceAllUsesWith(VectorStep);
3008 ToRemove.push_back(VPI);
3009 }
3010 }
3011
3012 for (VPRecipeBase *R : ToRemove)
3013 R->eraseFromParent();
3014}
3015
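// Illustrative CFG sketch for the early-exit handling below (block names
// follow the code; the layout is simplified):
//
//   vector loop latch
//         |
//   middle.split --(AnyOf(CondToEarlyExit))--> vector.early.exit --> early exit
//         |
//   original middle block (normal latch exit)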
3017 VPBasicBlock *EarlyExitVPBB,
3018 VPlan &Plan,
3019 VPBasicBlock *HeaderVPBB,
3020 VPBasicBlock *LatchVPBB) {
3021 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3022 if (!EarlyExitVPBB->getSinglePredecessor() &&
3023 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3024 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3025 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3026 "unsupported early exit VPBB");
3027 // The early exit operand should always be the last phi operand. If
3028 // EarlyExitVPBB has two predecessors and EarlyExitingVPBB is the first,
3029 // swap the operands of the phis.
3030 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3031 cast<VPIRPhi>(&R)->swapOperands();
3032 }
3033
3034 VPBuilder Builder(LatchVPBB->getTerminator());
3035 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3036 assert(
3037 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
3038 "Terminator must be be BranchOnCond");
3039 VPValue *CondOfEarlyExitingVPBB =
3040 EarlyExitingVPBB->getTerminator()->getOperand(0);
3041 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3042 ? CondOfEarlyExitingVPBB
3043 : Builder.createNot(CondOfEarlyExitingVPBB);
3044
3045 // Split the middle block and have it conditionally branch to the early exit
3046 // block if CondToEarlyExit.
3047 VPValue *IsEarlyExitTaken =
3048 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3049 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3050 VPBasicBlock *VectorEarlyExitVPBB =
3051 Plan.createVPBasicBlock("vector.early.exit");
3052 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3053 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3054 NewMiddle->swapSuccessors();
3055
3056 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3057
3058 // Update the exit phis in the early exit block.
3059 VPBuilder MiddleBuilder(NewMiddle);
3060 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3061 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3062 auto *ExitIRI = cast<VPIRPhi>(&R);
3063 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3064 // a single predecessor and 1 if it has two.
3065 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3066 if (ExitIRI->getNumOperands() != 1) {
3067 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3068 // predecessor. Extract its last lane.
3069 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3070 }
3071
3072 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3073 if (!IncomingFromEarlyExit->isLiveIn()) {
3074 // Update the incoming value from the early exit.
3075 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3076 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3077 "first.active.lane");
3078 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3079 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3080 nullptr, "early.exit.value");
3081 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3082 }
3083 }
3084 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3085
3086 // Replace the condition controlling the non-early exit from the vector loop
3087 // with one exiting if either the original condition of the vector latch is
3088 // true or the early exit has been taken.
3089 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3090 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3091 "Unexpected terminator");
3092 auto *IsLatchExitTaken =
3093 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3094 LatchExitingBranch->getOperand(1));
3095 auto *AnyExitTaken = Builder.createNaryOp(
3096 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3097 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3098 LatchExitingBranch->eraseFromParent();
3099}
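// Illustrative sketch (assumed textual recipe spellings, not part of this
// file) of the control flow produced above for a loop with one uncountable
// early exit; block and value names follow the ones created by this function:
//
//   vector loop latch:
//     EMIT branch-on-cond (or (any-of %cond.to.early.exit),
//                             (icmp eq %canonical.iv.next, %trip.count))
//   middle.split:
//     EMIT branch-on-cond (any-of %cond.to.early.exit)   ; to vector.early.exit
//   vector.early.exit:
//     EMIT %first.active.lane = first-active-lane %cond.to.early.exit
//     EMIT %early.exit.value  = extract-lane %first.active.lane, %incoming
//     ; feeds the exit phis of the original early exit block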
3100
3101/// This function tries to convert extended in-loop reductions to
3102/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial and
3103/// valid. The created recipe must be decomposed to its constituent
3104/// recipes before execution.
3105static VPExpressionRecipe *
3106 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3107 VFRange &Range) {
3108 Type *RedTy = Ctx.Types.inferScalarType(Red);
3109 VPValue *VecOp = Red->getVecOp();
3110
3111 // Clamp the range if using extended-reduction is profitable.
3112 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
3113 Type *SrcTy) -> bool {
3114 return LoopVectorizationPlanner::getDecisionAndClampRange(
3115 [&](ElementCount VF) {
3116 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3117 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3118 InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3119 Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),
3120 CostKind);
3121 InstructionCost ExtCost =
3122 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3123 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3124 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3125 },
3126 Range);
3127 };
3128
3129 VPValue *A;
3130 // Match reduce(ext(A)).
3131 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3132 IsExtendedRedValidAndClampRange(
3133 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3134 cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
3135 Instruction::CastOps::ZExt,
3136 Ctx.Types.inferScalarType(A)))
3137 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3138
3139 return nullptr;
3140}
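// Illustrative example (assumed types and recipes, not taken from this file):
// for an in-loop reduction of the form reduce.add(sext <16 x i8> %a to <16 x i32>),
// the pair
//   WIDEN-CAST %ext = sext %a to i32
//   REDUCE     %red = reduce.add(%ext)
// is wrapped into a single VPExpressionRecipe when the extended-reduction cost
// returned by TTI is valid and smaller than the cost of the extend plus the
// reduction recipe, so the backend can later select e.g. a single
// widening-reduction instruction.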
3141
3142/// This function tries to convert extended in-loop reductions to
3143/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial
3144/// and valid. The created VPExpressionRecipe must be decomposed to its
3145/// constituent recipes before execution. Patterns of the
3146/// VPExpressionRecipe:
3147/// reduce.add(mul(...)),
3148/// reduce.add(mul(ext(A), ext(B))),
3149/// reduce.add(ext(mul(ext(A), ext(B)))).
3150static VPExpressionRecipe *
3151 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3152 VPCostContext &Ctx, VFRange &Range) {
3153 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3154 if (Opcode != Instruction::Add)
3155 return nullptr;
3156
3157 Type *RedTy = Ctx.Types.inferScalarType(Red);
3158
3159 // Clamp the range if using multiply-accumulate-reduction is profitable.
3160 auto IsMulAccValidAndClampRange =
3161 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
3162 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
3163 return LoopVectorizationPlanner::getDecisionAndClampRange(
3164 [&](ElementCount VF) {
3165 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3166 Type *SrcTy =
3167 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3168 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3169 InstructionCost MulAccCost =
3170 Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
3171 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3172 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3173 InstructionCost ExtCost = 0;
3174 if (Ext0)
3175 ExtCost += Ext0->computeCost(VF, Ctx);
3176 if (Ext1)
3177 ExtCost += Ext1->computeCost(VF, Ctx);
3178 if (OuterExt)
3179 ExtCost += OuterExt->computeCost(VF, Ctx);
3180
3181 return MulAccCost.isValid() &&
3182 MulAccCost < ExtCost + MulCost + RedCost;
3183 },
3184 Range);
3185 };
3186
3187 VPValue *VecOp = Red->getVecOp();
3188 VPValue *A, *B;
3189 // Try to match reduce.add(mul(...)).
3190 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3191 auto *RecipeA =
3192 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3193 auto *RecipeB =
3194 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3195 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3196
3197 // Match reduce.add(mul(ext, ext)).
3198 if (RecipeA && RecipeB &&
3199 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
3200 match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3201 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3202 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
3203 Instruction::CastOps::ZExt,
3204 Mul, RecipeA, RecipeB, nullptr)) {
3205 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3206 }
3207 // Match reduce.add(mul).
3208 if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
3209 return new VPExpressionRecipe(Mul, Red);
3210 }
3211 // Match reduce.add(ext(mul(ext(A), ext(B)))).
3212 // All extend recipes must have the same opcode, or A == B, which can be
3213 // transformed to reduce.add(zext(mul(sext(A), sext(B)))).
3214 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
3215 m_ZExtOrSExt(m_VPValue()))))) {
3216 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3217 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3218 auto *Ext0 =
3219 cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
3220 auto *Ext1 =
3221 cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
3222 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3223 Ext0->getOpcode() == Ext1->getOpcode() &&
3224 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
3225 Instruction::CastOps::ZExt,
3226 Mul, Ext0, Ext1, Ext)) {
3227 auto *NewExt0 = new VPWidenCastRecipe(
3228 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3229 Ext0->getDebugLoc());
3230 NewExt0->insertBefore(Ext0);
3231
3232 VPWidenCastRecipe *NewExt1 = NewExt0;
3233 if (Ext0 != Ext1) {
3234 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3235 Ext->getResultType(), *Ext1,
3236 Ext1->getDebugLoc());
3237 NewExt1->insertBefore(Ext1);
3238 }
3239 Mul->setOperand(0, NewExt0);
3240 Mul->setOperand(1, NewExt1);
3241 Red->setOperand(1, Mul);
3242 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3243 }
3244 }
3245 return nullptr;
3246}
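// Illustrative example (assumed types and recipes, not taken from this file):
// a dot-product style in-loop reduction
//   WIDEN-CAST %a.ext = sext %a to i32
//   WIDEN-CAST %b.ext = sext %b to i32
//   WIDEN      %mul   = mul %a.ext, %b.ext
//   REDUCE     %red   = reduce.add(%mul)
// is bundled into one VPExpressionRecipe when the mul-accumulate reduction
// cost is valid and cheaper than the sum of the extend, multiply and reduction
// costs; for the reduce.add(ext(mul(ext, ext))) form, new extends to the outer
// extend's result type are created so the bundled multiply operates on the
// final reduction width.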
3247
3248/// This function tries to create abstract recipes from the reduction recipe
3249/// for the following optimizations and cost estimation.
3250 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3251 VPCostContext &Ctx,
3252 VFRange &Range) {
3253 VPExpressionRecipe *AbstractR = nullptr;
3254 auto IP = std::next(Red->getIterator());
3255 auto *VPBB = Red->getParent();
3256 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3257 AbstractR = MulAcc;
3258 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3259 AbstractR = ExtRed;
3260 // Cannot create abstract inloop reduction recipes.
3261 if (!AbstractR)
3262 return;
3263
3264 AbstractR->insertBefore(*VPBB, IP);
3265 Red->replaceAllUsesWith(AbstractR);
3266}
3267
3268 void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
3269 VFRange &Range) {
3270 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3271 vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
3272 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3273 if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
3274 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
3275 }
3276 }
3277}
3278
3279 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3280 if (Plan.hasScalarVFOnly())
3281 return;
3282
3283#ifndef NDEBUG
3284 VPDominatorTree VPDT;
3285 VPDT.recalculate(Plan);
3286#endif
3287
3288 SmallVector<VPValue *> VPValues;
3291 append_range(VPValues, Plan.getLiveIns());
3292 for (VPRecipeBase &R : *Plan.getEntry())
3293 append_range(VPValues, R.definedValues());
3294
3295 auto *VectorPreheader = Plan.getVectorPreheader();
3296 for (VPValue *VPV : VPValues) {
3298 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3299 isa<Constant>(VPV->getLiveInIRValue())))
3300 continue;
3301
3302 // Add explicit broadcast at the insert point that dominates all users.
3303 VPBasicBlock *HoistBlock = VectorPreheader;
3304 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3305 for (VPUser *User : VPV->users()) {
3306 if (User->usesScalars(VPV))
3307 continue;
3308 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3309 HoistPoint = HoistBlock->begin();
3310 else
3311 assert(VPDT.dominates(VectorPreheader,
3312 cast<VPRecipeBase>(User)->getParent()) &&
3313 "All users must be in the vector preheader or dominated by it");
3314 }
3315
3316 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3317 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3318 VPV->replaceUsesWithIf(Broadcast,
3319 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3320 return Broadcast != &U && !U.usesScalars(VPV);
3321 });
3322 }
3323}
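// Illustrative example (assumed names, not taken from this file): a
// non-constant live-in %x with at least one vector user inside the loop gets
// an explicit broadcast in the vector preheader,
//   vector.ph: EMIT %x.bc = broadcast %x
// and the vector users of %x are rewired to %x.bc, while users that only need
// scalar values keep using %x directly.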
3324
3325 void VPlanTransforms::materializeConstantVectorTripCount(
3326 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3327 PredicatedScalarEvolution &PSE) {
3328 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3329 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3330
3331 VPValue *TC = Plan.getTripCount();
3332 // Skip cases for which the trip count may be non-trivial to materialize.
3333 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3334 // tail is required.
3335 if (!Plan.hasScalarTail() ||
3336 Plan.getMiddleBlock()->getSingleSuccessor() ==
3337 Plan.getScalarPreheader() ||
3338 !TC->isLiveIn())
3339 return;
3340
3341 // Materialize vector trip counts for constants early if they can simply
3342 // be computed as (Original TC / (VF * UF)) * (VF * UF).
3343 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3344 // tail-folded loops.
3345 ScalarEvolution &SE = *PSE.getSE();
3346 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3347 if (!isa<SCEVConstant>(TCScev))
3348 return;
3349 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3350 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3351 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3352 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3353}
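// Worked example (assumed values): for a constant trip count of 17 with
// BestVF = 4 and BestUF = 2, VFxUF = 8 and the vector trip count folds to
// (17 udiv 8) * 8 = 16, which is recorded as the underlying IR value of
// Plan.getVectorTripCount().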
3354
3356 VPBasicBlock *VectorPH) {
3358 if (BTC->getNumUsers() == 0)
3359 return;
3360
3361 VPBuilder Builder(VectorPH, VectorPH->begin());
3362 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3363 auto *TCMO = Builder.createNaryOp(
3364 Instruction::Sub,
3365 {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
3366 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3367 BTC->replaceAllUsesWith(TCMO);
3368}
3369
3371 if (Plan.hasScalarVFOnly())
3372 return;
3373
3374 VPTypeAnalysis TypeInfo(Plan);
3375 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3376 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3378 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3379 vp_depth_first_shallow(LoopRegion->getEntry()));
3380 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
3381 // excluding ones in replicate regions. Those are not materialized explicitly
3382 // yet; their vector users are still handled in VPReplicateRegion::execute(),
3383 // via shouldPack().
3384 // TODO: materialize build vectors for replicating recipes in replicating
3385 // regions.
3386 // TODO: materialize build vectors for VPInstructions.
3387 for (VPBasicBlock *VPBB :
3388 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3389 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3390 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
3391 auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
3392 VPRegionBlock *ParentRegion =
3393 cast<VPRecipeBase>(U)->getParent()->getParent();
3394 return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
3395 };
3396 if (!RepR || RepR->isSingleScalar() ||
3397 none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
3398 continue;
3399
3400 Type *ScalarTy = TypeInfo.inferScalarType(RepR);
3401 unsigned Opcode = ScalarTy->isStructTy()
3402 ? VPInstruction::BuildStructVector
3403 : VPInstruction::BuildVector;
3404 auto *BuildVector = new VPInstruction(Opcode, {RepR});
3405 BuildVector->insertAfter(RepR);
3406
3407 RepR->replaceUsesWithIf(
3408 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3409 VPUser &U, unsigned) {
3410 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3411 });
3412 }
3413 }
3414}
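// Illustrative example (assumed recipes, not taken from this file): a
// replicating recipe producing one scalar per lane, e.g.
//   REPLICATE %r = call @foo(...)
// with a user that needs a vector operand gets an explicit
//   EMIT %r.bv = buildvector %r
// inserted right after it, and such users are rewired to %r.bv
// (BuildStructVector is used instead when %r has a struct result type).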
3415
3416 void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
3417 VPBasicBlock *VectorPHVPBB,
3418 bool TailByMasking,
3419 bool RequiresScalarEpilogue) {
3420 VPValue &VectorTC = Plan.getVectorTripCount();
3421 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3422 // There's nothing to do if there are no users of the vector trip count or its
3423 // IR value has already been set.
3424 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3425 return;
3426
3427 VPValue *TC = Plan.getTripCount();
3428 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3429 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3430 VPValue *Step = &Plan.getVFxUF();
3431
3432 // If the tail is to be folded by masking, round the number of iterations N
3433 // up to a multiple of Step instead of rounding down. This is done by first
3434 // adding Step-1 and then rounding down. Note that it's ok if this addition
3435 // overflows: the vector induction variable will eventually wrap to zero given
3436 // that it starts at zero and its Step is a power of two; the loop will then
3437 // exit, with the last early-exit vector comparison also producing all-true.
3438 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3439 // is accounted for in emitIterationCountCheck that adds an overflow check.
3440 if (TailByMasking) {
3441 TC = Builder.createNaryOp(
3442 Instruction::Add,
3443 {TC, Builder.createNaryOp(
3444 Instruction::Sub,
3445 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
3446 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3447 }
3448
3449 // Now we need to generate the expression for the part of the loop that the
3450 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3451 // iterations are not required for correctness, or N - Step, otherwise. Step
3452 // is equal to the vectorization factor (number of SIMD elements) times the
3453 // unroll factor (number of SIMD instructions).
3454 VPValue *R =
3455 Builder.createNaryOp(Instruction::URem, {TC, Step},
3456 DebugLoc::getCompilerGenerated(), "n.mod.vf");
3457
3458 // There are cases where we *must* run at least one iteration in the remainder
3459 // loop. See the cost model for when this can happen. If the step evenly
3460 // divides the trip count, we set the remainder to be equal to the step. If
3461 // the step does not evenly divide the trip count, no adjustment is necessary
3462 // since there will already be scalar iterations. Note that the minimum
3463 // iterations check ensures that N >= Step.
3464 if (RequiresScalarEpilogue) {
3465 assert(!TailByMasking &&
3466 "requiring scalar epilogue is not supported with tail folding");
3467 VPValue *IsZero = Builder.createICmp(
3468 CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
3469 R = Builder.createSelect(IsZero, Step, R);
3470 }
3471
3472 VPValue *Res = Builder.createNaryOp(
3473 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
3474 VectorTC.replaceAllUsesWith(Res);
3475}
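// Worked example (assumed values): for a trip count of 10 and Step = VFxUF = 8,
//   without tail folding:  n.mod.vf = 10 urem 8 = 2 and n.vec = 8;
//   with tail folding:     n.rnd.up = 10 + 7 = 17, n.mod.vf = 1, n.vec = 16;
//   with a required scalar epilogue and a trip count of 16: n.mod.vf = 0 is
//   replaced by Step via the select, giving n.vec = 16 - 8 = 8 so at least one
//   iteration is left for the scalar remainder loop.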
3476
3477 void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
3478 ElementCount VFEC) {
3479 VPBuilder Builder(VectorPH, VectorPH->begin());
3480 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3481 VPValue &VF = Plan.getVF();
3482 VPValue &VFxUF = Plan.getVFxUF();
3483 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
3484 // used.
3485 // TODO: Assert that they aren't used.
3486
3487 // If there are no users of the runtime VF, compute VFxUF by constant folding
3488 // the multiplication of VF and UF.
3489 if (VF.getNumUsers() == 0) {
3490 VPValue *RuntimeVFxUF =
3491 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
3492 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
3493 return;
3494 }
3495
3496 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
3497 // vscale) * UF.
3498 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
3500 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
3501 VF.replaceUsesWithIf(
3502 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
3503 }
3504 VF.replaceAllUsesWith(RuntimeVF);
3505
3506 VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
3507 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
3508 VFxUF.replaceAllUsesWith(MulByUF);
3509}
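// Worked example (assumed values): for a scalable VF of 4 and UF = 2, if the
// runtime VF has no users then VFxUF is materialized directly as 8 * vscale;
// otherwise VF is materialized as 4 * vscale (broadcast for any vector users)
// and VFxUF as (4 * vscale) * 2.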
3510
3511 DenseMap<const SCEV *, Value *>
3512 VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
3513 const DataLayout &DL = SE.getDataLayout();
3514 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
3515
3516 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
3517 BasicBlock *EntryBB = Entry->getIRBasicBlock();
3518 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
3519 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
3520 if (isa<VPIRInstruction, VPIRPhi>(&R))
3521 continue;
3522 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3523 if (!ExpSCEV)
3524 break;
3525 const SCEV *Expr = ExpSCEV->getSCEV();
3526 Value *Res =
3527 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
3528 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
3529 VPValue *Exp = Plan.getOrAddLiveIn(Res);
3530 ExpSCEV->replaceAllUsesWith(Exp);
3531 if (Plan.getTripCount() == ExpSCEV)
3532 Plan.resetTripCount(Exp);
3533 ExpSCEV->eraseFromParent();
3534 }
3535 assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
3536 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
3537 "after any VPIRInstructions");
3538 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
3539 // to the VPIRBasicBlock.
3540 auto EI = Entry->begin();
3541 for (Instruction &I : drop_end(*EntryBB)) {
3542 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
3543 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
3544 EI++;
3545 continue;
3546 }
3547 Entry->appendRecipe(VPIRInstruction::create(I));
3548 }
3549
3550 return ExpandedSCEVs;
3551}
3552
3553/// Returns true if \p V is a VPWidenLoadRecipe or VPInterleaveRecipe that can
3554/// be converted to a narrower recipe. \p V is used by a wide recipe that
3555/// feeds a store interleave group at index \p Idx; \p WideMember0 is the
3556/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe
3557/// can be narrowed to an index-independent load if it feeds all wide ops at
3558/// all indices (\p OpV must be the operand at index \p OpIdx both for the
3559/// current recipe and for \p WideMember0, the recipe at lane 0). A
3560/// VPInterleaveRecipe can be narrowed to a wide load if \p V is defined at \p Idx of a load interleave group.
3561static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
3562 VPValue *OpV, unsigned Idx) {
3563 auto *DefR = OpV->getDefiningRecipe();
3564 if (!DefR)
3565 return WideMember0->getOperand(OpIdx) == OpV;
3566 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
3567 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
3568
3569 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
3570 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
3571 return false;
3572}
3573
3574/// Returns true if \p InterleaveR is a full interleave group with factor and
3575/// number of members both equal to \p VF. The interleave group must also
3576/// access the full vector width \p VectorRegWidth.
3577 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3578 unsigned VF, VPTypeAnalysis &TypeInfo,
3579 unsigned VectorRegWidth) {
3580 if (!InterleaveR)
3581 return false;
3582
3583 Type *GroupElementTy = nullptr;
3584 if (InterleaveR->getStoredValues().empty()) {
3585 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
3586 if (!all_of(InterleaveR->definedValues(),
3587 [&TypeInfo, GroupElementTy](VPValue *Op) {
3588 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3589 }))
3590 return false;
3591 } else {
3592 GroupElementTy =
3593 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
3594 if (!all_of(InterleaveR->getStoredValues(),
3595 [&TypeInfo, GroupElementTy](VPValue *Op) {
3596 return TypeInfo.inferScalarType(Op) == GroupElementTy;
3597 }))
3598 return false;
3599 }
3600
3601 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3602 auto IG = InterleaveR->getInterleaveGroup();
3603 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3604 GroupSize == VectorRegWidth;
3605}
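// Worked example (assumed values): an interleave group of i32 members with
// factor 4 and 4 members at VF = 4 spans 4 * 32 = 128 bits, so it counts as
// consecutive for a 128-bit vector register width; with i64 members the group
// would span 256 bits and be rejected for the same register width.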
3606
3607/// Returns true if \p VPV is a narrow VPValue.
3608static bool isAlreadyNarrow(VPValue *VPV) {
3609 if (VPV->isLiveIn())
3610 return true;
3611 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
3612 return RepR && RepR->isSingleScalar();
3613}
3614
3615 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3616 unsigned VectorRegWidth) {
3617 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3618 if (VF.isScalable() || !VectorLoop)
3619 return;
3620
3621 VPTypeAnalysis TypeInfo(Plan);
3622
3623 unsigned FixedVF = VF.getFixedValue();
3624 SmallVector<VPInterleaveRecipe *> StoreGroups;
3625 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3626 if (isa<VPCanonicalIVPHIRecipe>(&R) ||
3627 match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
3628 continue;
3629
3630 if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
3631 vputils::onlyFirstLaneUsed(cast<VPSingleDefRecipe>(&R)))
3632 continue;
3633
3634 // Bail out on recipes not supported at the moment:
3635 // * phi recipes other than the canonical induction
3636 // * recipes writing to memory except interleave groups
3637 // Only support plans with a canonical induction phi.
3638 if (R.isPhi())
3639 return;
3640
3641 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
3642 if (R.mayWriteToMemory() && !InterleaveR)
3643 return;
3644
3645 // All other ops are allowed, but we reject uses that cannot be converted
3646 // when checking all allowed consumers (store interleave groups) below.
3647 if (!InterleaveR)
3648 continue;
3649
3650 // Bail out on non-consecutive interleave groups.
3651 if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
3652 VectorRegWidth))
3653 return;
3654
3655 // Skip read interleave groups.
3656 if (InterleaveR->getStoredValues().empty())
3657 continue;
3658
3659 // Narrow interleave groups, if all operands are already matching narrow
3660 // ops.
3661 auto *Member0 = InterleaveR->getStoredValues()[0];
3662 if (isAlreadyNarrow(Member0) &&
3663 all_of(InterleaveR->getStoredValues(),
3664 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
3665 StoreGroups.push_back(InterleaveR);
3666 continue;
3667 }
3668
3669 // For now, we only support full interleave groups storing load interleave
3670 // groups.
3671 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
3672 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
3673 if (!DefR)
3674 return false;
3675 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
3676 return IR && IR->getInterleaveGroup()->isFull() &&
3677 IR->getVPValue(Op.index()) == Op.value();
3678 })) {
3679 StoreGroups.push_back(InterleaveR);
3680 continue;
3681 }
3682
3683 // Check if all values feeding InterleaveR are matching wide recipes whose
3684 // operands can be narrowed.
3685 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
3686 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
3687 if (!WideMember0)
3688 return;
3689 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
3690 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
3691 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
3692 R->getNumOperands() > 2)
3693 return;
3694 if (any_of(enumerate(R->operands()),
3695 [WideMember0, Idx = I](const auto &P) {
3696 const auto &[OpIdx, OpV] = P;
3697 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
3698 }))
3699 return;
3700 }
3701 StoreGroups.push_back(InterleaveR);
3702 }
3703
3704 if (StoreGroups.empty())
3705 return;
3706
3707 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
3708 auto NarrowOp = [](VPValue *V) -> VPValue * {
3709 auto *R = V->getDefiningRecipe();
3710 if (!R)
3711 return V;
3712 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
3713 // Narrow interleave group to wide load, as transformed VPlan will only
3714 // process one original iteration.
3715 auto *L = new VPWidenLoadRecipe(
3716 *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
3717 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
3718 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
3719 L->insertBefore(LoadGroup);
3720 return L;
3721 }
3722
3723 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
3724 assert(RepR->isSingleScalar() &&
3725 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
3726 "must be a single scalar load");
3727 return RepR;
3728 }
3729 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
3730
3731 VPValue *PtrOp = WideLoad->getAddr();
3732 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
3733 PtrOp = VecPtr->getOperand(0);
3734 // Narrow wide load to uniform scalar load, as transformed VPlan will only
3735 // process one original iteration.
3736 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
3737 /*IsUniform*/ true,
3738 /*Mask*/ nullptr, *WideLoad);
3739 N->insertBefore(WideLoad);
3740 return N;
3741 };
3742
3743 // Narrow operation tree rooted at store groups.
3744 for (auto *StoreGroup : StoreGroups) {
3745 VPValue *Res = nullptr;
3746 VPValue *Member0 = StoreGroup->getStoredValues()[0];
3747 if (isAlreadyNarrow(Member0)) {
3748 Res = Member0;
3749 } else if (auto *WideMember0 =
3750 dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
3751 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
3752 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
3753 Res = WideMember0;
3754 } else {
3755 Res = NarrowOp(Member0);
3756 }
3757
3758 auto *S = new VPWidenStoreRecipe(
3759 *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
3760 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
3761 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
3762 S->insertBefore(StoreGroup);
3763 StoreGroup->eraseFromParent();
3764 }
3765
3766 // Adjust induction to reflect that the transformed plan only processes one
3767 // original iteration.
3768 auto *CanIV = Plan.getCanonicalIV();
3769 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
3770 Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
3771 CanIV->getScalarType(), 1 * Plan.getUF())));
3772 Plan.getVF().replaceAllUsesWith(
3773 Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
3774 removeDeadRecipes(Plan);
3775 assert(none_of(*VectorLoop->getEntryBasicBlock(),
3776 IsaPred<VPVectorPointerRecipe>) &&
3777 "All VPVectorPointerRecipes should have been removed");
3778}
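// Illustrative example (assumed recipes, not taken from this file): for a
// plain interleaved copy where a factor-4 load group feeds a factor-4 store
// group at VF = 4, the load group is narrowed to a single consecutive wide
// load and the store group to a single consecutive wide store, and the
// canonical IV increment is reduced from VF * UF to UF so the transformed plan
// processes one original iteration per vector iteration.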
3779
3780/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
3781/// BranchOnCond recipe.
3782 void VPlanTransforms::addBranchWeightToMiddleTerminator(
3783 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
3784 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
3785 auto *MiddleTerm =
3786 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
3787 // Only add branch metadata if there is a (conditional) terminator.
3788 if (!MiddleTerm)
3789 return;
3790
3791 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
3792 "must have a BranchOnCond");
3793 // Assume that `TripCount % VectorStep` is equally distributed.
3794 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
3795 if (VF.isScalable() && VScaleForTuning.has_value())
3796 VectorStep *= *VScaleForTuning;
3797 assert(VectorStep > 0 && "trip count should not be zero");
3798 MDBuilder MDB(Plan.getContext());
3799 MDNode *BranchWeights =
3800 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
3801 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
3802}
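// Worked example (assumed values): for VF = 4 and UF = 2 the vector step is 8,
// so the middle block's BranchOnCond gets !prof branch_weights of {1, 7} on
// its two successors, modeling that TripCount % 8 is zero in roughly 1 of 8
// cases; for scalable VF the step is additionally scaled by VScaleForTuning
// when it is known.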
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Addr
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
iv Induction Variable Users
Definition: IVUsers.cpp:48
licm
Definition: LICM.cpp:381
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static Value * tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Class for arbitrary precision integers.
Definition: APInt.h:78
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition: DebugLoc.h:163
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:203
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition: Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:949
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
Value * FoldGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, GEPNoWrapFlags NW) const override
Value * FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS) const override
Value * FoldSelect(Value *C, Value *True, Value *False) const override
Value * FoldCast(Instruction::CastOps Op, Value *V, Type *DestTy) const override
Value * FoldCmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override
bool isCast() const
Definition: Instruction.h:321
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:86
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:524
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:594
uint32_t getNumMembers() const
Definition: VectorUtils.h:542
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1567
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition: MDBuilder.cpp:38
Metadata node.
Definition: Metadata.h:1077
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:99
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
unsigned getOpcode() const
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
Provides information about what library functions are available for the current target.
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
LLVM_ABI InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:261
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
op_range operands()
Definition: User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition: VPlan.h:3352
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3639
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3714
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3666
iterator end()
Definition: VPlan.h:3676
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3674
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3727
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:236
VPRegionBlock * getEnclosingLoopRegion()
Definition: VPlan.cpp:622
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition: VPlan.cpp:594
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition: VPlan.cpp:667
const VPRecipeBase & back() const
Definition: VPlan.h:3688
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2374
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:81
VPRegionBlock * getParent()
Definition: VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:180
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:319
size_t getNumPredecessors() const
Definition: VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition: VPlan.h:204
VPlan * getPlan()
Definition: VPlan.cpp:155
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:160
VPBlockBase * getSingleHierarchicalPredecessor()
Definition: VPlan.h:261
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:198
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlanUtils.h:237
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition: VPlanUtils.h:156
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:175
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:194
A recipe for generating conditional branches on the bits of a mask.
Definition: VPlan.h:2809
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
VPScalarIVStepsRecipe * createScalarIVSteps(Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step, VPValue *VF, DebugLoc DL)
VPInstruction * createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:3295
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:3322
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition: VPlanValue.h:421
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:416
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:394
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:406
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition: VPlan.h:3383
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition: VPlan.h:2854
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1999
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1988
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3792
Class to record and manage LLVM IR flags.
Definition: VPlan.h:596
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition: VPlan.h:926
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:967
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition: VPlan.h:1040
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:973
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition: VPlan.h:993
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition: VPlan.h:990
@ CanonicalIVIncrementForPart
Definition: VPlan.h:983
@ CalculateTripCountMinusVF
Definition: VPlan.h:981
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2443
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2500
const InterleaveGroup< Instruction > * getInterleaveGroup()
Definition: VPlan.h:2520
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition: VPlan.h:2966
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:391
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition: VPlan.h:474
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPBasicBlock * getParent()
Definition: VPlan.h:412
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition: VPlan.h:2687
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2541
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3827
const VPBlockBase * getEntry() const
Definition: VPlan.h:3863
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition: VPlan.h:3880
const VPBlockBase * getExiting() const
Definition: VPlan.h:3875
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition: VPlan.h:3888
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2731
bool isSingleScalar() const
Definition: VPlan.h:2776
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition: VPlan.h:2800
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition: VPlan.h:3529
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:518
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:582
virtual VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:43
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Definition: VPlanAnalysis.h:67
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
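Recipes do not carry result types themselves, so transforms query VPTypeAnalysis when they need one. A minimal sketch, assuming Plan and Def are in scope and that the VPTypeAnalysis constructor taking the plan is available:
  // Requires VPlanAnalysis.h. Plan and Def are assumed to be in scope.
  VPTypeAnalysis TypeInfo(Plan);
  Type *Ty = TypeInfo.inferScalarType(Def);
  if (Ty->isIntegerTy(1)) {
    // Def is an i1 mask; mask-specific simplifications would apply here.
  }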
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:197
operand_range operands()
Definition: VPlanValue.h:265
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:241
operand_iterator op_end()
Definition: VPlanValue.h:263
operand_iterator op_begin()
Definition: VPlanValue.h:261
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:236
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:230
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:125
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:174
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition: VPlanValue.h:184
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1403
unsigned getNumUsers() const
Definition: VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:169
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1407
user_range users()
Definition: VPlanValue.h:134
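These def-use helpers mirror the IR-level Value API and are what simplifications use to rewire users. A minimal sketch, assuming hypothetical values Old and New that compute the same thing and a hypothetical user Keep that must be left untouched:
  // Requires VPlan.h. Old, New and Keep are illustrative names only.
  // Full rewiring:
  Old->replaceAllUsesWith(New);
  assert(Old->getNumUsers() == 0 && "all uses should have been rewired");
  // Or, partial rewiring via a predicate over each (user, operand index):
  Old->replaceUsesWithIf(New, [Keep](VPUser &U, unsigned) { return &U != Keep; });
After full rewiring, removeDeadRecipes can erase the now-unused defining recipe.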
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition: VPlan.h:1817
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition: VPlan.h:1861
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3424
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1467
A recipe for handling GEP instructions.
Definition: VPlan.h:1753
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:2013
PHINode * getPHINode() const
Definition: VPlan.h:2055
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:2041
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:2058
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:2088
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition: VPlan.h:2169
A recipe for widening vector intrinsics.
Definition: VPlan.h:1524
A common base class for widening memory operations.
Definition: VPlan.h:3008
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:3070
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:3063
A recipe for widened phis.
Definition: VPlan.h:2224
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1424
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3930
bool hasVF(ElementCount VF) const
Definition: VPlan.h:4139
LLVMContext & getContext() const
Definition: VPlan.h:4127
VPBasicBlock * getEntry()
Definition: VPlan.h:4029
VPRegionBlock * createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="", bool IsReplicator=false)
Create a new VPRegionBlock with Entry, Exiting and Name.
Definition: VPlan.h:4270
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:4119
bool hasScalableVF() const
Definition: VPlan.h:4140
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:4125
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:4122
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:4091
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition: VPlan.h:4196
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:4112
unsigned getUF() const
Definition: VPlan.h:4159
bool hasUF(unsigned UF) const
Definition: VPlan.h:4157
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition: VPlan.h:4081
void setVF(ElementCount VF)
Definition: VPlan.h:4133
bool isUnrolled() const
Returns true if the VPlan has already been unrolled, i.e.
Definition: VPlan.h:4172
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1037
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:4105
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:4054
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition: VPlan.h:4260
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:4181
bool hasScalarVFOnly() const
Definition: VPlan.h:4150
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:4072
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition: VPlan.h:4211
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:4235
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:4077
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition: VPlan.h:4034
void setUF(unsigned UF)
Definition: VPlan.h:4164
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition: VPlan.h:4313
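Most transforms start by checking which of the plan-level properties above hold. A hedged sketch of such a guard; the helper name and the particular combination of checks are assumptions for illustration:
  // Requires VPlan.h. Hypothetical guard used before a vector-only rewrite.
  static bool planSupportsWideRewrite(VPlan &Plan) {
    if (Plan.hasScalarVFOnly())
      return false;                      // nothing will be widened anyway
    if (!Plan.hasScalarTail())
      return false;                      // tail is folded into the vector loop
    // Once loop regions have been dissolved there is no region to return.
    return Plan.getVectorLoopRegion() != nullptr;
  }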
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:390
iterator_range< user_iterator > users()
Definition: Value.h:426
bool hasName() const
Definition: Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:259
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:175
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
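ElementCount arithmetic appears wherever a transform reasons about VF and UF together. A small sketch of the common queries, assuming an unsigned unroll factor UF in scope:
  // Requires llvm/Support/TypeSize.h.
  ElementCount VF = ElementCount::getScalable(4);      // models <vscale x 4 x ...>
  ElementCount Lanes = VF.multiplyCoefficientBy(UF);   // VF * UF lanes per vector iteration
  uint64_t MinLanes = Lanes.getKnownMinValue();        // lower bound; exact when !Lanes.isScalable()
  bool RuntimeOnly = Lanes.isScalable();               // true: actual count depends on vscale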
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsigned-divided by B, rounded by the given rounding mode.
Definition: APInt.cpp:2763
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:980
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount, Op0_t, Op1_t > m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::BranchOnCond, Op0_t > m_BranchOnCond(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
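The matchers above are the VPlan-level analogue of llvm::PatternMatch and drive most recipe simplifications. A hedged sketch of inspecting a block terminator, assuming a VPBasicBlock pointer VPBB and a preceding using-directive for VPlanPatternMatch:
  // Requires VPlanPatternMatch.h.
  if (VPRecipeBase *Term = VPBB->getTerminator()) {
    if (match(Term, m_BranchOnCond(m_True()))) {
      // The condition is the constant i1 true: the false edge is dead.
    } else if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
      // A canonical latch branch comparing an IV against a trip count.
    }
  }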
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
Definition: VPlanUtils.h:44
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:32
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:79
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
Definition: VPlanUtils.cpp:17
bool isHeaderMask(const VPValue *V, VPlan &Plan)
Return true if V is a header mask in Plan.
Definition: VPlanUtils.cpp:55
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
Definition: VPlanUtils.cpp:27
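These queries are how transforms decide between producing full vectors and per-lane scalars for a value. A hedged sketch, with Def an assumed VPValue under inspection and the reactions purely illustrative:
  // Requires VPlanUtils.h.
  if (vputils::onlyFirstLaneUsed(Def)) {
    // Every user reads only lane 0, so a single scalar per part suffices.
  } else if (vputils::onlyScalarValuesUsed(Def)) {
    // Users need per-lane scalars but never the packed vector value.
  }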
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:243
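The shallow and deep traversals are the two ways transforms walk the plan's hybrid CFG: shallow stays at the current nesting level, while deep descends into regions. A hedged sketch that combines a deep walk with early-increment iteration so recipes can be erased in place; it follows the spirit of removeDeadRecipes but is not its actual implementation:
  // Requires VPlan.h and VPlanCFG.h.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (R.mayHaveSideEffects())
        continue;
      // Dead if it defines exactly one value and nobody uses it.
      if (R.getNumDefinedValues() == 1 &&
          R.getVPSingleValue()->getNumUsers() == 0)
        R.eraseFromParent();
    }
  }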
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition: VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:77
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Definition: SmallVector.h:1300
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:345
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
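Together, these helpers recover the element type and address of an IR memory access and build the corresponding vector type, which is the kind of query widened memory recipes depend on. A small sketch, assuming Inst is a LoadInst or StoreInst and VF an ElementCount:
  Type *ScalarTy = getLoadStoreType(Inst);               // element type read or written
  const Value *Ptr = getLoadStorePointerOperand(Inst);   // the address operand
  Type *WideTy = toVectorTy(ScalarTy, VF);               // <VF x ScalarTy>, scalable if VF is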
@ Default
The result values are uniform if and only if all operands are uniform.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition: Casting.h:836
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlanHelpers.h:71
Struct to hold various analysis needed for cost computations.
Definition: VPlanHelpers.h:344
VPTypeAnalysis Types
Definition: VPlanHelpers.h:347
const TargetTransformInfo & TTI
Definition: VPlanHelpers.h:345
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2267
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition: VPlan.h:3130
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:3090
A recipe for widening select instructions.
Definition: VPlan.h:1707
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition: VPlan.h:3212
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:3170
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
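runPass is the uniform entry point callers use to invoke the individual transforms listed below. A hedged sketch of typical invocation, assuming Plan is a VPlanPtr and that a runPass overload matching each transform's void return type is available:
  // Requires VPlanTransforms.h.
  VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, *Plan);
  VPlanTransforms::runPass(VPlanTransforms::simplifyRecipes, *Plan);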
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.