LLVM: lib/Transforms/Vectorize/VPlanTransforms.cpp Source File

//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

///

/// \file

/// This file implements a set of utility VPlan to VPlan transformations.

///

//===----------------------------------------------------------------------===//


#include "VPlanTransforms.h"

#include "VPRecipeBuilder.h"

#include "VPlan.h"

#include "VPlanAnalysis.h"

#include "VPlanCFG.h"

#include "VPlanDominatorTree.h"

#include "VPlanHelpers.h"

#include "VPlanPatternMatch.h"

#include "VPlanUtils.h"

#include "VPlanVerifier.h"

#include "llvm/ADT/APInt.h"

#include "llvm/ADT/PostOrderIterator.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/ADT/SetVector.h"

#include "llvm/ADT/TypeSwitch.h"

#include "llvm/Analysis/IVDescriptors.h"

#include "llvm/Analysis/InstSimplifyFolder.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/VectorUtils.h"

#include "llvm/IR/Intrinsics.h"

#include "llvm/IR/MDBuilder.h"

#include "llvm/Support/Casting.h"

#include "llvm/Support/TypeSize.h"

#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"


using namespace llvm;

using namespace VPlanPatternMatch;


bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(

    VPlanPtr &Plan,

    function_ref<const InductionDescriptor *(PHINode *)>

        GetIntOrFpInductionDescriptor,

    const TargetLibraryInfo &TLI) {


  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(

      Plan->getVectorLoopRegion());

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {

    // Skip blocks outside region

    if (!VPBB->getParent())

      break;

    VPRecipeBase *Term = VPBB->getTerminator();

    auto EndIter = Term ? Term->getIterator() : VPBB->end();

    // Introduce each ingredient into VPlan.

    for (VPRecipeBase &Ingredient :

         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {


      VPValue *VPV = Ingredient.getVPSingleValue();

      if (!VPV->getUnderlyingValue())

        continue;


      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());


      VPRecipeBase *NewRecipe = nullptr;

      if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {

        auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());

        const auto *II = GetIntOrFpInductionDescriptor(Phi);

        if (!II) {

          NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());

          for (VPValue *Op : PhiR->operands())

            NewRecipe->addOperand(Op);

        } else {

          VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());

          VPValue *Step =

              vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep());

          NewRecipe = new VPWidenIntOrFpInductionRecipe(

              Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());

        }

      } else {

        assert(isa<VPInstruction>(&Ingredient) &&

               "only VPInstructions expected here");

        assert(!isa<PHINode>(Inst) && "phis should be handled above");

        // Create VPWidenMemoryRecipe for loads and stores.

        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {

          NewRecipe = new VPWidenLoadRecipe(

              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,

              false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),

              Ingredient.getDebugLoc());

        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {

          NewRecipe = new VPWidenStoreRecipe(

              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),

              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,

              VPIRMetadata(*Store), Ingredient.getDebugLoc());

        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {

          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());

        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {

          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);

          if (VectorID == Intrinsic::not_intrinsic)

            return false;

          NewRecipe = new VPWidenIntrinsicRecipe(

              *CI, getVectorIntrinsicIDForCall(CI, &TLI),

              {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),

              CI->getDebugLoc());

        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {

          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());

        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {

          NewRecipe = new VPWidenCastRecipe(

              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);

        } else {

          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());

        }

      }


      NewRecipe->insertBefore(&Ingredient);

      if (NewRecipe->getNumDefinedValues() == 1)

        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());

      else

        assert(NewRecipe->getNumDefinedValues() == 0 &&

               "Only recpies with zero or one defined values expected");

      Ingredient.eraseFromParent();

    }

  }

  return true;

}


static bool sinkScalarOperands(VPlan &Plan) {

  auto Iter = vp_depth_first_deep(Plan.getEntry());

  bool Changed = false;

  // First, collect the operands of all recipes in replicate blocks as seeds for

  // sinking.

  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;

  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {

    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();

    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)

      continue;

    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);

    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())

      continue;

    for (auto &Recipe : *VPBB) {

      for (VPValue *Op : Recipe.operands())

        if (auto *Def =

                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))

          WorkList.insert({VPBB, Def});

    }

  }


  bool ScalarVFOnly = Plan.hasScalarVFOnly();

  // Try to sink each replicate or scalar IV steps recipe in the worklist.

  for (unsigned I = 0; I != WorkList.size(); ++I) {

    VPBasicBlock *SinkTo;

    VPSingleDefRecipe *SinkCandidate;

    std::tie(SinkTo, SinkCandidate) = WorkList[I];

    if (SinkCandidate->getParent() == SinkTo ||

        SinkCandidate->mayHaveSideEffects() ||

        SinkCandidate->mayReadOrWriteMemory())

      continue;

    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {

      if (!ScalarVFOnly && RepR->isSingleScalar())

        continue;

    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))

      continue;


    bool NeedsDuplicating = false;

    // All recipe users of the sink candidate must be in the same block SinkTo

    // or all users outside of SinkTo must be uniform-after-vectorization (

    // i.e., only first lane is used) . In the latter case, we need to duplicate

    // SinkCandidate.

    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,

                            SinkCandidate](VPUser *U) {

      auto *UI = cast<VPRecipeBase>(U);

      if (UI->getParent() == SinkTo)

        return true;

      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);

      // We only know how to duplicate VPReplicateRecipes and

      // VPScalarIVStepsRecipes for now.

      return NeedsDuplicating &&

             isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);

    };

    if (!all_of(SinkCandidate->users(), CanSinkWithUser))

      continue;


    if (NeedsDuplicating) {

      if (ScalarVFOnly)

        continue;

      VPSingleDefRecipe *Clone;

      if (auto *SinkCandidateRepR =

              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {

        // TODO: Handle converting to uniform recipes as separate transform,

        // then cloning should be sufficient here.

        Instruction *I = SinkCandidate->getUnderlyingInstr();

        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,

                                      nullptr /*Mask*/, *SinkCandidateRepR);

        // TODO: add ".cloned" suffix to name of Clone's VPValue.

      } else {

        Clone = SinkCandidate->clone();

      }


      Clone->insertBefore(SinkCandidate);

      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {

        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;

      });

    }

    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());

    for (VPValue *Op : SinkCandidate->operands())

      if (auto *Def =

              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))

        WorkList.insert({SinkTo, Def});

    Changed = true;

  }

  return Changed;

}


/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return

/// the mask.

VPValue *getPredicatedMask(VPRegionBlock *R) {

  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());

  if (!EntryBB || EntryBB->size() != 1 ||

      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))

    return nullptr;


  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);

}


/// If \p R is a triangle region, return the 'then' block of the triangle.

static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {

  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());

  if (EntryBB->getNumSuccessors() != 2)

    return nullptr;


  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);

  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);

  if (!Succ0 || !Succ1)

    return nullptr;


  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)

    return nullptr;

  if (Succ0->getSingleSuccessor() == Succ1)

    return Succ0;

  if (Succ1->getSingleSuccessor() == Succ0)

    return Succ1;

  return nullptr;

}


// Merge replicate regions in their successor region, if a replicate region

// is connected to a successor replicate region with the same predicate by a

// single, empty VPBasicBlock.

static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {

  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;


  // Collect replicate regions followed by an empty block, followed by another

  // replicate region with matching masks to process front. This is to avoid

  // iterator invalidation issues while merging regions.

  SmallVector<VPRegionBlock *, 8> WorkList;

  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(

           vp_depth_first_deep(Plan.getEntry()))) {

    if (!Region1->isReplicator())

      continue;

    auto *MiddleBasicBlock =

        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());

    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())

      continue;


    auto *Region2 =

        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    if (!Region2 || !Region2->isReplicator())

      continue;


    VPValue *Mask1 = getPredicatedMask(Region1);

    VPValue *Mask2 = getPredicatedMask(Region2);

    if (!Mask1 || Mask1 != Mask2)

      continue;


    assert(Mask1 && Mask2 && "both region must have conditions");

    WorkList.push_back(Region1);

  }


  // Move recipes from Region1 to its successor region, if both are triangles.

  for (VPRegionBlock *Region1 : WorkList) {

    if (TransformedRegions.contains(Region1))

      continue;

    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());

    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());


    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);

    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);

    if (!Then1 || !Then2)

      continue;


    // Note: No fusion-preventing memory dependencies are expected in either

    // region. Such dependencies should be rejected during earlier dependence

    // checks, which guarantee accesses can be re-ordered for vectorization.

    //

    // Move recipes to the successor region.

    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))

      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());


    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());

    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());


    // Move VPPredInstPHIRecipes from the merge block to the successor region's

    // merge block. Update all users inside the successor region to use the

    // original values.

    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {

      VPValue *PredInst1 =

          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);

      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();

      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {

        return cast<VPRecipeBase>(&U)->getParent() == Then2;

      });


      // Remove phi recipes that are unused after merging the regions.

      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {

        Phi1ToMove.eraseFromParent();

        continue;

      }

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());

    }


    // Remove the dead recipes in Region1's entry block.

    for (VPRecipeBase &R :

         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))

      R.eraseFromParent();


    // Finally, remove the first region.

    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {

      VPBlockUtils::disconnectBlocks(Pred, Region1);

      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);

    }

    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);

    TransformedRegions.insert(Region1);

  }


  return !TransformedRegions.empty();

}


static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,

                                            VPlan &Plan) {

  Instruction *Instr = PredRecipe->getUnderlyingInstr();

  // Build the triangular if-then region.

  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();

  assert(Instr->getParent() && "Predicated instruction not in any basic block");

  auto *BlockInMask = PredRecipe->getMask();

  auto *MaskDef = BlockInMask->getDefiningRecipe();

  auto *BOMRecipe = new VPBranchOnMaskRecipe(

      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());

  auto *Entry =

      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);


  // Replace predicated replicate recipe with a replicate recipe without a

  // mask but in the replicate region.

  auto *RecipeWithoutMask = new VPReplicateRecipe(

      PredRecipe->getUnderlyingInstr(),

      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),

      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);

  auto *Pred =

      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);


  VPPredInstPHIRecipe *PHIRecipe = nullptr;

  if (PredRecipe->getNumUsers() != 0) {

    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,

                                        RecipeWithoutMask->getDebugLoc());

    PredRecipe->replaceAllUsesWith(PHIRecipe);

    PHIRecipe->setOperand(0, RecipeWithoutMask);

  }

  PredRecipe->eraseFromParent();

  auto *Exiting =

      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);

  VPRegionBlock *Region =

      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);


  // Note: first set Entry as region entry and then connect successors starting

  // from it in order, to propagate the "parent" of each VPBasicBlock.

  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);

  VPBlockUtils::connectBlocks(Pred, Exiting);


  return Region;

}


static void addReplicateRegions(VPlan &Plan) {

  SmallVector<VPReplicateRecipe *> WorkList;

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_deep(Plan.getEntry()))) {

    for (VPRecipeBase &R : *VPBB)

      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {

        if (RepR->isPredicated())

          WorkList.push_back(RepR);

      }

  }


  unsigned BBNum = 0;

  for (VPReplicateRecipe *RepR : WorkList) {

    VPBasicBlock *CurrentBlock = RepR->getParent();

    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());


    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();

    SplitBlock->setName(

        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");

    // Record predicated instructions for above packing optimizations.

    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);

    Region->setParent(CurrentBlock->getParent());

    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);


    VPRegionBlock *ParentRegion = Region->getParent();

    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)

      ParentRegion->setExiting(SplitBlock);

  }

}


/// Remove redundant VPBasicBlocks by merging them into their predecessor if

/// the predecessor has a single successor.

static bool mergeBlocksIntoPredecessors(VPlan &Plan) {

  SmallVector<VPBasicBlock *> WorkList;

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_deep(Plan.getEntry()))) {

    // Don't fold the blocks in the skeleton of the Plan into their single

    // predecessors for now.

    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.

    if (!VPBB->getParent())

      continue;

    auto *PredVPBB =

        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());

    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||

        isa<VPIRBasicBlock>(PredVPBB))

      continue;

    WorkList.push_back(VPBB);

  }


  for (VPBasicBlock *VPBB : WorkList) {

    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());

    for (VPRecipeBase &R : make_early_inc_range(*VPBB))

      R.moveBefore(*PredVPBB, PredVPBB->end());

    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);

    auto *ParentRegion = VPBB->getParent();

    if (ParentRegion && ParentRegion->getExiting() == VPBB)

      ParentRegion->setExiting(PredVPBB);

    for (auto *Succ : to_vector(VPBB->successors())) {

      VPBlockUtils::disconnectBlocks(VPBB, Succ);

      VPBlockUtils::connectBlocks(PredVPBB, Succ);

    }

    // VPBB is now dead and will be cleaned up when the plan gets destroyed.

  }

  return !WorkList.empty();

}


void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {

  // Convert masked VPReplicateRecipes to if-then region blocks.

  addReplicateRegions(Plan);


  bool ShouldSimplify = true;

  while (ShouldSimplify) {

    ShouldSimplify = sinkScalarOperands(Plan);

    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);

    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);

  }

}


/// Remove redundant casts of inductions.

///

/// Such redundant casts are casts of induction variables that can be ignored,

/// because we already proved that the casted phi is equal to the uncasted phi

/// in the vectorized loop. There is no need to vectorize the cast - the same

/// value can be used for both the phi and casts in the vector loop.

static void removeRedundantInductionCasts(VPlan &Plan) {

  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {

    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!IV || IV->getTruncInst())

      continue;


    // A sequence of IR Casts has potentially been recorded for IV, which

    // *must be bypassed* when the IV is vectorized, because the vectorized IV

    // will produce the desired casted value. This sequence forms a def-use

    // chain and is provided in reverse order, ending with the cast that uses

    // the IV phi. Search for the recipe of the last cast in the chain and

    // replace it with the original IV. Note that only the final cast is

    // expected to have users outside the cast-chain and the dead casts left

    // over will be cleaned up later.

    auto &Casts = IV->getInductionDescriptor().getCastInsts();

    VPValue *FindMyCast = IV;

    for (Instruction *IRCast : reverse(Casts)) {

      VPSingleDefRecipe *FoundUserCast = nullptr;

      for (auto *U : FindMyCast->users()) {

        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);

        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {

          FoundUserCast = UserCast;

          break;

        }

      }

      FindMyCast = FoundUserCast;

    }

    FindMyCast->replaceAllUsesWith(IV);

  }

}


/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV

/// recipe, if it exists.

static void removeRedundantCanonicalIVs(VPlan &Plan) {

  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();

  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;

  for (VPUser *U : CanonicalIV->users()) {

    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);

    if (WidenNewIV)

      break;

  }


  if (!WidenNewIV)

    return;


  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();

  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {

    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);


    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())

      continue;


    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides

    // everything WidenNewIV's users need. That is, WidenOriginalIV will

    // generate a vector phi or all users of WidenNewIV demand the first lane

    // only.

    if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||

        vputils::onlyFirstLaneUsed(WidenNewIV)) {

      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);

      WidenNewIV->eraseFromParent();

      return;

    }

  }

}


/// Returns true if \p R is dead and can be removed.

static bool isDeadRecipe(VPRecipeBase &R) {

  // Do remove conditional assume instructions as their conditions may be

  // flattened.

  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);

  bool IsConditionalAssume = RepR && RepR->isPredicated() &&

                             match(RepR, m_Intrinsic<Intrinsic::assume>());

  if (IsConditionalAssume)

    return true;


  if (R.mayHaveSideEffects())

    return false;


  // Recipe is dead if no user keeps the recipe alive.

  return all_of(R.definedValues(),

                [](VPValue *V) { return V->getNumUsers() == 0; });

}


void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_post_order_deep(Plan.getEntry()))) {

    // The recipes in the block are processed in reverse order, to catch chains

    // of dead recipes.

    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {

      if (isDeadRecipe(R)) {

        R.eraseFromParent();

        continue;

      }


      // Check if R is a dead VPPhi <-> update cycle and remove it.

      auto *PhiR = dyn_cast<VPPhi>(&R);

      if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)

        continue;

      VPValue *Incoming = PhiR->getOperand(1);

      if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||

          Incoming->getNumUsers() != 1)

        continue;

      PhiR->replaceAllUsesWith(PhiR->getOperand(0));

      PhiR->eraseFromParent();

      Incoming->getDefiningRecipe()->eraseFromParent();

    }

  }

}


static VPScalarIVStepsRecipe *

createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,

                    Instruction::BinaryOps InductionOpcode,

                    FPMathOperator *FPBinOp, Instruction *TruncI,

                    VPValue *StartV, VPValue *Step, DebugLoc DL,

                    VPBuilder &Builder) {

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();

  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();

  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(

      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");


  // Truncate base induction if needed.

  VPTypeAnalysis TypeInfo(Plan);

  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);

  if (TruncI) {

    Type *TruncTy = TruncI->getType();

    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&

           "Not truncating.");

    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");

    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);

    ResultTy = TruncTy;

  }


  // Truncate step if needed.

  Type *StepTy = TypeInfo.inferScalarType(Step);

  if (ResultTy != StepTy) {

    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&

           "Not truncating.");

    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");

    auto *VecPreheader =

        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());

    VPBuilder::InsertPointGuard Guard(Builder);

    Builder.setInsertPoint(VecPreheader);

    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);

  }

  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,

                                     &Plan.getVF(), DL);

}


static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {

  SetVector<VPUser *> Users(llvm::from_range, V->users());

  for (unsigned I = 0; I != Users.size(); ++I) {

    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);

    if (isa<VPHeaderPHIRecipe>(Cur))

      continue;

    for (VPValue *V : Cur->definedValues())

      Users.insert_range(V->users());

  }

  return Users.takeVector();

}


/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd

/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as

/// VPWidenPointerInductionRecipe will generate vectors only. If some users

/// require vectors while other require scalars, the scalar uses need to extract

/// the scalars from the generated vectors (Note that this is different to how

/// int/fp inductions are handled). Legalize extract-from-ends using uniform

/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so

/// the correct end value is available. Also optimize

/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by

/// providing them scalar steps built on the canonical scalar IV and update the

/// original IV's users. This is an optional optimization to reduce the needs of

/// vector extracts.

static void legalizeAndOptimizeInductions(VPlan &Plan) {

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();

  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();

  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());

  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {

    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);

    if (!PhiR)

      continue;


    // Try to narrow wide and replicating recipes to uniform recipes, based on

    // VPlan analysis.

    // TODO: Apply to all recipes in the future, to replace legacy uniformity

    // analysis.

    auto Users = collectUsersRecursively(PhiR);

    for (VPUser *U : reverse(Users)) {

      auto *Def = dyn_cast<VPSingleDefRecipe>(U);

      auto *RepR = dyn_cast<VPReplicateRecipe>(U);

      // Skip recipes that shouldn't be narrowed.

      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||

          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||

          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))

        continue;


      // Skip recipes that may have other lanes than their first used.

      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))

        continue;


      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),

                                          Def->operands(), /*IsUniform*/ true);

      Clone->insertAfter(Def);

      Def->replaceAllUsesWith(Clone);

    }


    // Replace wide pointer inductions which have only their scalars used by

    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).

    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {

      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))

        continue;


      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();

      VPValue *StartV =

          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));

      VPValue *StepV = PtrIV->getOperand(1);

      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(

          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,

          nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);


      VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,

                                             PtrIV->getDebugLoc(), "next.gep");


      PtrIV->replaceAllUsesWith(PtrAdd);

      continue;

    }


    // Replace widened induction with scalar steps for users that only use

    // scalars.

    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {

          return U->usesScalars(WideIV);

        }))

      continue;


    const InductionDescriptor &ID = WideIV->getInductionDescriptor();

    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(

        Plan, ID.getKind(), ID.getInductionOpcode(),

        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),

        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),

        WideIV->getDebugLoc(), Builder);


    // Update scalar users of IV to use Step instead.

    if (!HasOnlyVectorVFs)

      WideIV->replaceAllUsesWith(Steps);

    else

      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {

        return U.usesScalars(WideIV);

      });

  }

}


/// Check if \p VPV is an untruncated wide induction, either before or after the

/// increment. If so return the header IV (before the increment), otherwise

/// return null.

static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,

                                                  ScalarEvolution &SE) {

  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);

  if (WideIV) {

    // VPV itself is a wide induction, separately compute the end value for exit

    // users if it is not a truncated IV.

    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);

    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;

  }


  // Check if VPV is an optimizable induction increment.

  VPRecipeBase *Def = VPV->getDefiningRecipe();

  if (!Def || Def->getNumOperands() != 2)

    return nullptr;

  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));

  if (!WideIV)

    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));

  if (!WideIV)

    return nullptr;


  auto IsWideIVInc = [&]() {

    auto &ID = WideIV->getInductionDescriptor();


    // Check if VPV increments the induction by the induction step.

    VPValue *IVStep = WideIV->getStepValue();

    switch (ID.getInductionOpcode()) {

    case Instruction::Add:

      return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));

    case Instruction::FAdd:

      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),

                                                      m_Specific(IVStep)));

    case Instruction::FSub:

      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),

                                                    m_Specific(IVStep)));

    case Instruction::Sub: {

      // IVStep will be the negated step of the subtraction. Check if Step == -1

      // * IVStep.

      VPValue *Step;

      if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))

        return false;

      const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);

      const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);

      return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&

             !isa<SCEVCouldNotCompute>(StepSCEV) &&

             IVStepSCEV == SE.getNegativeSCEV(StepSCEV);

    }

    default:

      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&

             match(VPV, m_GetElementPtr(m_Specific(WideIV),

                                        m_Specific(WideIV->getStepValue())));

    }

    llvm_unreachable("should have been covered by switch above");

  };

  return IsWideIVInc() ? WideIV : nullptr;

}


/// Attempts to optimize the induction variable exit values for users in the

/// early exit block.

static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,

                                               VPTypeAnalysis &TypeInfo,

                                               VPBlockBase *PredVPBB,

                                               VPValue *Op,

                                               ScalarEvolution &SE) {

  VPValue *Incoming, *Mask;

  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(

                     m_VPInstruction<VPInstruction::FirstActiveLane>(

                         m_VPValue(Mask)),

                     m_VPValue(Incoming))))

    return nullptr;


  auto *WideIV = getOptimizableIVOf(Incoming, SE);

  if (!WideIV)

    return nullptr;


  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);

  if (WideIntOrFp && WideIntOrFp->getTruncInst())

    return nullptr;


  // Calculate the final index.

  VPValue *EndValue = Plan.getCanonicalIV();

  auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();

  VPBuilder B(cast<VPBasicBlock>(PredVPBB));


  DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();

  VPValue *FirstActiveLane =

      B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);

  Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);

  FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,

                                              FirstActiveLaneType, DL);

  EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);


  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it

  // changed it means the exit is using the incremented value, so we need to

  // add the step.

  if (Incoming != WideIV) {

    VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));

    EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);

  }


  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();

    VPValue *Start = WideIV->getStartValue();

    VPValue *Step = WideIV->getStepValue();

    EndValue = B.createDerivedIV(

        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),

        Start, EndValue, Step);

  }


  return EndValue;

}


/// Attempts to optimize the induction variable exit values for users in the

/// exit block coming from the latch in the original scalar loop.

static VPValue *optimizeLatchExitInductionUser(

    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,

    DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {

  VPValue *Incoming;

  if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))

    return nullptr;


  auto *WideIV = getOptimizableIVOf(Incoming, SE);

  if (!WideIV)

    return nullptr;


  VPValue *EndValue = EndValues.lookup(WideIV);

  assert(EndValue && "end value must have been pre-computed");


  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it

  // changed it means the exit is using the incremented value, so we don't

  // need to subtract the step.

  if (Incoming != WideIV)

    return EndValue;


  // Otherwise, subtract the step from the EndValue.

  VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());

  VPValue *Step = WideIV->getStepValue();

  Type *ScalarTy = TypeInfo.inferScalarType(WideIV);

  if (ScalarTy->isIntegerTy())

    return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");

  if (ScalarTy->isPointerTy()) {

    Type *StepTy = TypeInfo.inferScalarType(Step);

    auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));

    return B.createPtrAdd(EndValue,

                          B.createNaryOp(Instruction::Sub, {Zero, Step}), {},

                          "ind.escape");

  }

  if (ScalarTy->isFloatingPointTy()) {

    const auto &ID = WideIV->getInductionDescriptor();

    return B.createNaryOp(

        ID.getInductionBinOp()->getOpcode() == Instruction::FAdd

            ? Instruction::FSub

            : Instruction::FAdd,

        {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});

  }

  llvm_unreachable("all possible induction types must be handled");

  return nullptr;

}


void VPlanTransforms::optimizeInductionExitUsers(

    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,

    ScalarEvolution &SE) {

  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();

  VPTypeAnalysis TypeInfo(Plan);

  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {

    for (VPRecipeBase &R : ExitVPBB->phis()) {

      auto *ExitIRI = cast<VPIRPhi>(&R);


      for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {

        VPValue *Escape = nullptr;

        if (PredVPBB == MiddleVPBB)

          Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,

                                                  ExitIRI->getOperand(Idx),

                                                  EndValues, SE);

        else

          Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,

                                                  ExitIRI->getOperand(Idx), SE);

        if (Escape)

          ExitIRI->setOperand(Idx, Escape);

      }

    }

  }

}


/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing

/// them with already existing recipes expanding the same SCEV expression.

static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {

  DenseMap<const SCEV *, VPValue *> SCEV2VPV;


  for (VPRecipeBase &R :

       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {

    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);

    if (!ExpR)

      continue;


    const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);

    if (Inserted)

      continue;

    ExpR->replaceAllUsesWith(V->second);

    ExpR->eraseFromParent();

  }

}


static void recursivelyDeleteDeadRecipes(VPValue *V) {

  SmallVector<VPValue *> WorkList;

  SmallPtrSet<VPValue *, 8> Seen;

  WorkList.push_back(V);


  while (!WorkList.empty()) {

    VPValue *Cur = WorkList.pop_back_val();

    if (!Seen.insert(Cur).second)

      continue;

    VPRecipeBase *R = Cur->getDefiningRecipe();

    if (!R)

      continue;

    if (!isDeadRecipe(*R))

      continue;

    WorkList.append(R->op_begin(), R->op_end());

    R->eraseFromParent();

  }

}


/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a

/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are

/// foldable live-ins.

static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,

                               ArrayRef<VPValue *> Operands,

                               const DataLayout &DL, VPTypeAnalysis &TypeInfo) {

  SmallVector<Value *, 4> Ops;

  for (VPValue *Op : Operands) {

    if (!Op->isLiveIn() || !Op->getLiveInIRValue())

      return nullptr;

    Ops.push_back(Op->getLiveInIRValue());

  }


  InstSimplifyFolder Folder(DL);

  if (Instruction::isBinaryOp(Opcode))

    return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],

                            Ops[1]);

  if (Instruction::isCast(Opcode))

    return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],

                           TypeInfo.inferScalarType(R.getVPSingleValue()));

  switch (Opcode) {

  case VPInstruction::LogicalAnd:

    return Folder.FoldSelect(Ops[0], Ops[1],

                             ConstantInt::getNullValue(Ops[1]->getType()));

  case VPInstruction::Not:

    return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],

                            Constant::getAllOnesValue(Ops[0]->getType()));

  case Instruction::Select:

    return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);

  case Instruction::ICmp:

  case Instruction::FCmp:

    return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],

                          Ops[1]);

  case Instruction::GetElementPtr: {

    auto &RFlags = cast<VPRecipeWithIRFlags>(R);

    auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());

    return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),

                          RFlags.getGEPNoWrapFlags());

  }

  case VPInstruction::PtrAdd:

  case VPInstruction::WidePtrAdd:

    return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],

                          Ops[1],

                          cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());

  // An extract of a live-in is an extract of a broadcast, so return the

  // broadcasted element.

  case Instruction::ExtractElement:

    assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");

    return Ops[0];

  }

  return nullptr;

}


/// Try to simplify recipe \p R.

static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {

  VPlan *Plan = R.getParent()->getPlan();


  auto *Def = dyn_cast<VPSingleDefRecipe>(&R);

  if (!Def)

    return;


  // Simplification of live-in IR values for SingleDef recipes using

  // InstSimplifyFolder.

  if (TypeSwitch<VPRecipeBase *, bool>(&R)

          .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,

                VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) {

            const DataLayout &DL =

                Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();

            Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,

                                        TypeInfo);

            if (V)

              I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));

            return V;

          })

          .Default([](auto *) { return false; }))

    return;


  // Fold PredPHI LiveIn -> LiveIn.

  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {

    VPValue *Op = PredPHI->getOperand(0);

    if (Op->isLiveIn())

      PredPHI->replaceAllUsesWith(Op);

  }


  VPValue *A;

  if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {

    Type *TruncTy = TypeInfo.inferScalarType(Def);

    Type *ATy = TypeInfo.inferScalarType(A);

    if (TruncTy == ATy) {

      Def->replaceAllUsesWith(A);

    } else {

      // Don't replace a scalarizing recipe with a widened cast.

      if (isa<VPReplicateRecipe>(Def))

        return;

      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {


        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))

                                 ? Instruction::SExt

                                 : Instruction::ZExt;

        auto *VPC =

            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);

        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {

          // UnderlyingExt has distinct return type, used to retain legacy cost.

          VPC->setUnderlyingValue(UnderlyingExt);

        }

        VPC->insertBefore(&R);

        Def->replaceAllUsesWith(VPC);

      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {

        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);

        VPC->insertBefore(&R);

        Def->replaceAllUsesWith(VPC);

      }

    }

#ifndef NDEBUG

    // Verify that the cached type info is for both A and its users is still

    // accurate by comparing it to freshly computed types.

    VPTypeAnalysis TypeInfo2(*Plan);

    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));

    for (VPUser *U : A->users()) {

      auto *R = cast<VPRecipeBase>(U);

      for (VPValue *VPV : R->definedValues())

        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));

    }

#endif

  }


  // Simplify (X && Y) || (X && !Y) -> X.

  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X

  // && (Y || Z) and (X || !X) into true. This requires queuing newly created

  // recipes to be visited during simplification.

  VPValue *X, *Y;

  if (match(Def,

            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),

                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {

    Def->replaceAllUsesWith(X);

    Def->eraseFromParent();

    return;

  }


  // OR x, 1 -> 1.

  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {

    Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)

                                                    : Def->getOperand(0));

    Def->eraseFromParent();

    return;

  }


  // AND x, 0 -> 0

  if (match(&R, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {

    Def->replaceAllUsesWith(R.getOperand(0) == X ? R.getOperand(1)

                                                 : R.getOperand(0));

    return;

  }


  if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))

    return Def->replaceAllUsesWith(X);


  // select !c, x, y -> select c, y, x

  VPValue *C;

  if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {

    Def->setOperand(0, C);

    Def->setOperand(1, Y);

    Def->setOperand(2, X);

    return;

  }


  if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))

    return Def->replaceAllUsesWith(A);


  if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0))))

    return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)

                                                        : R.getOperand(0));


  if (match(Def, m_Not(m_VPValue(A)))) {

    if (match(A, m_Not(m_VPValue(A))))

      return Def->replaceAllUsesWith(A);


    // Try to fold Not into compares by adjusting the predicate in-place.

    CmpPredicate Pred;

    if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {

      auto *Cmp = cast<VPRecipeWithIRFlags>(A);

      if (all_of(Cmp->users(), [&Cmp](VPUser *U) {

            return match(U, m_CombineOr(m_Not(m_Specific(Cmp)),

                                        m_Select(m_Specific(Cmp), m_VPValue(),

                                                 m_VPValue())));

          })) {

        Cmp->setPredicate(CmpInst::getInversePredicate(Pred));

        for (VPUser *U : to_vector(Cmp->users())) {

          auto *R = cast<VPSingleDefRecipe>(U);

          if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {

            // select (cmp pred), x, y -> select (cmp inv_pred), y, x

            R->setOperand(1, Y);

            R->setOperand(2, X);

          } else {

            // not (cmp pred) -> cmp inv_pred

            assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");

            R->replaceAllUsesWith(Cmp);

          }

        }

        // If Cmp doesn't have a debug location, use the one from the negation,

        // to preserve the location.

        if (!Cmp->getDebugLoc() && R.getDebugLoc())

          Cmp->setDebugLoc(R.getDebugLoc());

      }

    }

  }


  // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.

  if ((match(Def,

             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||

       match(Def,

             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&

      TypeInfo.inferScalarType(Def->getOperand(1)) ==

          TypeInfo.inferScalarType(Def))

    return Def->replaceAllUsesWith(Def->getOperand(1));


  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(

                     m_VPValue(X), m_SpecificInt(1)))) {

    Type *WideStepTy = TypeInfo.inferScalarType(Def);

    if (TypeInfo.inferScalarType(X) != WideStepTy)

      X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);

    Def->replaceAllUsesWith(X);

    return;

  }


  // For i1 vp.merges produced by AnyOf reductions:

  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl

  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),

                                                  m_VPValue(X), m_VPValue())) &&

      match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&

      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {

    Def->setOperand(1, Def->getOperand(0));

    Def->setOperand(0, Y);

    return;

  }


  if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {

    if (Phi->getOperand(0) == Phi->getOperand(1))

      Def->replaceAllUsesWith(Phi->getOperand(0));

    return;

  }


  // Look through ExtractLastElement (BuildVector ....).

  if (match(&R, m_ExtractLastElement(m_BuildVector()))) {

    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));

    Def->replaceAllUsesWith(

        BuildVector->getOperand(BuildVector->getNumOperands() - 1));

    return;

  }


  // Look through ExtractPenultimateElement (BuildVector ....).

  if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(

                    m_BuildVector()))) {

    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));

    Def->replaceAllUsesWith(

        BuildVector->getOperand(BuildVector->getNumOperands() - 2));

    return;

  }


  if (auto *Phi = dyn_cast<VPPhi>(Def)) {

    if (Phi->getNumOperands() == 1)

      Phi->replaceAllUsesWith(Phi->getOperand(0));

    return;

  }


  // Some simplifications can only be applied after unrolling. Perform them

  // below.

  if (!Plan->isUnrolled())

    return;


  // VPVectorPointer for part 0 can be replaced by their start pointer.

  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {

    if (VecPtr->isFirstPart()) {

      VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));

      return;

    }

  }


  // VPScalarIVSteps for part 0 can be replaced by their start value, if only

  // the first lane is demanded.

  if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {

    if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {

      Steps->replaceAllUsesWith(Steps->getOperand(0));

      return;

    }

  }

  // Simplify redundant ReductionStartVector recipes after unrolling.

  VPValue *StartV;

  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(

                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {

    Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {

      auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);

      return PhiR && PhiR->isInLoop();

    });

    return;

  }


  if (match(Def, m_ExtractLastElement(m_Broadcast(m_VPValue(A))))) {

    Def->replaceAllUsesWith(A);

    return;

  }


  VPInstruction *OpVPI;

  if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) &&

      OpVPI->isVectorToScalar()) {

    Def->replaceAllUsesWith(OpVPI);

    return;

  }

}


void VPlanTransforms::simplifyRecipes(VPlan &Plan) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(

      Plan.getEntry());

  VPTypeAnalysis TypeInfo(Plan);

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {

    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      simplifyRecipe(R, TypeInfo);

    }

  }

}


static void narrowToSingleScalarRecipes(VPlan &Plan) {

  if (Plan.hasScalarVFOnly())

    return;


  // Try to narrow wide and replicating recipes to single scalar recipes,

  // based on VPlan analysis. Only process blocks in the loop region for now,

  // without traversing into nested regions, as recipes in replicate regions

  // cannot be converted yet.

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {

    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {

      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))

        continue;

      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);

      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))

        continue;


      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);

      // Skip recipes that aren't single scalars or don't have only their

      // scalar results used. In the latter case, we would introduce extra

      // broadcasts.

      if (!vputils::isSingleScalar(RepOrWidenR) ||

          !vputils::onlyScalarValuesUsed(RepOrWidenR))

        continue;


      auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),

                                          RepOrWidenR->operands(),

                                          true /*IsSingleScalar*/);

      Clone->insertBefore(RepOrWidenR);

      RepOrWidenR->replaceAllUsesWith(Clone);

    }

  }

}


/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes

/// to make sure the masks are simplified.

static void simplifyBlends(VPlan &Plan) {

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {

    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      auto *Blend = dyn_cast<VPBlendRecipe>(&R);

      if (!Blend)

        continue;


      // Try to remove redundant blend recipes.

      SmallPtrSet<VPValue *, 4> UniqueValues;

      if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))

        UniqueValues.insert(Blend->getIncomingValue(0));

      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)

        if (!match(Blend->getMask(I), m_False()))

          UniqueValues.insert(Blend->getIncomingValue(I));


      if (UniqueValues.size() == 1) {

        Blend->replaceAllUsesWith(*UniqueValues.begin());

        Blend->eraseFromParent();

        continue;

      }


      if (Blend->isNormalized())

        continue;


      // Normalize the blend so its first incoming value is used as the initial

      // value with the others blended into it.


      unsigned StartIndex = 0;

      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {

        // If a value's mask is used only by the blend then is can be deadcoded.

        // TODO: Find the most expensive mask that can be deadcoded, or a mask

        // that's used by multiple blends where it can be removed from them all.

        VPValue *Mask = Blend->getMask(I);

        if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {

          StartIndex = I;

          break;

        }

      }


      SmallVector<VPValue *, 4> OperandsWithMask;

      OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));


      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {

        if (I == StartIndex)

          continue;

        OperandsWithMask.push_back(Blend->getIncomingValue(I));

        OperandsWithMask.push_back(Blend->getMask(I));

      }


      auto *NewBlend =

          new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),

                            OperandsWithMask, Blend->getDebugLoc());

      NewBlend->insertBefore(&R);


      VPValue *DeadMask = Blend->getMask(StartIndex);

      Blend->replaceAllUsesWith(NewBlend);

      Blend->eraseFromParent();

      recursivelyDeleteDeadRecipes(DeadMask);


      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.

      VPValue *NewMask;

      if (NewBlend->getNumOperands() == 3 &&

          match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {

        VPValue *Inc0 = NewBlend->getOperand(0);

        VPValue *Inc1 = NewBlend->getOperand(1);

        VPValue *OldMask = NewBlend->getOperand(2);

        NewBlend->setOperand(0, Inc1);

        NewBlend->setOperand(1, Inc0);

        NewBlend->setOperand(2, NewMask);

        if (OldMask->getNumUsers() == 0)

          cast<VPInstruction>(OldMask)->eraseFromParent();

      }

    }

  }

}


/// Optimize the width of vector induction variables in \p Plan based on a known

/// constant Trip Count, \p BestVF and \p BestUF.

static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,

                                                     ElementCount BestVF,

                                                     unsigned BestUF) {

  // Only proceed if we have not completely removed the vector region.

  if (!Plan.getVectorLoopRegion())

    return false;


  if (!Plan.getTripCount()->isLiveIn())

    return false;

  auto *TC = dyn_cast_if_present<ConstantInt>(

      Plan.getTripCount()->getUnderlyingValue());

  if (!TC || !BestVF.isFixed())

    return false;


  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF

  // and UF. Returns at least 8.

  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {

    APInt AlignedTC =

        Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),

                                       APInt::Rounding::UP);

    APInt MaxVal = AlignedTC - 1;

    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);

  };

  unsigned NewBitWidth =

      ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);


  LLVMContext &Ctx = Plan.getContext();

  auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);


  bool MadeChange = false;


  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();

  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {

    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);


    // Currently only handle canonical IVs as it is trivial to replace the start

    // and stop values, and we currently only perform the optimization when the

    // IV has a single use.

    if (!WideIV || !WideIV->isCanonical() ||

        WideIV->hasMoreThanOneUniqueUser() ||

        NewIVTy == WideIV->getScalarType())

      continue;


    // Currently only handle cases where the single user is a header-mask

    // comparison with the backedge-taken-count.

    if (!match(*WideIV->user_begin(),

               m_ICmp(m_Specific(WideIV),

                      m_Broadcast(

                          m_Specific(Plan.getOrCreateBackedgeTakenCount())))))

      continue;


    // Update IV operands and comparison bound to use new narrower type.

    auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));

    WideIV->setStartValue(NewStart);

    auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));

    WideIV->setStepValue(NewStep);


    auto *NewBTC = new VPWidenCastRecipe(

        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);

    Plan.getVectorPreheader()->appendRecipe(NewBTC);

    auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());

    Cmp->setOperand(1, NewBTC);


    MadeChange = true;

  }


  return MadeChange;

}


/// Return true if \p Cond is known to be true for given \p BestVF and \p

/// BestUF.

static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,

                                      ElementCount BestVF, unsigned BestUF,

                                      ScalarEvolution &SE) {

  if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))

    return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,

                                                          &SE](VPValue *C) {

      return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);

    });


  auto *CanIV = Plan.getCanonicalIV();

  if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,

                                  m_Specific(CanIV->getBackedgeValue()),

                                  m_Specific(&Plan.getVectorTripCount()))))

    return false;


  // The compare checks CanIV + VFxUF == vector trip count. The vector trip

  // count is not conveniently available as SCEV so far, so we compare directly

  // against the original trip count. This is stricter than necessary, as we

  // will only return true if the trip count == vector trip count.

  const SCEV *VectorTripCount =

      vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);

  if (isa<SCEVCouldNotCompute>(VectorTripCount))

    VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);

  assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&

         "Trip count SCEV must be computable");

  ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);

  const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);

  return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);

}


/// Try to simplify the branch condition of \p Plan. This may restrict the

/// resulting plan to \p BestVF and \p BestUF.

static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,

                                              unsigned BestUF,

                                              PredicatedScalarEvolution &PSE) {

  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();

  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();

  auto *Term = &ExitingVPBB->back();

  VPValue *Cond;

  ScalarEvolution &SE = *PSE.getSE();

  if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||

      match(Term, m_BranchOnCond(

                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {

    // Try to simplify the branch condition if TC <= VF * UF when the latch

    // terminator is   BranchOnCount or BranchOnCond where the input is

    // Not(ActiveLaneMask).

    const SCEV *TripCount =

        vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);

    assert(!isa<SCEVCouldNotCompute>(TripCount) &&

           "Trip count SCEV must be computable");

    ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);

    const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);

    if (TripCount->isZero() ||

        !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))

      return false;

  } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {

    // For BranchOnCond, check if we can prove the condition to be true using VF

    // and UF.

    if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))

      return false;

  } else {

    return false;

  }


  // The vector loop region only executes once. If possible, completely remove

  // the region, otherwise replace the terminator controlling the latch with

  // (BranchOnCond true).

  // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add

  // support for other non-canonical widen induction recipes (e.g.,

  // VPWidenPointerInductionRecipe).

  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());

  if (all_of(Header->phis(), [](VPRecipeBase &Phi) {

        if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))

          return R->isCanonical();

        return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,

                   VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);

      })) {

    for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {

      if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {

        VPBuilder Builder(Plan.getVectorPreheader());

        VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},

                                              R->getScalarType());

        HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);

        HeaderR.eraseFromParent();

        continue;

      }

      auto *Phi = cast<VPPhiAccessors>(&HeaderR);

      HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));

      HeaderR.eraseFromParent();

    }


    VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();

    VPBlockBase *Exit = VectorRegion->getSingleSuccessor();

    VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);

    VPBlockUtils::disconnectBlocks(VectorRegion, Exit);


    for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))

      B->setParent(nullptr);


    VPBlockUtils::connectBlocks(Preheader, Header);

    VPBlockUtils::connectBlocks(ExitingVPBB, Exit);

    VPlanTransforms::simplifyRecipes(Plan);

  } else {

    // The vector region contains header phis for which we cannot remove the

    // loop region yet.

    auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},

                                  Term->getDebugLoc());

    ExitingVPBB->appendRecipe(BOC);

  }


  Term->eraseFromParent();


  return true;

}


void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,

                                         unsigned BestUF,

                                         PredicatedScalarEvolution &PSE) {

  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");

  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");


  bool MadeChange =

      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);

  MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);


  if (MadeChange) {

    Plan.setVF(BestVF);

    assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");

  }

  // TODO: Further simplifications are possible

  //      1. Replace inductions with constants.

  //      2. Replace vector loop region with VPBasicBlock.

}


/// Sink users of \p FOR after the recipe defining the previous value \p

/// Previous of the recurrence. \returns true if all users of \p FOR could be

/// re-arranged as needed or false if it is not possible.

static bool

sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,

                                 VPRecipeBase *Previous,

                                 VPDominatorTree &VPDT) {

  // Collect recipes that need sinking.

  SmallVector<VPRecipeBase *> WorkList;

  SmallPtrSet<VPRecipeBase *, 8> Seen;

  Seen.insert(Previous);

  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {

    // The previous value must not depend on the users of the recurrence phi. In

    // that case, FOR is not a fixed order recurrence.

    if (SinkCandidate == Previous)

      return false;


    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||

        !Seen.insert(SinkCandidate).second ||

        VPDT.properlyDominates(Previous, SinkCandidate))

      return true;


    if (SinkCandidate->mayHaveSideEffects())

      return false;


    WorkList.push_back(SinkCandidate);

    return true;

  };


  // Recursively sink users of FOR after Previous.

  WorkList.push_back(FOR);

  for (unsigned I = 0; I != WorkList.size(); ++I) {

    VPRecipeBase *Current = WorkList[I];

    assert(Current->getNumDefinedValues() == 1 &&

           "only recipes with a single defined value expected");


    for (VPUser *User : Current->getVPSingleValue()->users()) {

      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))

        return false;

    }

  }


  // Keep recipes to sink ordered by dominance so earlier instructions are

  // processed first.

  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {

    return VPDT.properlyDominates(A, B);

  });


  for (VPRecipeBase *SinkCandidate : WorkList) {

    if (SinkCandidate == FOR)

      continue;


    SinkCandidate->moveAfter(Previous);

    Previous = SinkCandidate;

  }

  return true;

}


/// Try to hoist \p Previous and its operands before all users of \p FOR.

static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,

                                        VPRecipeBase *Previous,

                                        VPDominatorTree &VPDT) {

  if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())

    return false;


  // Collect recipes that need hoisting.

  SmallVector<VPRecipeBase *> HoistCandidates;

  SmallPtrSet<VPRecipeBase *, 8> Visited;

  VPRecipeBase *HoistPoint = nullptr;

  // Find the closest hoist point by looking at all users of FOR and selecting

  // the recipe dominating all other users.

  for (VPUser *U : FOR->users()) {

    auto *R = cast<VPRecipeBase>(U);

    if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))

      HoistPoint = R;

  }

  assert(all_of(FOR->users(),

                [&VPDT, HoistPoint](VPUser *U) {

                  auto *R = cast<VPRecipeBase>(U);

                  return HoistPoint == R ||

                         VPDT.properlyDominates(HoistPoint, R);

                }) &&

         "HoistPoint must dominate all users of FOR");


  auto NeedsHoisting = [HoistPoint, &VPDT,

                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {

    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();

    if (!HoistCandidate)

      return nullptr;

    VPRegionBlock *EnclosingLoopRegion =

        HoistCandidate->getParent()->getEnclosingLoopRegion();

    assert((!HoistCandidate->getParent()->getParent() ||

            HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&

           "CFG in VPlan should still be flat, without replicate regions");

    // Hoist candidate was already visited, no need to hoist.

    if (!Visited.insert(HoistCandidate).second)

      return nullptr;


    // Candidate is outside loop region or a header phi, dominates FOR users w/o

    // hoisting.

    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))

      return nullptr;


    // If we reached a recipe that dominates HoistPoint, we don't need to

    // hoist the recipe.

    if (VPDT.properlyDominates(HoistCandidate, HoistPoint))

      return nullptr;

    return HoistCandidate;

  };

  auto CanHoist = [&](VPRecipeBase *HoistCandidate) {

    // Avoid hoisting candidates with side-effects, as we do not yet analyze

    // associated dependencies.

    return !HoistCandidate->mayHaveSideEffects();

  };


  if (!NeedsHoisting(Previous->getVPSingleValue()))

    return true;


  // Recursively try to hoist Previous and its operands before all users of FOR.

  HoistCandidates.push_back(Previous);


  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {

    VPRecipeBase *Current = HoistCandidates[I];

    assert(Current->getNumDefinedValues() == 1 &&

           "only recipes with a single defined value expected");

    if (!CanHoist(Current))

      return false;


    for (VPValue *Op : Current->operands()) {

      // If we reach FOR, it means the original Previous depends on some other

      // recurrence that in turn depends on FOR. If that is the case, we would

      // also need to hoist recipes involving the other FOR, which may break

      // dependencies.

      if (Op == FOR)

        return false;


      if (auto *R = NeedsHoisting(Op))

        HoistCandidates.push_back(R);

    }

  }


  // Order recipes to hoist by dominance so earlier instructions are processed

  // first.

  sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {

    return VPDT.properlyDominates(A, B);

  });


  for (VPRecipeBase *HoistCandidate : HoistCandidates) {

    HoistCandidate->moveBefore(*HoistPoint->getParent(),

                               HoistPoint->getIterator());

  }


  return true;

}


bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,

                                                  VPBuilder &LoopBuilder) {

  VPDominatorTree VPDT;

  VPDT.recalculate(Plan);


  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;

  for (VPRecipeBase &R :

       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())

    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))

      RecurrencePhis.push_back(FOR);


  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {

    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;

    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();

    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed

    // to terminate.

    while (auto *PrevPhi =

               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {

      assert(PrevPhi->getParent() == FOR->getParent());

      assert(SeenPhis.insert(PrevPhi).second);

      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();

    }


    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&

        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))

      return false;


    // Introduce a recipe to combine the incoming and previous values of a

    // fixed-order recurrence.

    VPBasicBlock *InsertBlock = Previous->getParent();

    if (isa<VPHeaderPHIRecipe>(Previous))

      LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());

    else

      LoopBuilder.setInsertPoint(InsertBlock,

                                 std::next(Previous->getIterator()));


    auto *RecurSplice =

        LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,

                                 {FOR, FOR->getBackedgeValue()});


    FOR->replaceAllUsesWith(RecurSplice);

    // Set the first operand of RecurSplice to FOR again, after replacing

    // all users.

    RecurSplice->setOperand(0, FOR);

  }

  return true;

}


void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {

  for (VPRecipeBase &R :

       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {

    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);

    if (!PhiR)

      continue;

    RecurKind RK = PhiR->getRecurrenceKind();

    if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&

        RK != RecurKind::AddChainWithSubs)

      continue;


    for (VPUser *U : collectUsersRecursively(PhiR))

      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {

        RecWithFlags->dropPoisonGeneratingFlags();

      }

  }

}


/// Move loop-invariant recipes out of the vector loop region in \p Plan.

static void licm(VPlan &Plan) {

  VPBasicBlock *Preheader = Plan.getVectorPreheader();


  // Return true if we do not know how to (mechanically) hoist a given recipe

  // out of a loop region. Does not address legality concerns such as aliasing

  // or speculation safety.

  auto CannotHoistRecipe = [](VPRecipeBase &R) {

    // Allocas cannot be hoisted.

    auto *RepR = dyn_cast<VPReplicateRecipe>(&R);

    return RepR && RepR->getOpcode() == Instruction::Alloca;

  };


  // Hoist any loop invariant recipes from the vector loop region to the

  // preheader. Preform a shallow traversal of the vector loop region, to

  // exclude recipes in replicate regions.

  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_shallow(LoopRegion->getEntry()))) {

    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      if (CannotHoistRecipe(R))

        continue;

      // TODO: Relax checks in the future, e.g. we could also hoist reads, if

      // their memory location is not modified in the vector loop.

      if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||

          any_of(R.operands(), [](VPValue *Op) {

            return !Op->isDefinedOutsideLoopRegions();

          }))

        continue;

      R.moveBefore(*Preheader, Preheader->end());

    }

  }

}


void VPlanTransforms::truncateToMinimalBitwidths(

    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {

  // Keep track of created truncates, so they can be re-used. Note that we

  // cannot use RAUW after creating a new truncate, as this would could make

  // other uses have different types for their operands, making them invalidly

  // typed.

  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;

  VPTypeAnalysis TypeInfo(Plan);

  VPBasicBlock *PH = Plan.getVectorPreheader();

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {

    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,

               VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(

              &R))

        continue;


      VPValue *ResultVPV = R.getVPSingleValue();

      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());

      unsigned NewResSizeInBits = MinBWs.lookup(UI);

      if (!NewResSizeInBits)

        continue;


      // If the value wasn't vectorized, we must maintain the original scalar

      // type. Skip those here, after incrementing NumProcessedRecipes. Also

      // skip casts which do not need to be handled explicitly here, as

      // redundant casts will be removed during recipe simplification.

      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R))

        continue;


      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);

      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();

      assert(OldResTy->isIntegerTy() && "only integer types supported");

      (void)OldResSizeInBits;


      auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);


      // Any wrapping introduced by shrinking this operation shouldn't be

      // considered undefined behavior. So, we can't unconditionally copy

      // arithmetic wrapping flags to VPW.

      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))

        VPW->dropPoisonGeneratingFlags();


      if (OldResSizeInBits != NewResSizeInBits &&

          !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {

        // Extend result to original width.

        auto *Ext =

            new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);

        Ext->insertAfter(&R);

        ResultVPV->replaceAllUsesWith(Ext);

        Ext->setOperand(0, ResultVPV);

        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");

      } else {

        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&

               "Only ICmps should not need extending the result.");

      }


      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");

      if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(&R))

        continue;


      // Shrink operands by introducing truncates as needed.

      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;

      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {

        auto *Op = R.getOperand(Idx);

        unsigned OpSizeInBits =

            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();

        if (OpSizeInBits == NewResSizeInBits)

          continue;

        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");

        auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);

        VPWidenCastRecipe *NewOp =

            IterIsEmpty

                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)

                : ProcessedIter->second;

        R.setOperand(Idx, NewOp);

        if (!IterIsEmpty)

          continue;

        ProcessedIter->second = NewOp;

        if (!Op->isLiveIn()) {

          NewOp->insertBefore(&R);

        } else {

          PH->appendRecipe(NewOp);

        }

      }


    }

  }

}


void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_shallow(Plan.getEntry()))) {

    VPValue *Cond;

    if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||

        !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))

      continue;


    unsigned RemovedIdx;

    if (match(Cond, m_True()))

      RemovedIdx = 1;

    else if (match(Cond, m_False()))

      RemovedIdx = 0;

    else

      continue;


    VPBasicBlock *RemovedSucc =

        cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);

    assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&

           "There must be a single edge between VPBB and its successor");

    // Values coming from VPBB into phi recipes of RemoveSucc are removed from

    // these recipes.

    for (VPRecipeBase &R : RemovedSucc->phis())

      cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);


    // Disconnect blocks and remove the terminator. RemovedSucc will be deleted

    // automatically on VPlan destruction if it becomes unreachable.

    VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);

    VPBB->back().eraseFromParent();

  }

}


void VPlanTransforms::optimize(VPlan &Plan) {

  runPass(removeRedundantCanonicalIVs, Plan);

  runPass(removeRedundantInductionCasts, Plan);


  runPass(simplifyRecipes, Plan);

  runPass(simplifyBlends, Plan);

  runPass(removeDeadRecipes, Plan);

  runPass(narrowToSingleScalarRecipes, Plan);

  runPass(legalizeAndOptimizeInductions, Plan);

  runPass(removeRedundantExpandSCEVRecipes, Plan);

  runPass(simplifyRecipes, Plan);

  runPass(removeBranchOnConst, Plan);

  runPass(removeDeadRecipes, Plan);


  runPass(createAndOptimizeReplicateRegions, Plan);

  runPass(mergeBlocksIntoPredecessors, Plan);

  runPass(licm, Plan);

}


// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace

// the loop terminator with a branch-on-cond recipe with the negated

// active-lane-mask as operand. Note that this turns the loop into an

// uncountable one. Only the existing terminator is replaced, all other existing

// recipes/users remain unchanged, except for poison-generating flags being

// dropped from the canonical IV increment. Return the created

// VPActiveLaneMaskPHIRecipe.

//

// The function uses the following definitions:

//

//  %TripCount = DataWithControlFlowWithoutRuntimeCheck ?

//    calculate-trip-count-minus-VF (original TC) : original TC

//  %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?

//     CanonicalIVPhi : CanonicalIVIncrement

//  %StartV is the canonical induction start value.

//

// The function adds the following recipes:

//

// vector.ph:

//   %TripCount = calculate-trip-count-minus-VF (original TC)

//       [if DataWithControlFlowWithoutRuntimeCheck]

//   %EntryInc = canonical-iv-increment-for-part %StartV

//   %EntryALM = active-lane-mask %EntryInc, %TripCount

//

// vector.body:

//   ...

//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]

//   ...

//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue

//   %ALM = active-lane-mask %InLoopInc, TripCount

//   %Negated = Not %ALM

//   branch-on-cond %Negated

//

static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(

    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {

  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();

  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();

  auto *CanonicalIVPHI = Plan.getCanonicalIV();

  VPValue *StartV = CanonicalIVPHI->getStartValue();


  auto *CanonicalIVIncrement =

      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());

  // TODO: Check if dropping the flags is needed if

  // !DataAndControlFlowWithoutRuntimeCheck.

  CanonicalIVIncrement->dropPoisonGeneratingFlags();

  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();

  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since

  // we have to take unrolling into account. Each part needs to start at

  //   Part * VF

  auto *VecPreheader = Plan.getVectorPreheader();

  VPBuilder Builder(VecPreheader);


  // Create the ActiveLaneMask instruction using the correct start values.

  VPValue *TC = Plan.getTripCount();


  VPValue *TripCount, *IncrementValue;

  if (!DataAndControlFlowWithoutRuntimeCheck) {

    // When the loop is guarded by a runtime overflow check for the loop

    // induction variable increment by VF, we can increment the value before

    // the get.active.lane mask and use the unmodified tripcount.

    IncrementValue = CanonicalIVIncrement;

    TripCount = TC;

  } else {

    // When avoiding a runtime check, the active.lane.mask inside the loop

    // uses a modified trip count and the induction variable increment is

    // done after the active.lane.mask intrinsic is called.

    IncrementValue = CanonicalIVPHI;

    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,

                                     {TC}, DL);

  }

  auto *EntryIncrement = Builder.createOverflowingOp(

      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,

      "index.part.next");


  // Create the active lane mask instruction in the VPlan preheader.

  auto *EntryALM =

      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},

                           DL, "active.lane.mask.entry");


  // Now create the ActiveLaneMaskPhi recipe in the main loop using the

  // preheader ActiveLaneMask instruction.

  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());

  LaneMaskPhi->insertAfter(CanonicalIVPHI);


  // Create the active lane mask for the next iteration of the loop before the

  // original terminator.

  VPRecipeBase *OriginalTerminator = EB->getTerminator();

  Builder.setInsertPoint(OriginalTerminator);

  auto *InLoopIncrement =

      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,

                                  {IncrementValue}, {false, false}, DL);

  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,

                                   {InLoopIncrement, TripCount}, DL,

                                   "active.lane.mask.next");

  LaneMaskPhi->addOperand(ALM);


  // Replace the original terminator with BranchOnCond. We have to invert the

  // mask here because a true condition means jumping to the exit block.

  auto *NotMask = Builder.createNot(ALM, DL);

  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);

  OriginalTerminator->eraseFromParent();

  return LaneMaskPhi;

}


/// Collect the header mask with the pattern:

///   (ICMP_ULE, WideCanonicalIV, backedge-taken-count)

/// TODO: Introduce explicit recipe for header-mask instead of searching

/// for the header-mask pattern manually.

static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {

  SmallVector<VPValue *> WideCanonicalIVs;

  auto *FoundWidenCanonicalIVUser =

      find_if(Plan.getCanonicalIV()->users(),

              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });

  assert(count_if(Plan.getCanonicalIV()->users(),

                  [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=

             1 &&

         "Must have at most one VPWideCanonicalIVRecipe");

  if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {

    auto *WideCanonicalIV =

        cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);

    WideCanonicalIVs.push_back(WideCanonicalIV);

  }


  // Also include VPWidenIntOrFpInductionRecipes that represent a widened

  // version of the canonical induction.

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();

  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {

    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (WidenOriginalIV && WidenOriginalIV->isCanonical())

      WideCanonicalIVs.push_back(WidenOriginalIV);

  }


  // Walk users of wide canonical IVs and find the single compare of the form

  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).

  VPSingleDefRecipe *HeaderMask = nullptr;

  for (auto *Wide : WideCanonicalIVs) {

    for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {

      auto *VPI = dyn_cast<VPInstruction>(U);

      if (!VPI || !vputils::isHeaderMask(VPI, Plan))

        continue;


      assert(VPI->getOperand(0) == Wide &&

             "WidenCanonicalIV must be the first operand of the compare");

      assert(!HeaderMask && "Multiple header masks found?");

      HeaderMask = VPI;

    }

  }

  return HeaderMask;

}


void VPlanTransforms::addActiveLaneMask(

    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,

    bool DataAndControlFlowWithoutRuntimeCheck) {

  assert((!DataAndControlFlowWithoutRuntimeCheck ||

          UseActiveLaneMaskForControlFlow) &&

         "DataAndControlFlowWithoutRuntimeCheck implies "

         "UseActiveLaneMaskForControlFlow");


  auto *FoundWidenCanonicalIVUser =

      find_if(Plan.getCanonicalIV()->users(),

              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });

  assert(FoundWidenCanonicalIVUser &&

         "Must have widened canonical IV when tail folding!");

  VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);

  auto *WideCanonicalIV =

      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);

  VPSingleDefRecipe *LaneMask;

  if (UseActiveLaneMaskForControlFlow) {

    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(

        Plan, DataAndControlFlowWithoutRuntimeCheck);

  } else {

    VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);

    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,

                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,

                              "active.lane.mask");

  }


  // Walk users of WideCanonicalIV and replace the header mask of the form

  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,

  // removing the old one to ensure there is always only a single header mask.

  HeaderMask->replaceAllUsesWith(LaneMask);

  HeaderMask->eraseFromParent();

}


/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding

/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based

/// recipe could be created.

/// \p HeaderMask  Header Mask.

/// \p CurRecipe   Recipe to be transform.

/// \p TypeInfo    VPlan-based type analysis.

/// \p AllOneMask  The vector mask parameter of vector-predication intrinsics.

/// \p EVL         The explicit vector length parameter of vector-predication

/// intrinsics.

static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,

                                       VPRecipeBase &CurRecipe,

                                       VPTypeAnalysis &TypeInfo,

                                       VPValue &AllOneMask, VPValue &EVL) {

  // FIXME: Don't transform recipes to EVL recipes if they're not masked by the

  // header mask.

  auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {

    assert(OrigMask && "Unmasked recipe when folding tail");

    // HeaderMask will be handled using EVL.

    VPValue *Mask;

    if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))

      return Mask;

    return HeaderMask == OrigMask ? nullptr : OrigMask;

  };


  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.

  auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {

    auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);

    if (!EndPtr)

      return Addr;

    assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&

           "VPVectorEndPointerRecipe with non-VF VF operand?");

    assert(

        all_of(EndPtr->users(),

               [](VPUser *U) {

                 return cast<VPWidenMemoryRecipe>(U)->isReverse();

               }) &&

        "VPVectorEndPointRecipe not used by reversed widened memory recipe?");

    VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();

    EVLAddr->insertBefore(&CurRecipe);

    EVLAddr->setOperand(1, &EVL);

    return EVLAddr;

  };


  return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)

      .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {

        VPValue *NewMask = GetNewMask(L->getMask());

        VPValue *NewAddr = GetNewAddr(L->getAddr());

        return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);

      })

      .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {

        VPValue *NewMask = GetNewMask(S->getMask());

        VPValue *NewAddr = GetNewAddr(S->getAddr());

        return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);

      })

      .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {

        VPValue *NewMask = GetNewMask(Red->getCondOp());

        return new VPReductionEVLRecipe(*Red, EVL, NewMask);

      })

      .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {

        VPValue *LHS, *RHS;

        // Transform select with a header mask condition

        //   select(header_mask, LHS, RHS)

        // into vector predication merge.

        //   vp.merge(all-true, LHS, RHS, EVL)

        if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),

                                 m_VPValue(RHS))))

          return nullptr;

        // Use all true as the condition because this transformation is

        // limited to selects whose condition is a header mask.

        return new VPWidenIntrinsicRecipe(

            Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},

            TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());

      })

      .Default([&](VPRecipeBase *R) { return nullptr; });

}


/// Replace recipes with their EVL variants.

static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {

  VPTypeAnalysis TypeInfo(Plan);

  VPValue *AllOneMask = Plan.getTrue();

  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();

  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();


  assert(all_of(Plan.getVF().users(),

                IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,

                        VPWidenIntOrFpInductionRecipe>) &&

         "User of VF that we can't transform to EVL.");

  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {

    return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);

  });


  assert(all_of(Plan.getVFxUF().users(),

                [&Plan](VPUser *U) {

                  return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()),

                                          m_Specific(&Plan.getVFxUF()))) ||

                         isa<VPWidenPointerInductionRecipe>(U);

                }) &&

         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "

         "increment of the canonical induction.");

  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {

    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the

    // canonical induction must not be updated.

    return isa<VPWidenPointerInductionRecipe>(U);

  });


  // Defer erasing recipes till the end so that we don't invalidate the

  // VPTypeAnalysis cache.

  SmallVector<VPRecipeBase *> ToErase;


  // Create a scalar phi to track the previous EVL if fixed-order recurrence is

  // contained.

  bool ContainsFORs =

      any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);

  if (ContainsFORs) {

    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.

    VPValue *MaxEVL = &Plan.getVF();

    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.

    VPBuilder Builder(LoopRegion->getPreheaderVPBB());

    MaxEVL = Builder.createScalarZExtOrTrunc(

        MaxEVL, Type::getInt32Ty(Plan.getContext()),

        TypeInfo.inferScalarType(MaxEVL), DebugLoc());


    Builder.setInsertPoint(Header, Header->getFirstNonPhi());

    VPValue *PrevEVL =

        Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");


    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

             vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {

      for (VPRecipeBase &R : *VPBB) {

        VPValue *V1, *V2;

        if (!match(&R,

                   m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(

                       m_VPValue(V1), m_VPValue(V2))))

          continue;

        VPValue *Imm = Plan.getOrAddLiveIn(

            ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));

        VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(

            Intrinsic::experimental_vp_splice,

            {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},

            TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());

        VPSplice->insertBefore(&R);

        R.getVPSingleValue()->replaceAllUsesWith(VPSplice);

        ToErase.push_back(&R);

      }

    }

  }


  VPValue *HeaderMask = findHeaderMask(Plan);

  if (!HeaderMask)

    return;


  // Replace header masks with a mask equivalent to predicating by EVL:

  //

  // icmp ule widen-canonical-iv backedge-taken-count

  // ->

  // icmp ult step-vector, EVL

  VPRecipeBase *EVLR = EVL.getDefiningRecipe();

  VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));

  Type *EVLType = TypeInfo.inferScalarType(&EVL);

  VPValue *EVLMask = Builder.createICmp(

      CmpInst::ICMP_ULT,

      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);

  HeaderMask->replaceAllUsesWith(EVLMask);

  ToErase.push_back(HeaderMask->getDefiningRecipe());


  // Try to optimize header mask recipes away to their EVL variants.

  // TODO: Split optimizeMaskToEVL out and move into

  // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in

  // tryToBuildVPlanWithVPRecipes beforehand.

  for (VPUser *U : collectUsersRecursively(EVLMask)) {

    auto *CurRecipe = cast<VPRecipeBase>(U);

    VPRecipeBase *EVLRecipe =

        optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);

    if (!EVLRecipe)

      continue;


    [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();

    assert(NumDefVal == CurRecipe->getNumDefinedValues() &&

           "New recipe must define the same number of values as the "

           "original.");

    assert(NumDefVal <= 1 &&

           "Only supports recipes with a single definition or without users.");

    EVLRecipe->insertBefore(CurRecipe);

    if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {

      VPValue *CurVPV = CurRecipe->getVPSingleValue();

      CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());

    }

    ToErase.push_back(CurRecipe);

  }

  // Remove dead EVL mask.

  if (EVLMask->getNumUsers() == 0)

    ToErase.push_back(EVLMask->getDefiningRecipe());


  for (VPRecipeBase *R : reverse(ToErase)) {

    SmallVector<VPValue *> PossiblyDead(R->operands());

    R->eraseFromParent();

    for (VPValue *Op : PossiblyDead)

      recursivelyDeleteDeadRecipes(Op);

  }

}


/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and

/// replaces all uses except the canonical IV increment of

/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe

/// is used only for loop iterations counting after this transformation.

///

/// The function uses the following definitions:

///  %StartV is the canonical induction start value.

///

/// The function adds the following recipes:

///

/// vector.ph:

/// ...

///

/// vector.body:

/// ...

/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],

///                                               [ %NextEVLIV, %vector.body ]

/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]

/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL

/// ...

/// %OpEVL = cast i32 %VPEVL to IVSize

/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi

/// %NextAVL = sub IVSize nuw %AVL, %OpEVL

/// ...

///

/// If MaxSafeElements is provided, the function adds the following recipes:

/// vector.ph:

/// ...

///

/// vector.body:

/// ...

/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],

///                                               [ %NextEVLIV, %vector.body ]

/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]

/// %cmp = cmp ult %AVL, MaxSafeElements

/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements

/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL

/// ...

/// %OpEVL = cast i32 %VPEVL to IVSize

/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi

/// %NextAVL = sub IVSize nuw %AVL, %OpEVL

/// ...

///

void VPlanTransforms::addExplicitVectorLength(

    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {

  VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();


  auto *CanonicalIVPHI = Plan.getCanonicalIV();

  auto *CanIVTy = CanonicalIVPHI->getScalarType();

  VPValue *StartV = CanonicalIVPHI->getStartValue();


  // Create the ExplicitVectorLengthPhi recipe in the main loop.

  auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());

  EVLPhi->insertAfter(CanonicalIVPHI);

  VPBuilder Builder(Header, Header->getFirstNonPhi());

  // Create the AVL (application vector length), starting from TC -> 0 in steps

  // of EVL.

  VPPhi *AVLPhi = Builder.createScalarPhi(

      {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");

  VPValue *AVL = AVLPhi;


  if (MaxSafeElements) {

    // Support for MaxSafeDist for correct loop emission.

    VPValue *AVLSafe =

        Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));

    VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);

    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");

  }

  auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,

                                     DebugLoc());


  auto *CanonicalIVIncrement =

      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());

  Builder.setInsertPoint(CanonicalIVIncrement);

  VPValue *OpVPEVL = VPEVL;


  auto *I32Ty = Type::getInt32Ty(Plan.getContext());

  OpVPEVL = Builder.createScalarZExtOrTrunc(

      OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());


  auto *NextEVLIV = Builder.createOverflowingOp(

      Instruction::Add, {OpVPEVL, EVLPhi},

      {CanonicalIVIncrement->hasNoUnsignedWrap(),

       CanonicalIVIncrement->hasNoSignedWrap()},

      CanonicalIVIncrement->getDebugLoc(), "index.evl.next");

  EVLPhi->addOperand(NextEVLIV);


  VPValue *NextAVL = Builder.createOverflowingOp(

      Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},

      DebugLoc::getCompilerGenerated(), "avl.next");

  AVLPhi->addOperand(NextAVL);


  transformRecipestoEVLRecipes(Plan, *VPEVL);


  // Replace all uses of VPCanonicalIVPHIRecipe by

  // VPEVLBasedIVPHIRecipe except for the canonical IV increment.

  CanonicalIVPHI->replaceAllUsesWith(EVLPhi);

  CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);

  // TODO: support unroll factor > 1.

  Plan.setUF(1);

}


void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {

  // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.

  // There should be only one EVL PHI in the entire plan.

  VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;


  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_shallow(Plan.getEntry())))

    for (VPRecipeBase &R : VPBB->phis())

      if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {

        assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");

        EVLPhi = PhiR;

      }


  // Early return if no EVL PHI is found.

  if (!EVLPhi)

    return;


  VPBasicBlock *HeaderVPBB = EVLPhi->getParent();

  VPValue *EVLIncrement = EVLPhi->getBackedgeValue();

  VPValue *AVL;

  [[maybe_unused]] bool FoundAVL =

      match(EVLIncrement,

            m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));

  assert(FoundAVL && "Didn't find AVL?");


  // The AVL may be capped to a safe distance.

  VPValue *SafeAVL;

  if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))

    AVL = SafeAVL;


  VPValue *AVLNext;

  [[maybe_unused]] bool FoundAVLNext =

      match(AVL, m_VPInstruction<Instruction::PHI>(

                     m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));

  assert(FoundAVLNext && "Didn't find AVL backedge?");


  // Convert EVLPhi to concrete recipe.

  auto *ScalarR =

      VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},

                                        EVLPhi->getDebugLoc(), "evl.based.iv");

  EVLPhi->replaceAllUsesWith(ScalarR);

  EVLPhi->eraseFromParent();


  // Replace CanonicalIVInc with EVL-PHI increment.

  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());

  VPValue *Backedge = CanonicalIV->getIncomingValue(1);

  assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),

                                 m_Specific(&Plan.getVFxUF()))) &&

         "Unexpected canonical iv");

  Backedge->replaceAllUsesWith(EVLIncrement);


  // Remove unused phi and increment.

  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();

  CanonicalIVIncrement->eraseFromParent();

  CanonicalIV->eraseFromParent();


  // Replace the use of VectorTripCount in the latch-exiting block.

  // Before: (branch-on-count EVLIVInc, VectorTripCount)

  // After: (branch-on-cond eq AVLNext, 0)


  VPBasicBlock *LatchExiting =

      HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();

  auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());

  // Skip single-iteration loop region

  if (match(LatchExitingBr, m_BranchOnCond(m_True())))

    return;

  assert(LatchExitingBr &&

         match(LatchExitingBr,

               m_BranchOnCount(m_VPValue(EVLIncrement),

                               m_Specific(&Plan.getVectorTripCount()))) &&

         "Unexpected terminator in EVL loop");


  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);

  VPBuilder Builder(LatchExitingBr);

  VPValue *Cmp =

      Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,

                         Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));

  Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);

  LatchExitingBr->eraseFromParent();

}


void VPlanTransforms::dropPoisonGeneratingRecipes(

    VPlan &Plan,

    const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {

  // Collect recipes in the backward slice of `Root` that may generate a poison

  // value that is used after vectorization.

  SmallPtrSet<VPRecipeBase *, 16> Visited;

  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {

    SmallVector<VPRecipeBase *, 16> Worklist;

    Worklist.push_back(Root);


    // Traverse the backward slice of Root through its use-def chain.

    while (!Worklist.empty()) {

      VPRecipeBase *CurRec = Worklist.pop_back_val();


      if (!Visited.insert(CurRec).second)

        continue;


      // Prune search if we find another recipe generating a widen memory

      // instruction. Widen memory instructions involved in address computation

      // will lead to gather/scatter instructions, which don't need to be

      // handled.

      if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,

              VPHeaderPHIRecipe>(CurRec))

        continue;


      // This recipe contributes to the address computation of a widen

      // load/store. If the underlying instruction has poison-generating flags,

      // drop them directly.

      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {

        VPValue *A, *B;

        // Dropping disjoint from an OR may yield incorrect results, as some

        // analysis may have converted it to an Add implicitly (e.g. SCEV used

        // for dependence analysis). Instead, replace it with an equivalent Add.

        // This is possible as all users of the disjoint OR only access lanes

        // where the operands are disjoint or poison otherwise.

        if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&

            RecWithFlags->isDisjoint()) {

          VPBuilder Builder(RecWithFlags);

          VPInstruction *New = Builder.createOverflowingOp(

              Instruction::Add, {A, B}, {false, false},

              RecWithFlags->getDebugLoc());

          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());

          RecWithFlags->replaceAllUsesWith(New);

          RecWithFlags->eraseFromParent();

          CurRec = New;

        } else

          RecWithFlags->dropPoisonGeneratingFlags();

      } else {

        Instruction *Instr = dyn_cast_or_null<Instruction>(

            CurRec->getVPSingleValue()->getUnderlyingValue());

        (void)Instr;

        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&

               "found instruction with poison generating flags not covered by "

               "VPRecipeWithIRFlags");

      }


      // Add new definitions to the worklist.

      for (VPValue *Operand : CurRec->operands())

        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())

          Worklist.push_back(OpDef);

    }

  });


  // Traverse all the recipes in the VPlan and collect the poison-generating

  // recipes in the backward slice starting at the address of a VPWidenRecipe or

  // VPInterleaveRecipe.

  auto Iter = vp_depth_first_deep(Plan.getEntry());

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {

    for (VPRecipeBase &Recipe : *VPBB) {

      if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {

        Instruction &UnderlyingInstr = WidenRec->getIngredient();

        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();

        if (AddrDef && WidenRec->isConsecutive() &&

            BlockNeedsPredication(UnderlyingInstr.getParent()))

          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);

      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {

        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();

        if (AddrDef) {

          // Check if any member of the interleave group needs predication.

          const InterleaveGroup<Instruction> *InterGroup =

              InterleaveRec->getInterleaveGroup();

          bool NeedPredication = false;

          for (int I = 0, NumMembers = InterGroup->getNumMembers();

               I < NumMembers; ++I) {

            Instruction *Member = InterGroup->getMember(I);

            if (Member)

              NeedPredication |= BlockNeedsPredication(Member->getParent());

          }


          if (NeedPredication)

            CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);

        }

      }

    }

  }

}


void VPlanTransforms::createInterleaveGroups(

    VPlan &Plan,

    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>

        &InterleaveGroups,

    VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {

  if (InterleaveGroups.empty())

    return;


  // Interleave memory: for each Interleave Group we marked earlier as relevant

  // for this VPlan, replace the Recipes widening its memory instructions with a

  // single VPInterleaveRecipe at its insertion point.

  VPDominatorTree VPDT;

  VPDT.recalculate(Plan);

  for (const auto *IG : InterleaveGroups) {

    auto *Start =

        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));

    VPIRMetadata InterleaveMD(*Start);

    SmallVector<VPValue *, 4> StoredValues;

    if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))

      StoredValues.push_back(StoreR->getStoredValue());

    for (unsigned I = 1; I < IG->getFactor(); ++I) {

      Instruction *MemberI = IG->getMember(I);

      if (!MemberI)

        continue;

      VPWidenMemoryRecipe *MemoryR =

          cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));

      if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))

        StoredValues.push_back(StoreR->getStoredValue());

      InterleaveMD.intersect(*MemoryR);

    }


    bool NeedsMaskForGaps =

        (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||

        (!StoredValues.empty() && !IG->isFull());


    Instruction *IRInsertPos = IG->getInsertPos();

    auto *InsertPos =

        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));


    GEPNoWrapFlags NW = GEPNoWrapFlags::none();

    if (auto *Gep = dyn_cast<GetElementPtrInst>(

            getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))

      NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();


    // Get or create the start address for the interleave group.

    VPValue *Addr = Start->getAddr();

    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();

    if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {

      // We cannot re-use the address of member zero because it does not

      // dominate the insert position. Instead, use the address of the insert

      // position and create a PtrAdd adjusting it to the address of member

      // zero.

      // TODO: Hoist Addr's defining recipe (and any operands as needed) to

      // InsertPos or sink loads above zero members to join it.

      assert(IG->getIndex(IRInsertPos) != 0 &&

             "index of insert position shouldn't be zero");

      auto &DL = IRInsertPos->getDataLayout();

      APInt Offset(32,

                   DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *

                       IG->getIndex(IRInsertPos),

                   /*IsSigned=*/true);

      VPValue *OffsetVPV =

          Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));

      VPBuilder B(InsertPos);

      Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);

    }

    // If the group is reverse, adjust the index to refer to the last vector

    // lane instead of the first. We adjust the index from the first vector

    // lane, rather than directly getting the pointer for lane VF - 1, because

    // the pointer operand of the interleaved access is supposed to be uniform.

    if (IG->isReverse()) {

      auto *ReversePtr = new VPVectorEndPointerRecipe(

          Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),

          -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());

      ReversePtr->insertBefore(InsertPos);

      Addr = ReversePtr;

    }

    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,

                                        InsertPos->getMask(), NeedsMaskForGaps,

                                        InterleaveMD, InsertPos->getDebugLoc());

    VPIG->insertBefore(InsertPos);


    unsigned J = 0;

    for (unsigned i = 0; i < IG->getFactor(); ++i)

      if (Instruction *Member = IG->getMember(i)) {

        VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);

        if (!Member->getType()->isVoidTy()) {

          VPValue *OriginalV = MemberR->getVPSingleValue();

          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));

          J++;

        }

        MemberR->eraseFromParent();

      }

  }

}


/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial

/// value, phi and backedge value. In the following example:

///

///  vector.ph:

///  Successor(s): vector loop

///

///  <x1> vector loop: {

///    vector.body:

///      WIDEN-INDUCTION %i = phi %start, %step, %vf

///      ...

///      EMIT branch-on-count ...

///    No successors

///  }

///

/// WIDEN-INDUCTION will get expanded to:

///

///  vector.ph:

///    ...

///    vp<%induction.start> = ...

///    vp<%induction.increment> = ...

///

///  Successor(s): vector loop

///

///  <x1> vector loop: {

///    vector.body:

///      ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>

///      ...

///      vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>

///      EMIT branch-on-count ...

///    No successors

///  }

static void

expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,

                              VPTypeAnalysis &TypeInfo) {

  VPlan *Plan = WidenIVR->getParent()->getPlan();

  VPValue *Start = WidenIVR->getStartValue();

  VPValue *Step = WidenIVR->getStepValue();

  VPValue *VF = WidenIVR->getVFValue();

  DebugLoc DL = WidenIVR->getDebugLoc();


  // The value from the original loop to which we are mapping the new induction

  // variable.

  Type *Ty = TypeInfo.inferScalarType(WidenIVR);


  const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();

  Instruction::BinaryOps AddOp;

  Instruction::BinaryOps MulOp;

  // FIXME: The newly created binary instructions should contain nsw/nuw

  // flags, which can be found from the original scalar operations.

  VPIRFlags Flags;

  if (ID.getKind() == InductionDescriptor::IK_IntInduction) {

    AddOp = Instruction::Add;

    MulOp = Instruction::Mul;

  } else {

    AddOp = ID.getInductionOpcode();

    MulOp = Instruction::FMul;

    Flags = ID.getInductionBinOp()->getFastMathFlags();

  }


  // If the phi is truncated, truncate the start and step values.

  VPBuilder Builder(Plan->getVectorPreheader());

  Type *StepTy = TypeInfo.inferScalarType(Step);

  if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {

    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");

    Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);

    Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);

    StepTy = Ty;

  }


  // Construct the initial value of the vector IV in the vector loop preheader.

  Type *IVIntTy =

      IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());

  VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);

  if (StepTy->isFloatingPointTy())

    Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);


  VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);

  VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);


  Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);

  Init =

      Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");


  // Create the widened phi of the vector IV.

  auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,

                                       WidenIVR->getDebugLoc(), "vec.ind");

  WidePHI->addOperand(Init);

  WidePHI->insertBefore(WidenIVR);


  // Create the backedge value for the vector IV.

  VPValue *Inc;

  VPValue *Prev;

  // If unrolled, use the increment and prev value from the operands.

  if (auto *SplatVF = WidenIVR->getSplatVFValue()) {

    Inc = SplatVF;

    Prev = WidenIVR->getLastUnrolledPartOperand();

  } else {

    if (VPRecipeBase *R = VF->getDefiningRecipe())

      Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));

    // Multiply the vectorization factor by the step using integer or

    // floating-point arithmetic as appropriate.

    if (StepTy->isFloatingPointTy())

      VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,

                                    DL);

    else

      VF = Builder.createScalarZExtOrTrunc(VF, StepTy,

                                           TypeInfo.inferScalarType(VF), DL);


    Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);

    Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);

    Prev = WidePHI;

  }


  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();

  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());

  auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,

                                    WidenIVR->getDebugLoc(), "vec.ind.next");


  WidePHI->addOperand(Next);


  WidenIVR->replaceAllUsesWith(WidePHI);

}


/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the

/// initial value, phi and backedge value. In the following example:

///

///  <x1> vector loop: {

///    vector.body:

///      EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf

///      ...

///      EMIT branch-on-count ...

///  }

///

/// WIDEN-POINTER-INDUCTION will get expanded to:

///

///  <x1> vector loop: {

///    vector.body:

///      EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind

///      EMIT %mul = mul %stepvector, %step

///      EMIT %vector.gep = wide-ptradd %pointer.phi, %mul

///      ...

///      EMIT %ptr.ind = ptradd %pointer.phi, %vf

///      EMIT branch-on-count ...

///  }

static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,

                                          VPTypeAnalysis &TypeInfo) {

  VPlan *Plan = R->getParent()->getPlan();

  VPValue *Start = R->getStartValue();

  VPValue *Step = R->getStepValue();

  VPValue *VF = R->getVFValue();


  assert(R->getInductionDescriptor().getKind() ==

             InductionDescriptor::IK_PtrInduction &&

         "Not a pointer induction according to InductionDescriptor!");

  assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");

  assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&

         "Recipe should have been replaced");


  VPBuilder Builder(R);

  DebugLoc DL = R->getDebugLoc();


  // Build a scalar pointer phi.

  VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");


  // Create actual address geps that use the pointer phi as base and a

  // vectorized version of the step value (<step*0, ..., step*N>) as offset.

  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());

  Type *StepTy = TypeInfo.inferScalarType(Step);

  VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);

  Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});

  VPValue *PtrAdd = Builder.createNaryOp(

      VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");

  R->replaceAllUsesWith(PtrAdd);


  // Create the backedge value for the scalar pointer phi.

  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();

  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());

  VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),

                                       DL);

  VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});


  VPValue *InductionGEP =

      Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");

  ScalarPtrPhi->addOperand(InductionGEP);

}


void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {

  // Replace loop regions with explicity CFG.

  SmallVector<VPRegionBlock *> LoopRegions;

  for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(

           vp_depth_first_deep(Plan.getEntry()))) {

    if (!R->isReplicator())

      LoopRegions.push_back(R);

  }

  for (VPRegionBlock *R : LoopRegions)

    R->dissolveToCFGLoop();

}


void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {

  VPTypeAnalysis TypeInfo(Plan);

  SmallVector<VPRecipeBase *> ToRemove;

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_deep(Plan.getEntry()))) {

    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {

        expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);

        ToRemove.push_back(WidenIVR);

        continue;

      }


      if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {

        expandVPWidenPointerInduction(WidenIVR, TypeInfo);

        ToRemove.push_back(WidenIVR);

        continue;

      }


      // Expand VPBlendRecipe into VPInstruction::Select.

      VPBuilder Builder(&R);

      if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {

        VPValue *Select = Blend->getIncomingValue(0);

        for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)

          Select = Builder.createSelect(Blend->getMask(I),

                                        Blend->getIncomingValue(I), Select,

                                        R.getDebugLoc(), "predphi");

        Blend->replaceAllUsesWith(Select);

        ToRemove.push_back(Blend);

      }


      if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {

        Expr->decompose();

        ToRemove.push_back(Expr);

      }


      VPValue *VectorStep;

      VPValue *ScalarStep;

      if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(

                         m_VPValue(VectorStep), m_VPValue(ScalarStep))))

        continue;


      // Expand WideIVStep.

      auto *VPI = cast<VPInstruction>(&R);

      Type *IVTy = TypeInfo.inferScalarType(VPI);

      if (TypeInfo.inferScalarType(VectorStep) != IVTy) {

        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()

                                          ? Instruction::UIToFP

                                          : Instruction::Trunc;

        VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);

      }


      [[maybe_unused]] auto *ConstStep =

          ScalarStep->isLiveIn()

              ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())

              : nullptr;

      assert(!ConstStep || ConstStep->getValue() != 1);

      (void)ConstStep;

      if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {

        ScalarStep =

            Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);

      }


      VPIRFlags Flags;

      if (IVTy->isFloatingPointTy())

        Flags = {VPI->getFastMathFlags()};


      unsigned MulOpc =

          IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;

      VPInstruction *Mul = Builder.createNaryOp(

          MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());

      VectorStep = Mul;

      VPI->replaceAllUsesWith(VectorStep);

      ToRemove.push_back(VPI);

    }

  }


  for (VPRecipeBase *R : ToRemove)

    R->eraseFromParent();

}


void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,

                                                 VPBasicBlock *EarlyExitVPBB,

                                                 VPlan &Plan,

                                                 VPBasicBlock *HeaderVPBB,

                                                 VPBasicBlock *LatchVPBB) {

  VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];

  if (!EarlyExitVPBB->getSinglePredecessor() &&

      EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {

    assert(EarlyExitVPBB->getNumPredecessors() == 2 &&

           EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&

           "unsupported early exit VPBB");

    // Early exit operand should always be last phi operand. If EarlyExitVPBB

    // has two predecessors and EarlyExitingVPBB is the first, swap the operands

    // of the phis.

    for (VPRecipeBase &R : EarlyExitVPBB->phis())

      cast<VPIRPhi>(&R)->swapOperands();

  }


  VPBuilder Builder(LatchVPBB->getTerminator());

  VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];

  assert(

      match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&

      "Terminator must be be BranchOnCond");

  VPValue *CondOfEarlyExitingVPBB =

      EarlyExitingVPBB->getTerminator()->getOperand(0);

  auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB

                              ? CondOfEarlyExitingVPBB

                              : Builder.createNot(CondOfEarlyExitingVPBB);


  // Split the middle block and have it conditionally branch to the early exit

  // block if CondToEarlyExit.

  VPValue *IsEarlyExitTaken =

      Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});

  VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");

  VPBasicBlock *VectorEarlyExitVPBB =

      Plan.createVPBasicBlock("vector.early.exit");

  VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);

  VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);

  NewMiddle->swapSuccessors();


  VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);


  // Update the exit phis in the early exit block.

  VPBuilder MiddleBuilder(NewMiddle);

  VPBuilder EarlyExitB(VectorEarlyExitVPBB);

  for (VPRecipeBase &R : EarlyExitVPBB->phis()) {

    auto *ExitIRI = cast<VPIRPhi>(&R);

    // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has

    // a single predecessor and 1 if it has two.

    unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;

    if (ExitIRI->getNumOperands() != 1) {

      // The first of two operands corresponds to the latch exit, via MiddleVPBB

      // predecessor. Extract its last lane.

      ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);

    }


    VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);

    if (!IncomingFromEarlyExit->isLiveIn()) {

      // Update the incoming value from the early exit.

      VPValue *FirstActiveLane = EarlyExitB.createNaryOp(

          VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,

          "first.active.lane");

      IncomingFromEarlyExit = EarlyExitB.createNaryOp(

          VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},

          nullptr, "early.exit.value");

      ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);

    }

  }

  MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});


  // Replace the condition controlling the non-early exit from the vector loop

  // with one exiting if either the original condition of the vector latch is

  // true or the early exit has been taken.

  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());

  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&

         "Unexpected terminator");

  auto *IsLatchExitTaken =

      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),

                         LatchExitingBranch->getOperand(1));

  auto *AnyExitTaken = Builder.createNaryOp(

      Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});

  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);

  LatchExitingBranch->eraseFromParent();

}


/// This function tries convert extended in-loop reductions to

/// VPExpressionRecipe and clamp the \p Range if it is beneficial and

/// valid. The created recipe must be decomposed to its constituent

/// recipes before execution.

static VPExpressionRecipe *

tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,

                                     VFRange &Range) {

  Type *RedTy = Ctx.Types.inferScalarType(Red);

  VPValue *VecOp = Red->getVecOp();


  // Clamp the range if using extended-reduction is profitable.

  auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,

                                             Type *SrcTy) -> bool {

    return LoopVectorizationPlanner::getDecisionAndClampRange(

        [&](ElementCount VF) {

          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));

          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

          InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(

              Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(),

              CostKind);

          InstructionCost ExtCost =

              cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);

          InstructionCost RedCost = Red->computeCost(VF, Ctx);

          return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;

        },

        Range);

  };


  VPValue *A;

  // Match reduce(ext)).

  if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&

      IsExtendedRedValidAndClampRange(

          RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),

          cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==

              Instruction::CastOps::ZExt,

          Ctx.Types.inferScalarType(A)))

    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);


  return nullptr;

}


/// This function tries convert extended in-loop reductions to

/// VPExpressionRecipe and clamp the \p Range if it is beneficial

/// and valid. The created VPExpressionRecipe must be decomposed to its

/// constituent recipes before execution. Patterns of the

/// VPExpressionRecipe:

///   reduce.add(mul(...)),

///   reduce.add(mul(ext(A), ext(B))),

///   reduce.add(ext(mul(ext(A), ext(B)))).

static VPExpressionRecipe *

tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,

                                          VPCostContext &Ctx, VFRange &Range) {

  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());

  if (Opcode != Instruction::Add)

    return nullptr;


  Type *RedTy = Ctx.Types.inferScalarType(Red);


  // Clamp the range if using multiply-accumulate-reduction is profitable.

  auto IsMulAccValidAndClampRange =

      [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,

          VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {

    return LoopVectorizationPlanner::getDecisionAndClampRange(

        [&](ElementCount VF) {

          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

          Type *SrcTy =

              Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;

          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));

          InstructionCost MulAccCost =

              Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);

          InstructionCost MulCost = Mul->computeCost(VF, Ctx);

          InstructionCost RedCost = Red->computeCost(VF, Ctx);

          InstructionCost ExtCost = 0;

          if (Ext0)

            ExtCost += Ext0->computeCost(VF, Ctx);

          if (Ext1)

            ExtCost += Ext1->computeCost(VF, Ctx);

          if (OuterExt)

            ExtCost += OuterExt->computeCost(VF, Ctx);


          return MulAccCost.isValid() &&

                 MulAccCost < ExtCost + MulCost + RedCost;

        },

        Range);

  };


  VPValue *VecOp = Red->getVecOp();

  VPValue *A, *B;

  // Try to match reduce.add(mul(...)).

  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {

    auto *RecipeA =

        dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());

    auto *RecipeB =

        dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());

    auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());


    // Match reduce.add(mul(ext, ext)).

    if (RecipeA && RecipeB &&

        (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&

        match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&

        match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&

        IsMulAccValidAndClampRange(RecipeA->getOpcode() ==

                                       Instruction::CastOps::ZExt,

                                   Mul, RecipeA, RecipeB, nullptr)) {

      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);

    }

    // Match reduce.add(mul).

    if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))

      return new VPExpressionRecipe(Mul, Red);

  }

  // Match reduce.add(ext(mul(ext(A), ext(B)))).

  // All extend recipes must have same opcode or A == B

  // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).

  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),

                                      m_ZExtOrSExt(m_VPValue()))))) {

    auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());

    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());

    auto *Ext0 =

        cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());

    auto *Ext1 =

        cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());

    if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&

        Ext0->getOpcode() == Ext1->getOpcode() &&

        IsMulAccValidAndClampRange(Ext0->getOpcode() ==

                                       Instruction::CastOps::ZExt,

                                   Mul, Ext0, Ext1, Ext)) {

      auto *NewExt0 = new VPWidenCastRecipe(

          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,

          Ext0->getDebugLoc());

      NewExt0->insertBefore(Ext0);


      VPWidenCastRecipe *NewExt1 = NewExt0;

      if (Ext0 != Ext1) {

        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),

                                        Ext->getResultType(), *Ext1,

                                        Ext1->getDebugLoc());

        NewExt1->insertBefore(Ext1);

      }

      Mul->setOperand(0, NewExt0);

      Mul->setOperand(1, NewExt1);

      Red->setOperand(1, Mul);

      return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);

    }

  }

  return nullptr;

}


/// This function tries to create abstract recipes from the reduction recipe for

/// following optimizations and cost estimation.

static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,

                                               VPCostContext &Ctx,

                                               VFRange &Range) {

  VPExpressionRecipe *AbstractR = nullptr;

  auto IP = std::next(Red->getIterator());

  auto *VPBB = Red->getParent();

  if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))

    AbstractR = MulAcc;

  else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))

    AbstractR = ExtRed;

  // Cannot create abstract inloop reduction recipes.

  if (!AbstractR)

    return;


  AbstractR->insertBefore(*VPBB, IP);

  Red->replaceAllUsesWith(AbstractR);

}


void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,

                                               VFRange &Range) {

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(

           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {

    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      if (auto *Red = dyn_cast<VPReductionRecipe>(&R))

        tryToCreateAbstractReductionRecipe(Red, Ctx, Range);

    }

  }

}


void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {

  if (Plan.hasScalarVFOnly())

    return;


#ifndef NDEBUG

  VPDominatorTree VPDT;

  VPDT.recalculate(Plan);

#endif


  SmallVector<VPValue *> VPValues;

  if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)

    VPValues.push_back(Plan.getOrCreateBackedgeTakenCount());

  append_range(VPValues, Plan.getLiveIns());

  for (VPRecipeBase &R : *Plan.getEntry())

    append_range(VPValues, R.definedValues());


  auto *VectorPreheader = Plan.getVectorPreheader();

  for (VPValue *VPV : VPValues) {

    if (vputils::onlyScalarValuesUsed(VPV) ||

        (VPV->isLiveIn() && VPV->getLiveInIRValue() &&

         isa<Constant>(VPV->getLiveInIRValue())))

      continue;


    // Add explicit broadcast at the insert point that dominates all users.

    VPBasicBlock *HoistBlock = VectorPreheader;

    VPBasicBlock::iterator HoistPoint = VectorPreheader->end();

    for (VPUser *User : VPV->users()) {

      if (User->usesScalars(VPV))

        continue;

      if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)

        HoistPoint = HoistBlock->begin();

      else

        assert(VPDT.dominates(VectorPreheader,

                              cast<VPRecipeBase>(User)->getParent()) &&

               "All users must be in the vector preheader or dominated by it");

    }


    VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);

    auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});

    VPV->replaceUsesWithIf(Broadcast,

                           [VPV, Broadcast](VPUser &U, unsigned Idx) {

                             return Broadcast != &U && !U.usesScalars(VPV);

                           });

  }

}


void VPlanTransforms::materializeConstantVectorTripCount(

    VPlan &Plan, ElementCount BestVF, unsigned BestUF,

    PredicatedScalarEvolution &PSE) {

  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");

  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");


  VPValue *TC = Plan.getTripCount();

  // Skip cases for which the trip count may be non-trivial to materialize.

  // I.e., when a scalar tail is absent - due to tail folding, or when a scalar

  // tail is required.

  if (!Plan.hasScalarTail() ||

      Plan.getMiddleBlock()->getSingleSuccessor() ==

          Plan.getScalarPreheader() ||

      !TC->isLiveIn())

    return;


  // Materialize vector trip counts for constants early if it can simply

  // be computed as (Original TC / VF * UF) * VF * UF.

  // TODO: Compute vector trip counts for loops requiring a scalar epilogue and

  // tail-folded loops.

  ScalarEvolution &SE = *PSE.getSE();

  auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());

  if (!isa<SCEVConstant>(TCScev))

    return;

  const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);

  auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);

  if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))

    Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());

}


void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,

                                                    VPBasicBlock *VectorPH) {

  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();

  if (BTC->getNumUsers() == 0)

    return;


  VPBuilder Builder(VectorPH, VectorPH->begin());

  auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());

  auto *TCMO = Builder.createNaryOp(

      Instruction::Sub,

      {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},

      DebugLoc::getCompilerGenerated(), "trip.count.minus.1");

  BTC->replaceAllUsesWith(TCMO);

}


void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {

  if (Plan.hasScalarVFOnly())

    return;


  VPTypeAnalysis TypeInfo(Plan);

  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();

  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(

      vp_depth_first_shallow(Plan.getEntry()));

  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(

      vp_depth_first_shallow(LoopRegion->getEntry()));

  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,

  // excluding ones in replicate regions. Those are not materialized explicitly

  // yet. Those vector users are still handled in VPReplicateRegion::execute(),

  // via shouldPack().

  // TODO: materialize build vectors for replicating recipes in replicating

  // regions.

  // TODO: materialize build vectors for VPInstructions.

  for (VPBasicBlock *VPBB :

       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {

    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {

      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);

      auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {

        VPRegionBlock *ParentRegion =

            cast<VPRecipeBase>(U)->getParent()->getParent();

        return !U->usesScalars(RepR) || ParentRegion != LoopRegion;

      };

      if (!RepR || RepR->isSingleScalar() ||

          none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))

        continue;


      Type *ScalarTy = TypeInfo.inferScalarType(RepR);

      unsigned Opcode = ScalarTy->isStructTy()

                            ? VPInstruction::BuildStructVector

                            : VPInstruction::BuildVector;

      auto *BuildVector = new VPInstruction(Opcode, {RepR});

      BuildVector->insertAfter(RepR);


      RepR->replaceUsesWithIf(

          BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](

                           VPUser &U, unsigned) {

            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);

          });

    }

  }

}


void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,

                                                 VPBasicBlock *VectorPHVPBB,

                                                 bool TailByMasking,

                                                 bool RequiresScalarEpilogue) {

  VPValue &VectorTC = Plan.getVectorTripCount();

  assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");

  // There's nothing to do if there are no users of the vector trip count or its

  // IR value has already been set.

  if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())

    return;


  VPValue *TC = Plan.getTripCount();

  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);

  VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());

  VPValue *Step = &Plan.getVFxUF();


  // If the tail is to be folded by masking, round the number of iterations N

  // up to a multiple of Step instead of rounding down. This is done by first

  // adding Step-1 and then rounding down. Note that it's ok if this addition

  // overflows: the vector induction variable will eventually wrap to zero given

  // that it starts at zero and its Step is a power of two; the loop will then

  // exit, with the last early-exit vector comparison also producing all-true.

  // For scalable vectors the VF is not guaranteed to be a power of 2, but this

  // is accounted for in emitIterationCountCheck that adds an overflow check.

  if (TailByMasking) {

    TC = Builder.createNaryOp(

        Instruction::Add,

        {TC, Builder.createNaryOp(

                 Instruction::Sub,

                 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},

        DebugLoc::getCompilerGenerated(), "n.rnd.up");

  }


  // Now we need to generate the expression for the part of the loop that the

  // vectorized body will execute. This is equal to N - (N % Step) if scalar

  // iterations are not required for correctness, or N - Step, otherwise. Step

  // is equal to the vectorization factor (number of SIMD elements) times the

  // unroll factor (number of SIMD instructions).

  VPValue *R =

      Builder.createNaryOp(Instruction::URem, {TC, Step},

                           DebugLoc::getCompilerGenerated(), "n.mod.vf");


  // There are cases where we *must* run at least one iteration in the remainder

  // loop.  See the cost model for when this can happen.  If the step evenly

  // divides the trip count, we set the remainder to be equal to the step. If

  // the step does not evenly divide the trip count, no adjustment is necessary

  // since there will already be scalar iterations. Note that the minimum

  // iterations check ensures that N >= Step.

  if (RequiresScalarEpilogue) {

    assert(!TailByMasking &&

           "requiring scalar epilogue is not supported with fail folding");

    VPValue *IsZero = Builder.createICmp(

        CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));

    R = Builder.createSelect(IsZero, Step, R);

  }


  VPValue *Res = Builder.createNaryOp(

      Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");

  VectorTC.replaceAllUsesWith(Res);

}


void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,

                                            ElementCount VFEC) {

  VPBuilder Builder(VectorPH, VectorPH->begin());

  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());

  VPValue &VF = Plan.getVF();

  VPValue &VFxUF = Plan.getVFxUF();

  // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be

  // used.

  // TODO: Assert that they aren't used.


  // If there are no users of the runtime VF, compute VFxUF by constant folding

  // the multiplication of VF and UF.

  if (VF.getNumUsers() == 0) {

    VPValue *RuntimeVFxUF =

        Builder.createElementCount(TCTy, VFEC * Plan.getUF());

    VFxUF.replaceAllUsesWith(RuntimeVFxUF);

    return;

  }


  // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *

  // vscale) * UF.

  VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);

  if (!vputils::onlyScalarValuesUsed(&VF)) {

    VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);

    VF.replaceUsesWithIf(

        BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });

  }

  VF.replaceAllUsesWith(RuntimeVF);


  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));

  VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});

  VFxUF.replaceAllUsesWith(MulByUF);

}


DenseMap<const SCEV *, Value *>

VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {

  const DataLayout &DL = SE.getDataLayout();

  SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);


  auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());

  BasicBlock *EntryBB = Entry->getIRBasicBlock();

  DenseMap<const SCEV *, Value *> ExpandedSCEVs;

  for (VPRecipeBase &R : make_early_inc_range(*Entry)) {

    if (isa<VPIRInstruction, VPIRPhi>(&R))

      continue;

    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);

    if (!ExpSCEV)

      break;

    const SCEV *Expr = ExpSCEV->getSCEV();

    Value *Res =

        Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());

    ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;

    VPValue *Exp = Plan.getOrAddLiveIn(Res);

    ExpSCEV->replaceAllUsesWith(Exp);

    if (Plan.getTripCount() == ExpSCEV)

      Plan.resetTripCount(Exp);

    ExpSCEV->eraseFromParent();

  }

  assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&

         "VPExpandSCEVRecipes must be at the beginning of the entry block, "

         "after any VPIRInstructions");

  // Add IR instructions in the entry basic block but not in the VPIRBasicBlock

  // to the VPIRBasicBlock.

  auto EI = Entry->begin();

  for (Instruction &I : drop_end(*EntryBB)) {

    if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&

        &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {

      EI++;

      continue;

    }

    VPIRInstruction::create(I)->insertBefore(*Entry, EI);

  }


  return ExpandedSCEVs;

}


/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be

/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a

/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding

/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to

/// an index-independent load if it feeds all wide ops at all indices (\p OpV

/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p

/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V

/// is defined at \p Idx of a load interleave group.

static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,

                          VPValue *OpV, unsigned Idx) {

  auto *DefR = OpV->getDefiningRecipe();

  if (!DefR)

    return WideMember0->getOperand(OpIdx) == OpV;

  if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))

    return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;


  if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))

    return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;

  return false;

}


/// Returns true if \p IR is a full interleave group with factor and number of

/// members both equal to \p VF. The interleave group must also access the full

/// vector width \p VectorRegWidth.

static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,

                                         unsigned VF, VPTypeAnalysis &TypeInfo,

                                         unsigned VectorRegWidth) {

  if (!InterleaveR)

    return false;


  Type *GroupElementTy = nullptr;

  if (InterleaveR->getStoredValues().empty()) {

    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));

    if (!all_of(InterleaveR->definedValues(),

                [&TypeInfo, GroupElementTy](VPValue *Op) {

                  return TypeInfo.inferScalarType(Op) == GroupElementTy;

                }))

      return false;

  } else {

    GroupElementTy =

        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);

    if (!all_of(InterleaveR->getStoredValues(),

                [&TypeInfo, GroupElementTy](VPValue *Op) {

                  return TypeInfo.inferScalarType(Op) == GroupElementTy;

                }))

      return false;

  }


  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;

  auto IG = InterleaveR->getInterleaveGroup();

  return IG->getFactor() == VF && IG->getNumMembers() == VF &&

         GroupSize == VectorRegWidth;

}


/// Returns true if \p VPValue is a narrow VPValue.

static bool isAlreadyNarrow(VPValue *VPV) {

  if (VPV->isLiveIn())

    return true;

  auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);

  return RepR && RepR->isSingleScalar();

}


void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,

                                             unsigned VectorRegWidth) {

  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();

  if (VF.isScalable() || !VectorLoop)

    return;


  VPTypeAnalysis TypeInfo(Plan);


  unsigned FixedVF = VF.getFixedValue();

  SmallVector<VPInterleaveRecipe *> StoreGroups;

  for (auto &R : *VectorLoop->getEntryBasicBlock()) {

    if (isa<VPCanonicalIVPHIRecipe>(&R) ||

        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))

      continue;


    if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&

        vputils::onlyFirstLaneUsed(cast<VPSingleDefRecipe>(&R)))

      continue;


    // Bail out on recipes not supported at the moment:

    //  * phi recipes other than the canonical induction

    //  * recipes writing to memory except interleave groups

    // Only support plans with a canonical induction phi.

    if (R.isPhi())

      return;


    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);

    if (R.mayWriteToMemory() && !InterleaveR)

      return;


    // All other ops are allowed, but we reject uses that cannot be converted

    // when checking all allowed consumers (store interleave groups) below.

    if (!InterleaveR)

      continue;


    // Bail out on non-consecutive interleave groups.

    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,

                                      VectorRegWidth))

      return;


    // Skip read interleave groups.

    if (InterleaveR->getStoredValues().empty())

      continue;


    // Narrow interleave groups, if all operands are already matching narrow

    // ops.

    auto *Member0 = InterleaveR->getStoredValues()[0];

    if (isAlreadyNarrow(Member0) &&

        all_of(InterleaveR->getStoredValues(),

               [Member0](VPValue *VPV) { return Member0 == VPV; })) {

      StoreGroups.push_back(InterleaveR);

      continue;

    }


    // For now, we only support full interleave groups storing load interleave

    // groups.

    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {

          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();

          if (!DefR)

            return false;

          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);

          return IR && IR->getInterleaveGroup()->isFull() &&

                 IR->getVPValue(Op.index()) == Op.value();

        })) {

      StoreGroups.push_back(InterleaveR);

      continue;

    }


    // Check if all values feeding InterleaveR are matching wide recipes, which

    // operands that can be narrowed.

    auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(

        InterleaveR->getStoredValues()[0]->getDefiningRecipe());

    if (!WideMember0)

      return;

    for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {

      auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());

      if (!R || R->getOpcode() != WideMember0->getOpcode() ||

          R->getNumOperands() > 2)

        return;

      if (any_of(enumerate(R->operands()),

                 [WideMember0, Idx = I](const auto &P) {

                   const auto &[OpIdx, OpV] = P;

                   return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);

                 }))

        return;

    }

    StoreGroups.push_back(InterleaveR);

  }


  if (StoreGroups.empty())

    return;


  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.

  auto NarrowOp = [](VPValue *V) -> VPValue * {

    auto *R = V->getDefiningRecipe();

    if (!R)

      return V;

    if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {

      // Narrow interleave group to wide load, as transformed VPlan will only

      // process one original iteration.

      auto *L = new VPWidenLoadRecipe(

          *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),

          LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,

          /*Reverse=*/false, {}, LoadGroup->getDebugLoc());

      L->insertBefore(LoadGroup);

      return L;

    }


    if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {

      assert(RepR->isSingleScalar() &&

             isa<LoadInst>(RepR->getUnderlyingInstr()) &&

             "must be a single scalar load");

      return RepR;

    }

    auto *WideLoad = cast<VPWidenLoadRecipe>(R);


    VPValue *PtrOp = WideLoad->getAddr();

    if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))

      PtrOp = VecPtr->getOperand(0);

    // Narrow wide load to uniform scalar load, as transformed VPlan will only

    // process one original iteration.

    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},

                                    /*IsUniform*/ true,

                                    /*Mask*/ nullptr, *WideLoad);

    N->insertBefore(WideLoad);

    return N;

  };


  // Narrow operation tree rooted at store groups.

  for (auto *StoreGroup : StoreGroups) {

    VPValue *Res = nullptr;

    VPValue *Member0 = StoreGroup->getStoredValues()[0];

    if (isAlreadyNarrow(Member0)) {

      Res = Member0;

    } else if (auto *WideMember0 =

                   dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {

      for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)

        WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));

      Res = WideMember0;

    } else {

      Res = NarrowOp(Member0);

    }


    auto *S = new VPWidenStoreRecipe(

        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),

        StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,

        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());

    S->insertBefore(StoreGroup);

    StoreGroup->eraseFromParent();

  }


  // Adjust induction to reflect that the transformed plan only processes one

  // original iteration.

  auto *CanIV = Plan.getCanonicalIV();

  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());

  Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(

                         CanIV->getScalarType(), 1 * Plan.getUF())));

  Plan.getVF().replaceAllUsesWith(

      Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));

  removeDeadRecipes(Plan);

  assert(none_of(*VectorLoop->getEntryBasicBlock(),

                 IsaPred<VPVectorPointerRecipe>) &&

         "All VPVectorPointerRecipes should have been removed");

}


/// Add branch weight metadata, if the \p Plan's middle block is terminated by a

/// BranchOnCond recipe.

void VPlanTransforms::addBranchWeightToMiddleTerminator(

    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();

  auto *MiddleTerm =

      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());

  // Only add branch metadata if there is a (conditional) terminator.

  if (!MiddleTerm)

    return;


  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&

         "must have a BranchOnCond");

  // Assume that `TripCount % VectorStep ` is equally distributed.

  unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();

  if (VF.isScalable() && VScaleForTuning.has_value())

    VectorStep *= *VScaleForTuning;

  assert(VectorStep > 0 && "trip count should not be zero");

  MDBuilder MDB(Plan.getContext());

  MDNode *BranchWeights =

      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);

  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);

}

for
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
Definition: AArch64ExpandPseudoInsts.cpp:114

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

Select
AMDGPU Register Bank Select
Definition: AMDGPURegBankSelect.cpp:68

APInt.h
This file implements a class to represent arbitrary precision integral constant values and operations...

ToRemove
ReachingDefAnalysis InstSet & ToRemove
Definition: ARMLowOverheadLoops.cpp:529

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: ARMSLSHardening.cpp:73

getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:882

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

Casting.h

CostKind
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))

Idx
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Definition: DeadArgumentElimination.cpp:347

Addr
uint64_t Addr
Definition: ELFObjHandler.cpp:79

X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")

GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:164

IVDescriptors.h

Users
iv Induction Variable Users
Definition: IVUsers.cpp:48

InstSimplifyFolder.h

Intrinsics.h

licm
licm
Definition: LICM.cpp:381

IR
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80

LoopInfo.h

mergeBlocksIntoPredecessors
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
Definition: LoopSimplifyCFG.cpp:680

I
#define I(x, y, z)
Definition: MD5.cpp:58

MDBuilder.h

Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:74

OpIdx
MachineInstr unsigned OpIdx
Definition: NVPTXPrologEpilogPass.cpp:56

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

II
uint64_t IntrinsicInst * II
Definition: NVVMIntrRange.cpp:46

Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")

P
#define P(N)

if
if(PassOpts->AAPipeline)
Definition: PassBuilderBindings.cpp:64

PostOrderIterator.h
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition: RISCVRedundantCopyElimination.cpp:71

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

ScalarEvolutionExpander.h

SetVector.h
This file implements a set that has insertion order iteration characteristics.

getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39

TypeSize.h

TypeSwitch.h
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...

VPRecipeBuilder.h

VPlanAnalysis.h

VPlanCFG.h

VPlanDominatorTree.h
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.

VPlanHelpers.h
This file contains the declarations of different VPlan-related auxiliary helpers.

VPlanPatternMatch.h

optimizeLatchExitInductionUser
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
Definition: VPlanTransforms.cpp:832

tryToCreateAbstractReductionRecipe
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
Definition: VPlanTransforms.cpp:3250

sinkScalarOperands
static bool sinkScalarOperands(VPlan &Plan)
Definition: VPlanTransforms.cpp:128

simplifyBranchConditionForVFAndUF
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
Definition: VPlanTransforms.cpp:1479

tryToFoldLiveIns
static Value * tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
Definition: VPlanTransforms.cpp:943

removeRedundantInductionCasts
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
Definition: VPlanTransforms.cpp:465

tryToMatchAndCreateExtendedReduction
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
Definition: VPlanTransforms.cpp:3106

createScalarIVSteps
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
Definition: VPlanTransforms.cpp:575

sinkRecurrenceUsersAfterPrevious
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
Definition: VPlanTransforms.cpp:1585

mergeReplicateRegionsIntoSuccessors
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
Definition: VPlanTransforms.cpp:249

addVPLaneMaskPhiAndUpdateExitBranch
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
Definition: VPlanTransforms.cpp:2010

expandVPWidenPointerInduction
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
Definition: VPlanTransforms.cpp:2882

transformRecipestoEVLRecipes
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
Definition: VPlanTransforms.cpp:2238

isDeadRecipe
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
Definition: VPlanTransforms.cpp:531

legalizeAndOptimizeInductions
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
Definition: VPlanTransforms.cpp:637

addReplicateRegions
static void addReplicateRegions(VPlan &Plan)
Definition: VPlanTransforms.cpp:381

simplifyRecipe
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
Definition: VPlanTransforms.cpp:994

removeRedundantExpandSCEVRecipes
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
Definition: VPlanTransforms.cpp:904

isConditionTrueViaVFAndUF
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
Definition: VPlanTransforms.cpp:1447

isConsecutiveInterleaveGroup
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
Definition: VPlanTransforms.cpp:3577

hoistPreviousBeforeFORUsers
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
Definition: VPlanTransforms.cpp:1640

collectUsersRecursively
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
Definition: VPlanTransforms.cpp:613

recursivelyDeleteDeadRecipes
static void recursivelyDeleteDeadRecipes(VPValue *V)
Definition: VPlanTransforms.cpp:921

optimizeEarlyExitInductionUser
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
Definition: VPlanTransforms.cpp:777

getOptimizableIVOf
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
Definition: VPlanTransforms.cpp:719

createReplicateRegion
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
Definition: VPlanTransforms.cpp:338

getPredicatedThenBlock
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
Definition: VPlanTransforms.cpp:227

simplifyBlends
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
Definition: VPlanTransforms.cpp:1297

isAlreadyNarrow
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
Definition: VPlanTransforms.cpp:3608

optimizeVectorInductionWidthForTCAndVFUF
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
Definition: VPlanTransforms.cpp:1376

getPredicatedMask
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
Definition: VPlanTransforms.cpp:217

tryToMatchAndCreateMulAccumulateReduction
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
Definition: VPlanTransforms.cpp:3151

expandVPWidenIntOrFpInduction
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
Definition: VPlanTransforms.cpp:2770

findHeaderMask
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
Definition: VPlanTransforms.cpp:2085

optimizeMaskToEVL
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
Definition: VPlanTransforms.cpp:2170

removeRedundantCanonicalIVs
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
Definition: VPlanTransforms.cpp:498

canNarrowLoad
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
Definition: VPlanTransforms.cpp:3561

narrowToSingleScalarRecipes
static void narrowToSingleScalarRecipes(VPlan &Plan)
Definition: VPlanTransforms.cpp:1261

VPlanTransforms.h
This file provides utility VPlan to VPlan transformations.

VPlanUtils.h

VPlanVerifier.h
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...

VPlan.h
This file contains the declarations of the Vectorization Plan base classes:

VectorUtils.h

RHS
Value * RHS
Definition: X86PartialReduction.cpp:74

LHS
Value * LHS
Definition: X86PartialReduction.cpp:73

IV
static const uint32_t IV[8]
Definition: blake3_impl.h:83

llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:78

llvm::APInt::getActiveBits
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1512

llvm::APInt::Rounding::UP
@ UP

llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41

llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:62

llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1510

llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703

llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:699

llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704

llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791

llvm::CmpPredicate
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23

llvm::ConstantInt::getSigned
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:131

llvm::Constant::getAllOnesValue
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420

llvm::Constant::getNullValue
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373

llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition: DWARFExpression.h:33

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63

llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:124

llvm::DebugLoc::getCompilerGenerated
static DebugLoc getCompilerGenerated()
Definition: DebugLoc.h:163

llvm::DenseMapBase::lookup
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:203

llvm::DenseMapBase::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245

llvm::DenseMap
Definition: DenseMap.h:730

llvm::DominatorTreeBase::dominates
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Definition: GenericDomTree.h:467

llvm::DominatorTreeBase::recalculate
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Definition: GenericDomTree.h:859

llvm::ElementCount
Definition: TypeSize.h:301

llvm::FPMathOperator
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition: Operator.h:200

llvm::GEPNoWrapFlags
Represents flags for the getelementptr instruction/expression.
Definition: GEPNoWrapFlags.h:26

llvm::GEPNoWrapFlags::withoutNoUnsignedWrap
GEPNoWrapFlags withoutNoUnsignedWrap() const
Definition: GEPNoWrapFlags.h:73

llvm::GEPNoWrapFlags::none
static GEPNoWrapFlags none()
Definition: GEPNoWrapFlags.h:46

llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:949

llvm::InductionDescriptor
A struct for saving information about induction variables.
Definition: IVDescriptors.h:380

llvm::InductionDescriptor::InductionKind
InductionKind
This enum represents the kinds of inductions that we support.
Definition: IVDescriptors.h:383

llvm::InductionDescriptor::IK_PtrInduction
@ IK_PtrInduction
Pointer induction var. Step = C.
Definition: IVDescriptors.h:386

llvm::InductionDescriptor::IK_IntInduction
@ IK_IntInduction
Integer induction variable. Step = C.
Definition: IVDescriptors.h:385

llvm::Init
Definition: Record.h:286

llvm::InstSimplifyFolder
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
Definition: InstSimplifyFolder.h:36

llvm::InstSimplifyFolder::FoldGEP
Value * FoldGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, GEPNoWrapFlags NW) const override
Definition: InstSimplifyFolder.h:81

llvm::InstSimplifyFolder::FoldBinOp
Value * FoldBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS) const override
Definition: InstSimplifyFolder.h:52

llvm::InstSimplifyFolder::FoldSelect
Value * FoldSelect(Value *C, Value *True, Value *False) const override
Definition: InstSimplifyFolder.h:86

llvm::InstSimplifyFolder::FoldCast
Value * FoldCast(Instruction::CastOps Op, Value *V, Type *DestTy) const override
Definition: InstSimplifyFolder.h:117

llvm::InstSimplifyFolder::FoldCmp
Value * FoldCmp(CmpInst::Predicate P, Value *LHS, Value *RHS) const override
Definition: InstSimplifyFolder.h:77

llvm::InstructionCost
Definition: InstructionCost.h:30

llvm::InstructionCost::isValid
bool isValid() const
Definition: InstructionCost.h:80

llvm::Instruction
Definition: Instruction.h:69

llvm::Instruction::isCast
bool isCast() const
Definition: Instruction.h:321

llvm::Instruction::isBinaryOp
bool isBinaryOp() const
Definition: Instruction.h:317

llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:998

llvm::Instruction::getDataLayout
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:86

llvm::Instruction::CastOps
CastOps
Definition: Instruction.h:1012

llvm::IntegerType::get
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319

llvm::InterleaveGroup
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:524

llvm::InterleaveGroup::getMember
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:594

llvm::InterleaveGroup::getNumMembers
uint32_t getNumMembers() const
Definition: VectorUtils.h:542

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:180

llvm::LoopVectorizationPlanner::getDecisionAndClampRange
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1567

llvm::MDBuilder
Definition: MDBuilder.h:37

llvm::MDBuilder::createBranchWeights
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition: MDBuilder.cpp:38

llvm::MDNode
Metadata node.
Definition: Metadata.h:1077

llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36

llvm::MapVector::lookup
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:99

llvm::PHINode
Definition: Instructions.h:2638

llvm::PredicatedScalarEvolution
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
Definition: ScalarEvolution.h:2423

llvm::PredicatedScalarEvolution::getSE
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
Definition: ScalarEvolution.h:2464

llvm::RecurrenceDescriptor::getOpcode
unsigned getOpcode() const
Definition: IVDescriptors.h:227

llvm::RegionBase::getParent
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:362

llvm::Region
Definition: RegionInfo.h:887

llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:299

llvm::SCEVExpander
This class uses information about analyze scalars to rewrite expressions in canonical form.
Definition: ScalarEvolutionExpander.h:64

llvm::SCEVExpander::expandCodeFor
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
Definition: ScalarEvolutionExpander.cpp:1490

llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:72

llvm::SCEV::isZero
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
Definition: ScalarEvolution.cpp:445

llvm::SCEV::getType
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:383

llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:448

llvm::ScalarEvolution::getDataLayout
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
Definition: ScalarEvolution.h:1306

llvm::ScalarEvolution::getNegativeSCEV
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Definition: ScalarEvolution.cpp:4594

llvm::ScalarEvolution::getSCEV
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4572

llvm::ScalarEvolution::getUDivExpr
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:3426

llvm::ScalarEvolution::getElementCount
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Definition: ScalarEvolution.cpp:503

llvm::ScalarEvolution::getMulExpr
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:3117

llvm::ScalarEvolution::isKnownPredicate
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
Definition: ScalarEvolution.cpp:11131

llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1689

llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:59

llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104

llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168

llvm::SmallPtrSetImplBase::size
size_type size() const
Definition: SmallPtrSet.h:99

llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition: SmallPtrSet.h:98

llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401

llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition: SmallPtrSet.h:494

llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541

llvm::SmallVectorBase::empty
bool empty() const
Definition: SmallVector.h:82

llvm::SmallVectorBase::size
size_t size() const
Definition: SmallVector.h:79

llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition: SmallVector.h:674

llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition: SmallVector.h:414

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197

llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:296

llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:285

llvm::TargetTransformInfo::getExtendedReductionCost
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
Definition: TargetTransformInfo.cpp:1278

llvm::TargetTransformInfo::getMulAccReductionCost
LLVM_ABI InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
Definition: TargetTransformInfo.cpp:1285

llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:271

llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:272

llvm::Twine
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82

llvm::TypeSwitch
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87

llvm::TypeSwitch::Case
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45

llvm::Type::getInt8Ty
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)

llvm::Type::isPointerTy
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)

llvm::Type::isStructTy
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:261

llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.

llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240

llvm::User
Definition: User.h:44

llvm::User::operands
op_range operands()
Definition: User.h:292

llvm::VPActiveLaneMaskPHIRecipe
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition: VPlan.h:3352

llvm::VPBasicBlock
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3639

llvm::VPBasicBlock::appendRecipe
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3714

llvm::VPBasicBlock::iterator
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3666

llvm::VPBasicBlock::end
iterator end()
Definition: VPlan.h:3676

llvm::VPBasicBlock::begin
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3674

llvm::VPBasicBlock::phis
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3727

llvm::VPBasicBlock::getFirstNonPhi
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:236

llvm::VPBasicBlock::getEnclosingLoopRegion
VPRegionBlock * getEnclosingLoopRegion()
Definition: VPlan.cpp:622

llvm::VPBasicBlock::splitAt
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition: VPlan.cpp:594

llvm::VPBasicBlock::getTerminator
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition: VPlan.cpp:667

llvm::VPBasicBlock::back
const VPRecipeBase & back() const
Definition: VPlan.h:3688

llvm::VPBlendRecipe
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2374

llvm::VPBlockBase
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:81

llvm::VPBlockBase::getParent
VPRegionBlock * getParent()
Definition: VPlan.h:173

llvm::VPBlockBase::getExitingBasicBlock
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:180

llvm::VPBlockBase::swapSuccessors
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:319

llvm::VPBlockBase::getNumPredecessors
size_t getNumPredecessors() const
Definition: VPlan.h:220

llvm::VPBlockBase::getPredecessors
const VPBlocksTy & getPredecessors() const
Definition: VPlan.h:204

llvm::VPBlockBase::getPlan
VPlan * getPlan()
Definition: VPlan.cpp:155

llvm::VPBlockBase::getSinglePredecessor
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:215

llvm::VPBlockBase::getEntryBasicBlock
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:160

llvm::VPBlockBase::getSingleHierarchicalPredecessor
VPBlockBase * getSingleHierarchicalPredecessor()
Definition: VPlan.h:261

llvm::VPBlockBase::getSingleSuccessor
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:209

llvm::VPBlockBase::getSuccessors
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:198

llvm::VPBlockUtils::insertOnEdge
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlanUtils.h:237

llvm::VPBlockUtils::insertTwoBlocksAfter
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition: VPlanUtils.h:156

llvm::VPBlockUtils::connectBlocks
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:175

llvm::VPBlockUtils::disconnectBlocks
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition: VPlanUtils.h:194

llvm::VPBranchOnMaskRecipe
A recipe for generating conditional branches on the bits of a mask.
Definition: VPlan.h:2809

llvm::VPBuilder::InsertPointGuard
RAII object that stores the current insertion point and restores it when the object is destroyed.
Definition: LoopVectorizationPlanner.h:347

llvm::VPBuilder
VPlan-based builder utility analogous to IRBuilder.
Definition: LoopVectorizationPlanner.h:54

llvm::VPBuilder::createScalarZExtOrTrunc
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
Definition: LoopVectorizationPlanner.h:312

llvm::VPBuilder::createElementCount
VPValue * createElementCount(Type *Ty, ElementCount EC)
Definition: LoopVectorizationPlanner.h:281

llvm::VPBuilder::createNot
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition: LoopVectorizationPlanner.h:190

llvm::VPBuilder::createDerivedIV
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
Definition: LoopVectorizationPlanner.h:298

llvm::VPBuilder::createSelect
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
Definition: LoopVectorizationPlanner.h:219

llvm::VPBuilder::createScalarIVSteps
VPScalarIVStepsRecipe * createScalarIVSteps(Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step, VPValue *VF, DebugLoc DL)
Definition: LoopVectorizationPlanner.h:329

llvm::VPBuilder::createPtrAdd
VPInstruction * createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition: LoopVectorizationPlanner.h:252

llvm::VPBuilder::createOverflowingOp
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition: LoopVectorizationPlanner.h:181

llvm::VPBuilder::getToInsertAfter
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
Definition: LoopVectorizationPlanner.h:90

llvm::VPBuilder::createScalarPhi
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
Definition: LoopVectorizationPlanner.h:276

llvm::VPBuilder::createWidenCast
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
Definition: LoopVectorizationPlanner.h:323

llvm::VPBuilder::createICmp
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
Definition: LoopVectorizationPlanner.h:232

llvm::VPBuilder::createNaryOp
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Definition: LoopVectorizationPlanner.h:151

llvm::VPBuilder::createScalarCast
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
Definition: LoopVectorizationPlanner.h:306

llvm::VPBuilder::setInsertPoint
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Definition: LoopVectorizationPlanner.h:126

llvm::VPCanonicalIVPHIRecipe
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:3295

llvm::VPCanonicalIVPHIRecipe::getScalarType
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:3322

llvm::VPDef::getNumDefinedValues
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition: VPlanValue.h:421

llvm::VPDef::definedValues
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:416

llvm::VPDef::getVPSingleValue
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:394

llvm::VPDef::getVPValue
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:406

llvm::VPDominatorTree
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
Definition: VPlanDominatorTree.h:38

llvm::VPDominatorTree::properlyDominates
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
Definition: VPlanAnalysis.cpp:357

llvm::VPEVLBasedIVPHIRecipe
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition: VPlan.h:3383

llvm::VPExpressionRecipe
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition: VPlan.h:2854

llvm::VPHeaderPHIRecipe::getBackedgeValue
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1999

llvm::VPHeaderPHIRecipe::getStartValue
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1988

llvm::VPIRBasicBlock
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3792

llvm::VPIRFlags
Class to record and manage LLVM IR flags.
Definition: VPlan.h:596

llvm::VPIRInstruction::create
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Definition: VPlanRecipes.cpp:1470

llvm::VPIRMetadata
Helper to manage IR metadata for recipes.
Definition: VPlan.h:926

llvm::VPIRMetadata::intersect
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
Definition: VPlanRecipes.cpp:1593

llvm::VPInstruction
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:967

llvm::VPInstruction::BranchOnCond
@ BranchOnCond
Definition: VPlan.h:985

llvm::VPInstruction::PtrAdd
@ PtrAdd
Definition: VPlan.h:1011

llvm::VPInstruction::ExtractLane
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition: VPlan.h:1040

llvm::VPInstruction::Broadcast
@ Broadcast
Definition: VPlan.h:986

llvm::VPInstruction::BranchOnCount
@ BranchOnCount
Definition: VPlan.h:984

llvm::VPInstruction::ActiveLaneMask
@ ActiveLaneMask
Definition: VPlan.h:979

llvm::VPInstruction::FirstActiveLane
@ FirstActiveLane
Definition: VPlan.h:1024

llvm::VPInstruction::FirstOrderRecurrenceSplice
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:973

llvm::VPInstruction::ExplicitVectorLength
@ ExplicitVectorLength
Definition: VPlan.h:980

llvm::VPInstruction::BuildVector
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition: VPlan.h:993

llvm::VPInstruction::WidePtrAdd
@ WidePtrAdd
Definition: VPlan.h:1014

llvm::VPInstruction::LogicalAnd
@ LogicalAnd
Definition: VPlan.h:1007

llvm::VPInstruction::BuildStructVector
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition: VPlan.h:990

llvm::VPInstruction::CanonicalIVIncrementForPart
@ CanonicalIVIncrementForPart
Definition: VPlan.h:983

llvm::VPInstruction::Not
@ Not
Definition: VPlan.h:976

llvm::VPInstruction::CalculateTripCountMinusVF
@ CalculateTripCountMinusVF
Definition: VPlan.h:981

llvm::VPInstruction::StepVector
@ StepVector
Definition: VPlan.h:1036

llvm::VPInstruction::AnyOf
@ AnyOf
Definition: VPlan.h:1020

llvm::VPInstruction::isVectorToScalar
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
Definition: VPlanRecipes.cpp:1126

llvm::VPInterleaveRecipe
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2443

llvm::VPInterleaveRecipe::getStoredValues
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2500

llvm::VPInterleaveRecipe::getInterleaveGroup
const InterleaveGroup< Instruction > * getInterleaveGroup()
Definition: VPlan.h:2520

llvm::VPPredInstPHIRecipe
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition: VPlan.h:2966

llvm::VPRecipeBase
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:391

llvm::VPRecipeBase::mayReadFromMemory
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
Definition: VPlanRecipes.cpp:101

llvm::VPRecipeBase::mayReadOrWriteMemory
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition: VPlan.h:474

llvm::VPRecipeBase::mayHaveSideEffects
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
Definition: VPlanRecipes.cpp:149

llvm::VPRecipeBase::getParent
VPBasicBlock * getParent()
Definition: VPlan.h:412

llvm::VPRecipeBase::getDebugLoc
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:479

llvm::VPRecipeBase::moveBefore
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
Definition: VPlanRecipes.cpp:244

llvm::VPRecipeBase::insertBefore
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
Definition: VPlanRecipes.cpp:207

llvm::VPRecipeBase::insertAfter
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
Definition: VPlanRecipes.cpp:221

llvm::VPRecipeBase::eraseFromParent
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: VPlanRecipes.cpp:234

llvm::VPRecipeBuilder
Helper class to create VPRecipies from IR instructions.
Definition: VPRecipeBuilder.h:47

llvm::VPRecipeBuilder::getRecipe
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
Definition: VPRecipeBuilder.h:192

llvm::VPReductionEVLRecipe
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition: VPlan.h:2687

llvm::VPReductionRecipe
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2541

llvm::VPRegionBlock
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3827

llvm::VPRegionBlock::getEntry
const VPBlockBase * getEntry() const
Definition: VPlan.h:3863

llvm::VPRegionBlock::setExiting
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition: VPlan.h:3880

llvm::VPRegionBlock::getExiting
const VPBlockBase * getExiting() const
Definition: VPlan.h:3875

llvm::VPRegionBlock::getPreheaderVPBB
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition: VPlan.h:3888

llvm::VPReplicateRecipe
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2731

llvm::VPReplicateRecipe::isSingleScalar
bool isSingleScalar() const
Definition: VPlan.h:2776

llvm::VPReplicateRecipe::getMask
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition: VPlan.h:2800

llvm::VPScalarIVStepsRecipe
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition: VPlan.h:3529

llvm::VPSingleDefRecipe
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:518

llvm::VPSingleDefRecipe::getUnderlyingInstr
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:582

llvm::VPSingleDefRecipe::clone
virtual VPSingleDefRecipe * clone() override=0
Clone the current recipe.

llvm::VPTypeAnalysis
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:43

llvm::VPTypeAnalysis::getContext
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Definition: VPlanAnalysis.h:67

llvm::VPTypeAnalysis::inferScalarType
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Definition: VPlanAnalysis.cpp:260

llvm::VPUser
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:197

llvm::VPUser::operands
operand_range operands()
Definition: VPlanValue.h:265

llvm::VPUser::setOperand
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:241

llvm::VPUser::op_end
operand_iterator op_end()
Definition: VPlanValue.h:263

llvm::VPUser::op_begin
operand_iterator op_begin()
Definition: VPlanValue.h:261

llvm::VPUser::getOperand
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:236

llvm::VPUser::addOperand
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:230

llvm::VPValue
Definition: VPlanValue.h:48

llvm::VPValue::getDefiningRecipe
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:125

llvm::VPValue::getLiveInIRValue
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:174

llvm::VPValue::getUnderlyingValue
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:85

llvm::VPValue::setUnderlyingValue
void setUnderlyingValue(Value *Val)
Definition: VPlanValue.h:184

llvm::VPValue::replaceAllUsesWith
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1403

llvm::VPValue::getNumUsers
unsigned getNumUsers() const
Definition: VPlanValue.h:113

llvm::VPValue::isLiveIn
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:169

llvm::VPValue::replaceUsesWithIf
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1407

llvm::VPValue::users
user_range users()
Definition: VPlanValue.h:134

llvm::VPVectorEndPointerRecipe
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition: VPlan.h:1817

llvm::VPVectorEndPointerRecipe::clone
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition: VPlan.h:1861

llvm::VPWidenCanonicalIVRecipe
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3424

llvm::VPWidenCastRecipe
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1467

llvm::VPWidenGEPRecipe
A recipe for handling GEP instructions.
Definition: VPlan.h:1753

llvm::VPWidenInductionRecipe
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:2013

llvm::VPWidenInductionRecipe::getVFValue
VPValue * getVFValue()
Definition: VPlan.h:2047

llvm::VPWidenInductionRecipe::getPHINode
PHINode * getPHINode() const
Definition: VPlan.h:2055

llvm::VPWidenInductionRecipe::getStepValue
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:2041

llvm::VPWidenInductionRecipe::getInductionDescriptor
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:2058

llvm::VPWidenIntOrFpInductionRecipe
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:2088

llvm::VPWidenIntOrFpInductionRecipe::getLastUnrolledPartOperand
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition: VPlan.h:2169

llvm::VPWidenIntOrFpInductionRecipe::getSplatVFValue
VPValue * getSplatVFValue()
Definition: VPlan.h:2139

llvm::VPWidenIntrinsicRecipe
A recipe for widening vector intrinsics.
Definition: VPlan.h:1524

llvm::VPWidenMemoryRecipe
A common base class for widening memory operations.
Definition: VPlan.h:3008

llvm::VPWidenMemoryRecipe::getMask
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:3070

llvm::VPWidenMemoryRecipe::getAddr
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:3063

llvm::VPWidenPHIRecipe
A recipe for widened phis.
Definition: VPlan.h:2224

llvm::VPWidenPointerInductionRecipe
Definition: VPlan.h:2174

llvm::VPWidenRecipe
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1424

llvm::VPlan
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition: VPlan.h:3930

llvm::VPlan::hasVF
bool hasVF(ElementCount VF) const
Definition: VPlan.h:4139

llvm::VPlan::getContext
LLVMContext & getContext() const
Definition: VPlan.h:4127

llvm::VPlan::getEntry
VPBasicBlock * getEntry()
Definition: VPlan.h:4029

llvm::VPlan::createVPRegionBlock
VPRegionBlock * createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="", bool IsReplicator=false)
Create a new VPRegionBlock with Entry, Exiting and Name.
Definition: VPlan.h:4270

llvm::VPlan::getVectorTripCount
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:4119

llvm::VPlan::hasScalableVF
bool hasScalableVF() const
Definition: VPlan.h:4140

llvm::VPlan::getVFxUF
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:4125

llvm::VPlan::getVF
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:4122

llvm::VPlan::getTripCount
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:4091

llvm::VPlan::getTrue
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition: VPlan.h:4196

llvm::VPlan::getOrCreateBackedgeTakenCount
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:4112

llvm::VPlan::getUF
unsigned getUF() const
Definition: VPlan.h:4159

llvm::VPlan::hasUF
bool hasUF(unsigned UF) const
Definition: VPlan.h:4157

llvm::VPlan::getExitBlocks
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition: VPlan.h:4081

llvm::VPlan::setVF
void setVF(ElementCount VF)
Definition: VPlan.h:4133

llvm::VPlan::isUnrolled
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition: VPlan.h:4172

llvm::VPlan::getVectorLoopRegion
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1037

llvm::VPlan::resetTripCount
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:4105

llvm::VPlan::getMiddleBlock
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:4054

llvm::VPlan::createVPBasicBlock
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition: VPlan.h:4260

llvm::VPlan::getOrAddLiveIn
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:4181

llvm::VPlan::hasScalarVFOnly
bool hasScalarVFOnly() const
Definition: VPlan.h:4150

llvm::VPlan::getScalarPreheader
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:4072

llvm::VPlan::getLiveIns
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition: VPlan.h:4211

llvm::VPlan::getCanonicalIV
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:4235

llvm::VPlan::getScalarHeader
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:4077

llvm::VPlan::getVectorPreheader
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition: VPlan.h:4034

llvm::VPlan::setUF
void setUF(unsigned UF)
Definition: VPlan.h:4164

llvm::VPlan::hasScalarTail
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition: VPlan.h:4313

llvm::Value
LLVM Value Representation.
Definition: Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256

llvm::Value::setName
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:390

llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:426

llvm::Value::hasName
bool hasName() const
Definition: Value.h:262

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322

llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203

llvm::details::FixedOrScalableQuantity::isScalable
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172

llvm::details::FixedOrScalableQuantity::multiplyCoefficientBy
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:259

llvm::details::FixedOrScalableQuantity::isFixed
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:175

llvm::details::FixedOrScalableQuantity::getKnownMinValue
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169

llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:37

llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition: ilist_node.h:34

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:134

llvm::iterator_range::end
IteratorT end() const
Definition: iterator_range.h:65

uint64_t

unsigned

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:164

llvm::APIntOps::RoundingUDiv
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition: APInt.cpp:2763

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34

llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition: Intrinsics.h:46

llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
Definition: MIPatternMatch.h:277

llvm::MIPatternMatch::m_Not
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
Definition: MIPatternMatch.h:936

llvm::PatternMatch::m_AllOnes
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524

llvm::PatternMatch::m_Trunc
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
Definition: PatternMatch.h:2211

llvm::PatternMatch::m_SpecificInt
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:1060

llvm::PatternMatch::m_ZExtOrSelf
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
Definition: PatternMatch.h:2254

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49

llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962

llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition: PatternMatch.h:1928

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1248

llvm::PatternMatch::m_Deferred
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:980

llvm::PatternMatch::m_SpecificICmp
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1817

llvm::PatternMatch::m_Cmp
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105

llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2944

llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1754

llvm::PatternMatch::m_ZExtOrSExt
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
Definition: PatternMatch.h:2274

llvm::PatternMatch::m_LogicalAnd
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
Definition: PatternMatch.h:3214

llvm::PatternMatch::m_SExt
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
Definition: PatternMatch.h:2237

llvm::PatternMatch::m_c_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
Definition: PatternMatch.h:2951

llvm::PatternMatch::m_Sub
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1194

llvm::VPlanPatternMatch::m_c_BinaryAnd
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
Definition: VPlanPatternMatch.h:413

llvm::VPlanPatternMatch::m_BinaryOr
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
Definition: VPlanPatternMatch.h:423

llvm::VPlanPatternMatch::m_c_BinaryOr
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Definition: VPlanPatternMatch.h:429

llvm::VPlanPatternMatch::m_ActiveLaneMask
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1)
Definition: VPlanPatternMatch.h:331

llvm::VPlanPatternMatch::m_GetElementPtr
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
Definition: VPlanPatternMatch.h:533

llvm::VPlanPatternMatch::m_ExtractLastElement
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
Definition: VPlanPatternMatch.h:325

llvm::VPlanPatternMatch::m_False
specific_intval< 1 > m_False()
Definition: VPlanPatternMatch.h:130

llvm::VPlanPatternMatch::m_DerivedIV
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
Definition: VPlanPatternMatch.h:586

llvm::VPlanPatternMatch::m_BranchOnCount
VPInstruction_match< VPInstruction::BranchOnCount, Op0_t, Op1_t > m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1)
Definition: VPlanPatternMatch.h:337

llvm::VPlanPatternMatch::m_True
specific_intval< 1 > m_True()
Definition: VPlanPatternMatch.h:134

llvm::VPlanPatternMatch::m_Broadcast
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
Definition: VPlanPatternMatch.h:313

llvm::VPlanPatternMatch::m_VPValue
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
Definition: VPlanPatternMatch.h:37

llvm::VPlanPatternMatch::m_EVL
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
Definition: VPlanPatternMatch.h:319

llvm::VPlanPatternMatch::m_BuildVector
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
Definition: VPlanPatternMatch.h:295

llvm::VPlanPatternMatch::m_BranchOnCond
VPInstruction_match< VPInstruction::BranchOnCond, Op0_t > m_BranchOnCond(const Op0_t &Op0)
Definition: VPlanPatternMatch.h:307

llvm::VPlanPatternMatch::m_VPInstruction
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
Definition: VPlanPatternMatch.h:197

llvm::vputils::isSingleScalar
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
Definition: VPlanUtils.h:44

llvm::vputils::getOrCreateVPValueForSCEVExpr
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:32

llvm::vputils::getSCEVExprForVPValue
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:79

llvm::vputils::onlyFirstLaneUsed
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
Definition: VPlanUtils.cpp:17

llvm::vputils::isHeaderMask
bool isHeaderMask(const VPValue *V, VPlan &Plan)
Return true if V is a header mask in Plan.
Definition: VPlanUtils.cpp:55

llvm::vputils::onlyScalarValuesUsed
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
Definition: VPlanUtils.cpp:27

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338

llvm::Offset
@ Offset
Definition: DWP.cpp:477

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744

llvm::getVectorIntrinsicIDForCall
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
Definition: VectorUtils.cpp:235

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491

llvm::getLoadStorePointerOperand
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Definition: Instructions.h:5088

llvm::from_range
constexpr from_range_t from_range
Definition: STLForwardCompat.h:79

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:77

llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2155

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663

llvm::vp_depth_first_shallow
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:216

llvm::vp_depth_first_deep
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:243

llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751

llvm::reverse
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428

llvm::vp_post_order_deep
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition: VPlanCFG.h:236

llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669

llvm::VPlanPtr
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:77

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758

llvm::to_vector
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Definition: SmallVector.h:1300

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548

llvm::drop_end
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:345

llvm::RecurKind
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34

llvm::RecurKind::Mul
@ Mul
Product of integers.

llvm::RecurKind::Sub
@ Sub
Subtraction of integers.

llvm::RecurKind::Add
@ Add
Sum of integers.

llvm::RecurKind::AddChainWithSubs
@ AddChainWithSubs
A chain of adds and subs.

llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973

llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980

llvm::SplitBlock
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
Definition: BasicBlockUtils.cpp:920

llvm::find_if
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777

llvm::getLoadStoreType
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Definition: Instructions.h:5143

llvm::TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...

llvm::toVectorTy
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorTypeUtils.h:20

llvm::InstructionUniformity::Default
@ Default
The result values are uniform if and only if all operands are uniform.

llvm::IsaPred
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition: Casting.h:836

N
#define N

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39

llvm::Incoming
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
Definition: SILowerI1Copies.h:26

llvm::VFRange
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlanHelpers.h:71

llvm::VPCostContext
Struct to hold various analysis needed for cost computations.
Definition: VPlanHelpers.h:344

llvm::VPCostContext::Types
VPTypeAnalysis Types
Definition: VPlanHelpers.h:347

llvm::VPCostContext::TTI
const TargetTransformInfo & TTI
Definition: VPlanHelpers.h:345

llvm::VPFirstOrderRecurrencePHIRecipe
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2267

llvm::VPPhi
Definition: VPlan.h:1289

llvm::VPWidenLoadEVLRecipe
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition: VPlan.h:3130

llvm::VPWidenLoadRecipe
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:3090

llvm::VPWidenSelectRecipe
A recipe for widening select instructions.
Definition: VPlan.h:1707

llvm::VPWidenStoreEVLRecipe
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition: VPlan.h:3212

llvm::VPWidenStoreRecipe
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:3170

llvm::VPlanTransforms::materializeBroadcasts
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
Definition: VPlanTransforms.cpp:3279

llvm::VPlanTransforms::materializeBackedgeTakenCount
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
Definition: VPlanTransforms.cpp:3355

llvm::VPlanTransforms::optimizeInductionExitUsers
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
Definition: VPlanTransforms.cpp:877

llvm::VPlanTransforms::canonicalizeEVLLoops
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
Definition: VPlanTransforms.cpp:2464

llvm::VPlanTransforms::dropPoisonGeneratingRecipes
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
Definition: VPlanTransforms.cpp:2545

llvm::VPlanTransforms::createAndOptimizeReplicateRegions
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
Definition: VPlanTransforms.cpp:447

llvm::VPlanTransforms::createInterleaveGroups
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
Definition: VPlanTransforms.cpp:2642

llvm::VPlanTransforms::runPass
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
Definition: VPlanTransforms.h:40

llvm::VPlanTransforms::addBranchWeightToMiddleTerminator
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
Definition: VPlanTransforms.cpp:3782

llvm::VPlanTransforms::materializeBuildVectors
static void materializeBuildVectors(VPlan &Plan)
Add explicit Build[Struct]Vector recipes that combine multiple scalar values into single vectors.
Definition: VPlanTransforms.cpp:3370

llvm::VPlanTransforms::expandSCEVs
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
Definition: VPlanTransforms.cpp:3512

llvm::VPlanTransforms::convertToConcreteRecipes
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
Definition: VPlanTransforms.cpp:2936

llvm::VPlanTransforms::convertToAbstractRecipes
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
Definition: VPlanTransforms.cpp:3268

llvm::VPlanTransforms::materializeConstantVectorTripCount
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Definition: VPlanTransforms.cpp:3325

llvm::VPlanTransforms::addExplicitVectorLength
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
Definition: VPlanTransforms.cpp:2405

llvm::VPlanTransforms::removeBranchOnConst
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
Definition: VPlanTransforms.cpp:1926

llvm::VPlanTransforms::removeDeadRecipes
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
Definition: VPlanTransforms.cpp:548

llvm::VPlanTransforms::materializeVectorTripCount
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
Definition: VPlanTransforms.cpp:3416

llvm::VPlanTransforms::simplifyRecipes
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
Definition: VPlanTransforms.cpp:1250

llvm::VPlanTransforms::tryToConvertVPInstructionsToVPRecipes
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
Definition: VPlanTransforms.cpp:42

llvm::VPlanTransforms::handleUncountableEarlyExit
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
Definition: VPlanTransforms.cpp:3016

llvm::VPlanTransforms::clearReductionWrapFlags
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
Definition: VPlanTransforms.cpp:1784

llvm::VPlanTransforms::addActiveLaneMask
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
Definition: VPlanTransforms.cpp:2127

llvm::VPlanTransforms::optimize
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
Definition: VPlanTransforms.cpp:1958

llvm::VPlanTransforms::dissolveLoopRegions
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
Definition: VPlanTransforms.cpp:2924

llvm::VPlanTransforms::narrowInterleaveGroups
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
Definition: VPlanTransforms.cpp:3615

llvm::VPlanTransforms::truncateToMinimalBitwidths
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
Definition: VPlanTransforms.cpp:1836

llvm::VPlanTransforms::adjustFixedOrderRecurrences
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
Definition: VPlanTransforms.cpp:1736

llvm::VPlanTransforms::optimizeForVFAndUF
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
Definition: VPlanTransforms.cpp:1562

llvm::VPlanTransforms::materializeVFAndVFxUF
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
Definition: VPlanTransforms.cpp:3477