SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
127 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
150 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when it is probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead; but unlike
186// the similar limit for operand ordering, this one is used less frequently, so
187// the impact of a higher value is less noticeable.
188static cl::opt<int> RootLookAheadMaxDepth(
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 ViewSLPTree("view-slp-tree", cl::Hidden,
203 cl::desc("Display the SLP trees with Graphviz"));
204
206 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
207 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
208
209/// Enables vectorization of copyable elements.
211 "slp-copyable-elements", cl::init(true), cl::Hidden,
212 cl::desc("Try to replace values with the idempotent instructions for "
213 "better vectorization."));
214
215// Limit the number of alias checks. The limit is chosen so that
216// it has no negative effect on the llvm benchmarks.
217static const unsigned AliasedCheckLimit = 10;
218
219// Limit of the number of uses for potentially transformed instructions/values,
220// used in checks to avoid compile-time explosion.
221static constexpr int UsesLimit = 64;
222
223// Another limit for the alias checks: The maximum distance between load/store
224// instructions where alias checks are done.
225// This limit is useful for very large basic blocks.
226static const unsigned MaxMemDepDistance = 160;
227
228/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
229/// regions to be handled.
230static const int MinScheduleRegionSize = 16;
231
232/// Maximum allowed number of operands in the PHI nodes.
233static const unsigned MaxPHINumOperands = 128;
234
235/// Predicate for the element types that the SLP vectorizer supports.
236///
237/// The most important things to filter here are types which are invalid in LLVM
238/// vectors. We also filter target-specific types which have absolutely no
239/// meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just
240/// avoids spending time checking the cost model and realizing that they will
241/// be inevitably scalarized.
242static bool isValidElementType(Type *Ty) {
243 // TODO: Support ScalableVectorType.
244 if (SLPReVec && isa<FixedVectorType>(Ty))
245 Ty = Ty->getScalarType();
246 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
247 !Ty->isPPC_FP128Ty();
248}
249
250/// Returns the type of the given value/instruction \p V. If it is a store,
251/// returns the type of its value operand; for Cmp - the type of the compare
252/// operands; and for insertelement - the type of the inserted operand.
253/// Otherwise, just the type of the value is returned.
254static Type *getValueType(Value *V) {
255 if (auto *SI = dyn_cast<StoreInst>(V))
256 return SI->getValueOperand()->getType();
257 if (auto *CI = dyn_cast<CmpInst>(V))
258 return CI->getOperand(0)->getType();
259 if (auto *IE = dyn_cast<InsertElementInst>(V))
260 return IE->getOperand(1)->getType();
261 return V->getType();
262}
263
264/// \returns the number of elements for Ty.
265static unsigned getNumElements(Type *Ty) {
266 assert(!isa<ScalableVectorType>(Ty) &&
267 "ScalableVectorType is not supported.");
268 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
269 return VecTy->getNumElements();
270 return 1;
271}
272
273/// \returns the vector type of ScalarTy based on vectorization factor.
274static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
275 return FixedVectorType::get(ScalarTy->getScalarType(),
276 VF * getNumElements(ScalarTy));
277}
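// Illustration: for scalar SLP, getWidenedType(i32, 4) is <4 x i32>; under
// REVEC, where ScalarTy may itself be a fixed vector, getWidenedType(<2 x i16>, 4)
// is <8 x i16> (VF * 2 elements of the scalar type i16).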
278
279/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
280/// which forms a type that \p TTI splits into whole vector types during
281/// legalization.
282static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
283 Type *Ty, unsigned Sz) {
284 if (!isValidElementType(Ty))
285 return bit_ceil(Sz);
286 // Find the number of elements, which forms full vectors.
287 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
288 if (NumParts == 0 || NumParts >= Sz)
289 return bit_ceil(Sz);
290 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
291}
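// Worked example (assuming a target where <6 x i32> is legalized into 2
// register-sized parts, i.e. TTI.getNumberOfParts returns 2): for Ty == i32 and
// Sz == 6, divideCeil(6, 2) == 3 and bit_ceil(3) == 4, so the result is
// 4 * 2 == 8 elements, i.e. two whole <4 x i32> registers.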
292
293/// Returns the number of elements of the given type \p Ty, not greater than \p
294/// Sz, which forms a type that \p TTI splits into whole vector types during
295/// legalization.
296static unsigned
297getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
298 unsigned Sz) {
299 if (!isValidElementType(Ty))
300 return bit_floor(Sz);
301 // Find the number of elements, which forms full vectors.
302 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
303 if (NumParts == 0 || NumParts >= Sz)
304 return bit_floor(Sz);
305 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
306 if (RegVF > Sz)
307 return bit_floor(Sz);
308 return (Sz / RegVF) * RegVF;
309}
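// Under the same assumption (2 parts for <6 x i32>), the floor variant computes
// RegVF == bit_ceil(divideCeil(6, 2)) == 4 <= 6 and returns (6 / 4) * 4 == 4,
// the largest element count not exceeding Sz that still fills whole registers.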
310
311static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
312 SmallVectorImpl<int> &Mask) {
313 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
314 // But the element has a different meaning for SLP (scalar) and REVEC
315 // (vector). We need to expand Mask into masks which shufflevector can use
316 // directly.
317 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
318 for (unsigned I : seq<unsigned>(Mask.size()))
319 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
320 I * VecTyNumElements, VecTyNumElements)))
321 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
322 : Mask[I] * VecTyNumElements + J;
323 Mask.swap(NewMask);
324}
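// Example: with VecTyNumElements == 2 and Mask == {1, 0}, every scalar index is
// expanded into a pair of vector-element indices, producing {2, 3, 0, 1}.
// PoisonMaskElem entries stay PoisonMaskElem for the whole expanded group.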
325
326/// \returns the number of groups of shufflevectors.
327/// A group has the following features:
328/// 1. All values in a group are shufflevectors.
329/// 2. The mask of each shufflevector is an extract-subvector mask.
330/// 3. Together, the masks in a group use all of the elements of the source.
331/// e.g., it is 1 group (%0)
332/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
333/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
334/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
335/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
336/// it is 2 groups (%3 and %4)
337/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
340/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
341/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
342/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
343/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
344/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
345/// it is 0 groups
346/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
347/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
348/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
351 if (VL.empty())
352 return 0;
353 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
354 return 0;
355 auto *SV = cast<ShuffleVectorInst>(VL.front());
356 unsigned SVNumElements =
357 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
358 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
359 if (SVNumElements % ShuffleMaskSize != 0)
360 return 0;
361 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
362 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
363 return 0;
364 unsigned NumGroup = 0;
365 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
366 auto *SV = cast<ShuffleVectorInst>(VL[I]);
367 Value *Src = SV->getOperand(0);
368 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
369 SmallBitVector ExpectedIndex(GroupSize);
370 if (!all_of(Group, [&](Value *V) {
371 auto *SV = cast<ShuffleVectorInst>(V);
372 // From the same source.
373 if (SV->getOperand(0) != Src)
374 return false;
375 int Index;
376 if (!SV->isExtractSubvectorMask(Index))
377 return false;
378 ExpectedIndex.set(Index / ShuffleMaskSize);
379 return true;
380 }))
381 return 0;
382 if (!ExpectedIndex.all())
383 return 0;
384 ++NumGroup;
385 }
386 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
387 return NumGroup;
388}
389
390/// \returns a shufflevector mask which is used to vectorize shufflevectors
391/// e.g.,
392/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
393/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
394/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
395/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
396/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
397/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
398/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
399/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
400/// the result is
401/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
402static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
403 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
404 auto *SV = cast<ShuffleVectorInst>(VL.front());
405 unsigned SVNumElements =
406 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
407 SmallVector<int> Mask;
408 unsigned AccumulateLength = 0;
409 for (Value *V : VL) {
410 auto *SV = cast<ShuffleVectorInst>(V);
411 for (int M : SV->getShuffleMask())
412 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
413 : AccumulateLength + M);
414 AccumulateLength += SVNumElements;
415 }
416 return Mask;
417}
418
419/// \returns True if the value is a constant (but not globals/constant
420/// expressions).
421static bool isConstant(Value *V) {
422 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
423}
424
425/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
426/// insertelement/extractelement with constant indices on a fixed vector type,
427/// or an extractvalue instruction.
428static bool isVectorLikeInstWithConstOps(Value *V) {
429 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
430 !isa<ExtractValueInst, UndefValue>(V))
431 return false;
432 auto *I = dyn_cast<Instruction>(V);
433 if (!I || isa<ExtractValueInst>(I))
434 return true;
435 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
436 return false;
437 if (isa<ExtractElementInst>(I))
438 return isConstant(I->getOperand(1));
439 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
440 return isConstant(I->getOperand(2));
441}
442
443/// Returns the power-of-2 number of elements in a single register (part), given
444/// the total number of elements \p Size and the number of registers (parts) \p
445/// NumParts.
446static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
447 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
448}
449
450/// Returns the correct remaining number of elements, considering the total
451/// amount \p Size, the (power-of-2) number of elements in a single register
452/// \p PartNumElems and the current register (part) \p Part.
453static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
454 unsigned Part) {
455 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
456}
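// Example: for Size == 10 and NumParts == 3, getPartNumElems returns
// min(10, bit_ceil(divideCeil(10, 3))) == 4, and getNumElems(10, 4, Part)
// yields 4, 4 and 2 elements for parts 0, 1 and 2 respectively.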
457
458#if !defined(NDEBUG)
459/// Print a short descriptor of the instruction bundle suitable for debug output.
460static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
461 std::string Result;
462 raw_string_ostream OS(Result);
463 if (Idx >= 0)
464 OS << "Idx: " << Idx << ", ";
465 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
466 return Result;
467}
468#endif
469
470/// \returns true if all of the instructions in \p VL are in the same block or
471/// false otherwise.
472static bool allSameBlock(ArrayRef<Value *> VL) {
473 auto *It = find_if(VL, IsaPred<Instruction>);
474 if (It == VL.end())
475 return false;
476 Instruction *I0 = cast<Instruction>(*It);
477 if (all_of(VL, isVectorLikeInstWithConstOps))
478 return true;
479
480 BasicBlock *BB = I0->getParent();
481 for (Value *V : iterator_range(It, VL.end())) {
482 if (isa<PoisonValue>(V))
483 continue;
484 auto *II = dyn_cast<Instruction>(V);
485 if (!II)
486 return false;
487
488 if (BB != II->getParent())
489 return false;
490 }
491 return true;
492}
493
494/// \returns True if all of the values in \p VL are constants (but not
495/// globals/constant expressions).
496static bool allConstant(ArrayRef<Value *> VL) {
497 // Constant expressions and globals can't be vectorized like normal integer/FP
498 // constants.
499 return all_of(VL, isConstant);
500}
501
502/// \returns True if all of the values in \p VL are identical or some of them
503/// are UndefValue.
504static bool isSplat(ArrayRef<Value *> VL) {
505 Value *FirstNonUndef = nullptr;
506 for (Value *V : VL) {
507 if (isa<UndefValue>(V))
508 continue;
509 if (!FirstNonUndef) {
510 FirstNonUndef = V;
511 continue;
512 }
513 if (V != FirstNonUndef)
514 return false;
515 }
516 return FirstNonUndef != nullptr;
517}
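// Example: isSplat({%a, undef, %a}) is true, while isSplat({undef, undef}) is
// false, because at least one non-undef value is required.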
518
519/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
520/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
521/// patterns that make it effectively commutative (like equality comparisons
522/// with zero).
523/// In most cases, users should not call this function directly (since \p I and
524/// \p ValWithUses are the same). However, when analyzing interchangeable
525/// instructions, we need to use the converted opcode along with the original
526/// uses.
527/// \param I The instruction to check for commutativity
528/// \param ValWithUses The value whose uses are analyzed for special
529/// patterns
530static bool isCommutative(Instruction *I, Value *ValWithUses) {
531 if (auto *Cmp = dyn_cast<CmpInst>(I))
532 return Cmp->isCommutative();
533 if (auto *BO = dyn_cast<BinaryOperator>(I))
534 return BO->isCommutative() ||
535 (BO->getOpcode() == Instruction::Sub &&
536 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
537 all_of(
538 ValWithUses->uses(),
539 [](const Use &U) {
540 // Commutative, if icmp eq/ne sub, 0
541 CmpPredicate Pred;
542 if (match(U.getUser(),
543 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
544 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
545 return true;
546 // Commutative, if abs(sub nsw, true) or abs(sub, false).
547 ConstantInt *Flag;
548 return match(U.getUser(),
549 m_Intrinsic<Intrinsic::abs>(
550 m_Specific(U.get()), m_ConstantInt(Flag))) &&
551 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
552 Flag->isOne());
553 })) ||
554 (BO->getOpcode() == Instruction::FSub &&
555 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
556 all_of(ValWithUses->uses(), [](const Use &U) {
557 return match(U.getUser(),
558 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
559 }));
560 return I->isCommutative();
561}
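// Example of the special-cased pattern above: a subtraction such as
//   %d = sub i32 %a, %b
// whose only users compare it for equality with zero (icmp eq/ne %d, 0) is
// treated as commutative, because swapping %a and %b cannot change the result
// of those users.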
562
563/// This is a helper function to check whether \p I is commutative.
564/// This is a convenience wrapper that calls the two-parameter version of
565/// isCommutative with the same instruction for both parameters. This is
566/// the common case where the instruction being checked for commutativity
567/// is the same as the instruction whose uses are analyzed for special
568/// patterns (see the two-parameter version above for details).
569/// \param I The instruction to check for commutativity
570/// \returns true if the instruction is commutative, false otherwise
571static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
572
573/// \returns the number of operands of \p I, considering commutativity. Returns 2
574/// for commutative intrinsics.
575/// \param I The instruction to check for commutativity
577 if (isa<IntrinsicInst>(I) && isCommutative(I)) {
578 // IntrinsicInst::isCommutative returns true if swapping the first "two"
579 // arguments to the intrinsic produces the same result.
580 constexpr unsigned IntrinsicNumOperands = 2;
581 return IntrinsicNumOperands;
582 }
583 return I->getNumOperands();
584}
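// Example: for a commutative intrinsic call such as
//   %m = call i32 @llvm.smax.i32(i32 %a, i32 %b)
// this returns 2 (the two swappable arguments), even though the call
// instruction itself carries an extra operand for the callee.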
585
586template <typename T>
587static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
588 unsigned Offset) {
589 static_assert(std::is_same_v<T, InsertElementInst> ||
590 std::is_same_v<T, ExtractElementInst>,
591 "unsupported T");
592 int Index = Offset;
593 if (const auto *IE = dyn_cast<T>(Inst)) {
594 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
595 if (!VT)
596 return std::nullopt;
597 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
598 if (!CI)
599 return std::nullopt;
600 if (CI->getValue().uge(VT->getNumElements()))
601 return std::nullopt;
602 Index *= VT->getNumElements();
603 Index += CI->getZExtValue();
604 return Index;
605 }
606 return std::nullopt;
607}
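// Example: for '%v2 = insertelement <4 x i8> %v, i8 %x, i32 2' this returns 2
// with Offset == 0 and 1 * 4 + 2 == 6 with Offset == 1, since the offset is
// scaled by the number of vector elements before the element index is added.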
608
609/// \returns inserting or extracting index of InsertElement, ExtractElement or
610/// InsertValue instruction, using Offset as base offset for index.
611/// \returns std::nullopt if the index is not an immediate.
612static std::optional<unsigned> getElementIndex(const Value *Inst,
613 unsigned Offset = 0) {
614 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
615 return Index;
616 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
617 return Index;
618
619 int Index = Offset;
620
621 const auto *IV = dyn_cast<InsertValueInst>(Inst);
622 if (!IV)
623 return std::nullopt;
624
625 Type *CurrentType = IV->getType();
626 for (unsigned I : IV->indices()) {
627 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
628 Index *= ST->getNumElements();
629 CurrentType = ST->getElementType(I);
630 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
631 Index *= AT->getNumElements();
632 CurrentType = AT->getElementType();
633 } else {
634 return std::nullopt;
635 }
636 Index += I;
637 }
638 return Index;
639}
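// Example for the insertvalue path: for
//   %r = insertvalue {[2 x i32], i32} %agg, i32 %x, 0, 1
// the flattened index is computed as ((0 * 2) + 0) * 2 + 1 == 1.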
640
641/// \returns true if all of the values in \p VL use the same opcode.
642/// For comparison instructions, also checks if predicates match.
643/// PoisonValues are considered matching.
644/// Interchangeable instructions are not considered.
646 auto *It = find_if(VL, IsaPred<Instruction>);
647 if (It == VL.end())
648 return true;
649 Instruction *MainOp = cast<Instruction>(*It);
650 unsigned Opcode = MainOp->getOpcode();
651 bool IsCmpOp = isa<CmpInst>(MainOp);
652 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
653 : CmpInst::BAD_ICMP_PREDICATE;
654 return std::all_of(It, VL.end(), [&](Value *V) {
655 if (auto *CI = dyn_cast<CmpInst>(V))
656 return BasePred == CI->getPredicate();
657 if (auto *I = dyn_cast<Instruction>(V))
658 return I->getOpcode() == Opcode;
659 return isa<PoisonValue>(V);
660 });
661}
662
663namespace {
664/// Specifies the way the mask should be analyzed for undefs/poisonous elements
665/// in the shuffle mask.
666enum class UseMask {
667 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
668 ///< check for the mask elements for the first argument (mask
669 ///< indices are in range [0:VF)).
670 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
671 ///< for the mask elements for the second argument (mask indices
672 ///< are in range [VF:2*VF))
673 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
674 ///< future shuffle elements and mark them as ones as being used
675 ///< in future. Non-undef elements are considered as unused since
676 ///< they're already marked as used in the mask.
677};
678} // namespace
679
680/// Prepares a use bitset for the given mask either for the first argument or
681/// for the second.
682static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
683 UseMask MaskArg) {
684 SmallBitVector UseMask(VF, true);
685 for (auto [Idx, Value] : enumerate(Mask)) {
686 if (Value == PoisonMaskElem) {
687 if (MaskArg == UseMask::UndefsAsMask)
688 UseMask.reset(Idx);
689 continue;
690 }
691 if (MaskArg == UseMask::FirstArg && Value < VF)
692 UseMask.reset(Value);
693 else if (MaskArg == UseMask::SecondArg && Value >= VF)
694 UseMask.reset(Value - VF);
695 }
696 return UseMask;
697}
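// Example: with VF == 4, Mask == {0, 5, PoisonMaskElem, 2} and
// UseMask::FirstArg, bits 0 and 2 are cleared and bits 1 and 3 stay set,
// i.e. lanes 1 and 3 of the first vector are not consumed by this mask.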
698
699/// Checks if the given value is actually an undefined constant vector.
700/// Also, if the \p UseMask is not empty, tries to check if the non-masked
701/// elements actually mask the insertelement buildvector, if any.
702template <bool IsPoisonOnly = false>
703static SmallBitVector isUndefVector(const Value *V,
704 const SmallBitVector &UseMask = {}) {
705 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
706 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
707 if (isa<T>(V))
708 return Res;
709 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
710 if (!VecTy)
711 return Res.reset();
712 auto *C = dyn_cast<Constant>(V);
713 if (!C) {
714 if (!UseMask.empty()) {
715 const Value *Base = V;
716 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
717 Base = II->getOperand(0);
718 if (isa<T>(II->getOperand(1)))
719 continue;
720 std::optional<unsigned> Idx = getElementIndex(II);
721 if (!Idx) {
722 Res.reset();
723 return Res;
724 }
725 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
726 Res.reset(*Idx);
727 }
728 // TODO: Add analysis for shuffles here too.
729 if (V == Base) {
730 Res.reset();
731 } else {
732 SmallBitVector SubMask(UseMask.size(), false);
733 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
734 }
735 } else {
736 Res.reset();
737 }
738 return Res;
739 }
740 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
741 if (Constant *Elem = C->getAggregateElement(I))
742 if (!isa<T>(Elem) &&
743 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
744 Res.reset(I);
745 }
746 return Res;
747}
748
749/// Checks if the vector of instructions can be represented as a shuffle, like:
750/// %x0 = extractelement <4 x i8> %x, i32 0
751/// %x3 = extractelement <4 x i8> %x, i32 3
752/// %y1 = extractelement <4 x i8> %y, i32 1
753/// %y2 = extractelement <4 x i8> %y, i32 2
754/// %x0x0 = mul i8 %x0, %x0
755/// %x3x3 = mul i8 %x3, %x3
756/// %y1y1 = mul i8 %y1, %y1
757/// %y2y2 = mul i8 %y2, %y2
758/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
759/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
760/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
761/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
762/// ret <4 x i8> %ins4
763/// can be transformed into:
764/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
765/// i32 6>
766/// %2 = mul <4 x i8> %1, %1
767/// ret <4 x i8> %2
768/// Mask will return the Shuffle Mask equivalent to the extracted elements.
769/// TODO: Can we split off and reuse the shuffle mask detection from
770/// ShuffleVectorInst/getShuffleCost?
771static std::optional<TargetTransformInfo::ShuffleKind>
772isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
773 AssumptionCache *AC) {
774 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
775 if (It == VL.end())
776 return std::nullopt;
777 unsigned Size =
778 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
779 auto *EI = dyn_cast<ExtractElementInst>(V);
780 if (!EI)
781 return S;
782 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
783 if (!VTy)
784 return S;
785 return std::max(S, VTy->getNumElements());
786 });
787
788 Value *Vec1 = nullptr;
789 Value *Vec2 = nullptr;
790 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
791 auto *EE = dyn_cast<ExtractElementInst>(V);
792 if (!EE)
793 return false;
794 Value *Vec = EE->getVectorOperand();
795 if (isa<UndefValue>(Vec))
796 return false;
797 return isGuaranteedNotToBePoison(Vec, AC);
798 });
799 enum ShuffleMode { Unknown, Select, Permute };
800 ShuffleMode CommonShuffleMode = Unknown;
801 Mask.assign(VL.size(), PoisonMaskElem);
802 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
803 // Undef can be represented as an undef element in a vector.
804 if (isa<UndefValue>(VL[I]))
805 continue;
806 auto *EI = cast<ExtractElementInst>(VL[I]);
807 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
808 return std::nullopt;
809 auto *Vec = EI->getVectorOperand();
810 // We can extractelement from undef or poison vector.
811 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
812 continue;
813 // All vector operands must have the same number of vector elements.
814 if (isa<UndefValue>(Vec)) {
815 Mask[I] = I;
816 } else {
817 if (isa<UndefValue>(EI->getIndexOperand()))
818 continue;
819 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
820 if (!Idx)
821 return std::nullopt;
822 // Undefined behavior if Idx is negative or >= Size.
823 if (Idx->getValue().uge(Size))
824 continue;
825 unsigned IntIdx = Idx->getValue().getZExtValue();
826 Mask[I] = IntIdx;
827 }
828 if (isUndefVector(Vec).all() && HasNonUndefVec)
829 continue;
830 // For correct shuffling we have to have at most 2 different vector operands
831 // in all extractelement instructions.
832 if (!Vec1 || Vec1 == Vec) {
833 Vec1 = Vec;
834 } else if (!Vec2 || Vec2 == Vec) {
835 Vec2 = Vec;
836 Mask[I] += Size;
837 } else {
838 return std::nullopt;
839 }
840 if (CommonShuffleMode == Permute)
841 continue;
842 // If the extract index is not the same as the operation number, it is a
843 // permutation.
844 if (Mask[I] % Size != I) {
845 CommonShuffleMode = Permute;
846 continue;
847 }
848 CommonShuffleMode = Select;
849 }
850 // If we're not crossing lanes in different vectors, consider it as blending.
851 if (CommonShuffleMode == Select && Vec2)
852 return TargetTransformInfo::SK_Select;
853 // If Vec2 was never used, we have a permutation of a single vector, otherwise
854 // we have a permutation of 2 vectors.
855 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
856 : TargetTransformInfo::SK_PermuteSingleSrc;
857}
858
859/// \returns True if Extract{Value,Element} instruction extracts element Idx.
860static std::optional<unsigned> getExtractIndex(const Instruction *E) {
861 unsigned Opcode = E->getOpcode();
862 assert((Opcode == Instruction::ExtractElement ||
863 Opcode == Instruction::ExtractValue) &&
864 "Expected extractelement or extractvalue instruction.");
865 if (Opcode == Instruction::ExtractElement) {
866 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
867 if (!CI)
868 return std::nullopt;
869 return CI->getZExtValue();
870 }
871 auto *EI = cast<ExtractValueInst>(E);
872 if (EI->getNumIndices() != 1)
873 return std::nullopt;
874 return *EI->idx_begin();
875}
876
877namespace llvm {
878/// Checks if the provided value does not require scheduling. It does not
879/// require scheduling if this is not an instruction or it is an instruction
880/// that does not read/write memory and all operands are either not instructions
881/// or phi nodes or instructions from different blocks.
882static bool areAllOperandsNonInsts(Value *V);
883/// Checks if the provided value does not require scheduling. It does not
884/// require scheduling if this is not an instruction or it is an instruction
885/// that does not read/write memory and all users are phi nodes or instructions
886/// from different blocks.
887static bool isUsedOutsideBlock(Value *V);
888/// Checks if the specified value does not require scheduling. It does not
889/// require scheduling if all operands and all users do not need to be scheduled
890/// in the current basic block.
891static bool doesNotNeedToBeScheduled(Value *V);
892} // namespace llvm
893
894namespace {
895/// \returns true if \p Opcode is allowed as part of the main/alternate
896/// instruction for SLP vectorization.
897///
898/// Example of unsupported opcode is SDIV that can potentially cause UB if the
899/// "shuffled out" lane would result in division by zero.
900bool isValidForAlternation(unsigned Opcode) {
901 return !Instruction::isIntDivRem(Opcode);
902}
903
904/// Helper class that determines whether VL can use the same opcode.
905/// Alternate instructions are supported. In addition, it supports interchangeable
906/// instructions. An interchangeable instruction is an instruction that can be
907/// converted to another instruction with the same semantics. For example, x << 1 is
908/// equal to x * 2. x * 1 is equal to x | 0.
909class BinOpSameOpcodeHelper {
910 using MaskType = std::uint_fast16_t;
911 /// Sort SupportedOp because it is used by binary_search.
912 constexpr static std::initializer_list<unsigned> SupportedOp = {
913 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
914 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
915 enum : MaskType {
916 ShlBIT = 0b1,
917 AShrBIT = 0b10,
918 MulBIT = 0b100,
919 AddBIT = 0b1000,
920 SubBIT = 0b10000,
921 AndBIT = 0b100000,
922 OrBIT = 0b1000000,
923 XorBIT = 0b10000000,
924 MainOpBIT = 0b100000000,
926 };
927 /// Return a non-nullptr if either operand of I is a ConstantInt.
928 /// The second return value represents the operand position. We check the
929 /// right-hand side first (1). If the right hand side is not a ConstantInt and
930 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
931 /// side (0).
932 static std::pair<ConstantInt *, unsigned>
933 isBinOpWithConstantInt(const Instruction *I) {
934 unsigned Opcode = I->getOpcode();
935 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
936 (void)SupportedOp;
937 auto *BinOp = cast<BinaryOperator>(I);
938 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
939 return {CI, 1};
940 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
941 Opcode == Instruction::AShr)
942 return {nullptr, 0};
943 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
944 return {CI, 0};
945 return {nullptr, 0};
946 }
947 struct InterchangeableInfo {
948 const Instruction *I = nullptr;
949 /// The bit it sets represents whether MainOp can be converted to.
950 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
951 MulBIT | AShrBIT | ShlBIT;
952 /// We cannot create an interchangeable instruction that does not exist in
953 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
954 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
955 /// 1]. SeenBefore is used to know what operations have been seen before.
956 MaskType SeenBefore = 0;
957 InterchangeableInfo(const Instruction *I) : I(I) {}
958 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
959 /// instruction. Directly setting the mask would destroy the mask state,
960 /// preventing us from determining which instruction it should convert to.
961 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
962 if (Mask & InterchangeableMask) {
963 SeenBefore |= OpcodeInMaskForm;
964 Mask &= InterchangeableMask;
965 return true;
966 }
967 return false;
968 }
969 bool equal(unsigned Opcode) {
970 if (Opcode == I->getOpcode())
971 return trySet(MainOpBIT, MainOpBIT);
972 return false;
973 }
974 unsigned getOpcode() const {
975 MaskType Candidate = Mask & SeenBefore;
976 if (Candidate & MainOpBIT)
977 return I->getOpcode();
978 if (Candidate & ShlBIT)
979 return Instruction::Shl;
980 if (Candidate & AShrBIT)
981 return Instruction::AShr;
982 if (Candidate & MulBIT)
983 return Instruction::Mul;
984 if (Candidate & AddBIT)
985 return Instruction::Add;
986 if (Candidate & SubBIT)
987 return Instruction::Sub;
988 if (Candidate & AndBIT)
989 return Instruction::And;
990 if (Candidate & OrBIT)
991 return Instruction::Or;
992 if (Candidate & XorBIT)
993 return Instruction::Xor;
994 llvm_unreachable("Cannot find interchangeable instruction.");
995 }
996
997 /// Return true if the instruction can be converted to \p Opcode.
998 bool hasCandidateOpcode(unsigned Opcode) const {
999 MaskType Candidate = Mask & SeenBefore;
1000 switch (Opcode) {
1001 case Instruction::Shl:
1002 return Candidate & ShlBIT;
1003 case Instruction::AShr:
1004 return Candidate & AShrBIT;
1005 case Instruction::Mul:
1006 return Candidate & MulBIT;
1007 case Instruction::Add:
1008 return Candidate & AddBIT;
1009 case Instruction::Sub:
1010 return Candidate & SubBIT;
1011 case Instruction::And:
1012 return Candidate & AndBIT;
1013 case Instruction::Or:
1014 return Candidate & OrBIT;
1015 case Instruction::Xor:
1016 return Candidate & XorBIT;
1017 case Instruction::LShr:
1018 case Instruction::FAdd:
1019 case Instruction::FSub:
1020 case Instruction::FMul:
1021 case Instruction::SDiv:
1022 case Instruction::UDiv:
1023 case Instruction::FDiv:
1024 case Instruction::SRem:
1025 case Instruction::URem:
1026 case Instruction::FRem:
1027 return false;
1028 default:
1029 break;
1030 }
1031 llvm_unreachable("Cannot find interchangeable instruction.");
1032 }
1033
1034 SmallVector<Value *> getOperand(const Instruction *To) const {
1035 unsigned ToOpcode = To->getOpcode();
1036 unsigned FromOpcode = I->getOpcode();
1037 if (FromOpcode == ToOpcode)
1038 return SmallVector<Value *>(I->operands());
1039 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1040 auto [CI, Pos] = isBinOpWithConstantInt(I);
1041 const APInt &FromCIValue = CI->getValue();
1042 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1043 APInt ToCIValue;
1044 switch (FromOpcode) {
1045 case Instruction::Shl:
1046 if (ToOpcode == Instruction::Mul) {
1047 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1048 FromCIValue.getZExtValue());
1049 } else {
1050 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1051 ToCIValue = ToOpcode == Instruction::And
1052 ? APInt::getAllOnes(FromCIValueBitWidth)
1053 : APInt::getZero(FromCIValueBitWidth);
1054 }
1055 break;
1056 case Instruction::Mul:
1057 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1058 if (ToOpcode == Instruction::Shl) {
1059 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1060 } else {
1061 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1062 ToCIValue = ToOpcode == Instruction::And
1063 ? APInt::getAllOnes(FromCIValueBitWidth)
1064 : APInt::getZero(FromCIValueBitWidth);
1065 }
1066 break;
1067 case Instruction::Add:
1068 case Instruction::Sub:
1069 if (FromCIValue.isZero()) {
1070 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1071 } else {
1072 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1073 "Cannot convert the instruction.");
1074 ToCIValue = FromCIValue;
1075 ToCIValue.negate();
1076 }
1077 break;
1078 case Instruction::And:
1079 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1080 ToCIValue = ToOpcode == Instruction::Mul
1081 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1082 : APInt::getZero(FromCIValueBitWidth);
1083 break;
1084 default:
1085 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1086 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1087 break;
1088 }
1089 Value *LHS = I->getOperand(1 - Pos);
1090 Constant *RHS =
1091 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1092 // constant + x cannot be -constant - x
1093 // instead, it should be x - -constant
1094 if (Pos == 1 ||
1095 (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
1096 return SmallVector<Value *>({LHS, RHS});
1097 return SmallVector<Value *>({RHS, LHS});
1098 }
1099 };
1100 InterchangeableInfo MainOp;
1101 InterchangeableInfo AltOp;
1102 bool isValidForAlternation(const Instruction *I) const {
1103 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1104 ::isValidForAlternation(I->getOpcode());
1105 }
1106 bool initializeAltOp(const Instruction *I) {
1107 if (AltOp.I)
1108 return true;
1109 if (!isValidForAlternation(I))
1110 return false;
1111 AltOp.I = I;
1112 return true;
1113 }
1114
1115public:
1116 BinOpSameOpcodeHelper(const Instruction *MainOp,
1117 const Instruction *AltOp = nullptr)
1118 : MainOp(MainOp), AltOp(AltOp) {
1119 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1120 }
1121 bool add(const Instruction *I) {
1122 assert(isa<BinaryOperator>(I) &&
1123 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1124 unsigned Opcode = I->getOpcode();
1125 MaskType OpcodeInMaskForm;
1126 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1127 switch (Opcode) {
1128 case Instruction::Shl:
1129 OpcodeInMaskForm = ShlBIT;
1130 break;
1131 case Instruction::AShr:
1132 OpcodeInMaskForm = AShrBIT;
1133 break;
1134 case Instruction::Mul:
1135 OpcodeInMaskForm = MulBIT;
1136 break;
1137 case Instruction::Add:
1138 OpcodeInMaskForm = AddBIT;
1139 break;
1140 case Instruction::Sub:
1141 OpcodeInMaskForm = SubBIT;
1142 break;
1143 case Instruction::And:
1144 OpcodeInMaskForm = AndBIT;
1145 break;
1146 case Instruction::Or:
1147 OpcodeInMaskForm = OrBIT;
1148 break;
1149 case Instruction::Xor:
1150 OpcodeInMaskForm = XorBIT;
1151 break;
1152 default:
1153 return MainOp.equal(Opcode) ||
1154 (initializeAltOp(I) && AltOp.equal(Opcode));
1155 }
1156 MaskType InterchangeableMask = OpcodeInMaskForm;
1157 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1158 if (CI) {
1159 constexpr MaskType CanBeAll =
1160 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1161 const APInt &CIValue = CI->getValue();
1162 switch (Opcode) {
1163 case Instruction::Shl:
1164 if (CIValue.ult(CIValue.getBitWidth()))
1165 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1166 break;
1167 case Instruction::Mul:
1168 if (CIValue.isOne()) {
1169 InterchangeableMask = CanBeAll;
1170 break;
1171 }
1172 if (CIValue.isPowerOf2())
1173 InterchangeableMask = MulBIT | ShlBIT;
1174 break;
1175 case Instruction::Add:
1176 case Instruction::Sub:
1177 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1178 break;
1179 case Instruction::And:
1180 if (CIValue.isAllOnes())
1181 InterchangeableMask = CanBeAll;
1182 break;
1183 default:
1184 if (CIValue.isZero())
1185 InterchangeableMask = CanBeAll;
1186 break;
1187 }
1188 }
1189 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1190 (initializeAltOp(I) &&
1191 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1192 }
1193 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1194 /// Checks if the list of potential opcodes includes \p Opcode.
1195 bool hasCandidateOpcode(unsigned Opcode) const {
1196 return MainOp.hasCandidateOpcode(Opcode);
1197 }
1198 bool hasAltOp() const { return AltOp.I; }
1199 unsigned getAltOpcode() const {
1200 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1201 }
1202 SmallVector<Value *> getOperand(const Instruction *I) const {
1203 return MainOp.getOperand(I);
1204 }
1205};
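// Illustration: for VL == {x << 1, y * 2} the helper records {Shl, Mul} as
// candidate opcodes for both values (2 is a power of two), settles on Shl as
// the main opcode, and getOperand() rewrites 'mul i32 %y, 2' into the operand
// list {%y, 1} of an equivalent 'shl'.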
1206
1207/// Main data required for vectorization of instructions.
1208class InstructionsState {
1209 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1210 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1211 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1212 /// isAltShuffle).
1213 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1214 /// from getMainAltOpsNoStateVL.
1215 /// For those InstructionsState that use alternate instructions, the resulting
1216 /// vectorized output ultimately comes from a shufflevector. For example,
1217 /// given a vector list (VL):
1218 /// VL[0] = add i32 a, e
1219 /// VL[1] = sub i32 b, f
1220 /// VL[2] = add i32 c, g
1221 /// VL[3] = sub i32 d, h
1222 /// The vectorized result would be:
1223 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1224 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1225 /// result = shufflevector <4 x i32> intermediated_0,
1226 /// <4 x i32> intermediated_1,
1227 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1228 /// Since shufflevector is used in the final result, when calculating the cost
1229 /// (getEntryCost), we must account for the usage of shufflevector in
1230 /// GetVectorCost.
1231 Instruction *MainOp = nullptr;
1232 Instruction *AltOp = nullptr;
1233 /// Whether the instruction state represents copyable instructions.
1234 bool HasCopyables = false;
1235
1236public:
1237 Instruction *getMainOp() const {
1238 assert(valid() && "InstructionsState is invalid.");
1239 return MainOp;
1240 }
1241
1242 Instruction *getAltOp() const {
1243 assert(valid() && "InstructionsState is invalid.");
1244 return AltOp;
1245 }
1246
1247 /// The main/alternate opcodes for the list of instructions.
1248 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1249
1250 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1251
1252 /// Some of the instructions in the list have alternate opcodes.
1253 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1254
1255 /// Checks if the instruction matches either the main or alternate opcode.
1256 /// \returns
1257 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1258 /// to it
1259 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1260 /// it
1261 /// - nullptr if \param I cannot be matched or converted to either opcode
1262 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1263 assert(MainOp && "MainOp cannot be nullptr.");
1264 if (I->getOpcode() == MainOp->getOpcode())
1265 return MainOp;
1266 // Prefer AltOp instead of interchangeable instruction of MainOp.
1267 assert(AltOp && "AltOp cannot be nullptr.");
1268 if (I->getOpcode() == AltOp->getOpcode())
1269 return AltOp;
1270 if (!I->isBinaryOp())
1271 return nullptr;
1272 BinOpSameOpcodeHelper Converter(MainOp);
1273 if (!Converter.add(I) || !Converter.add(MainOp))
1274 return nullptr;
1275 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1276 BinOpSameOpcodeHelper AltConverter(AltOp);
1277 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1278 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1279 return AltOp;
1280 }
1281 if (Converter.hasAltOp() && !isAltShuffle())
1282 return nullptr;
1283 return Converter.hasAltOp() ? AltOp : MainOp;
1284 }
1285
1286 /// Checks if main/alt instructions are shift operations.
1287 bool isShiftOp() const {
1288 return getMainOp()->isShift() && getAltOp()->isShift();
1289 }
1290
1291 /// Checks if main/alt instructions are bitwise logic operations.
1292 bool isBitwiseLogicOp() const {
1293 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1294 }
1295
1296 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1297 bool isMulDivLikeOp() const {
1298 constexpr std::array<unsigned, 8> MulDiv = {
1299 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1300 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1301 Instruction::URem, Instruction::FRem};
1302 return is_contained(MulDiv, getOpcode()) &&
1303 is_contained(MulDiv, getAltOpcode());
1304 }
1305
1306 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1307 bool isAddSubLikeOp() const {
1308 constexpr std::array<unsigned, 4> AddSub = {
1309 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1310 Instruction::FSub};
1311 return is_contained(AddSub, getOpcode()) &&
1312 is_contained(AddSub, getAltOpcode());
1313 }
1314
1315 /// Checks if main/alt instructions are cmp operations.
1316 bool isCmpOp() const {
1317 return (getOpcode() == Instruction::ICmp ||
1318 getOpcode() == Instruction::FCmp) &&
1319 getAltOpcode() == getOpcode();
1320 }
1321
1322 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1323 bool valid() const { return MainOp && AltOp; }
1324
1325 explicit operator bool() const { return valid(); }
1326
1327 InstructionsState() = delete;
1328 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1329 bool HasCopyables = false)
1330 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1331 static InstructionsState invalid() { return {nullptr, nullptr}; }
1332
1333 /// Checks if the value is a copyable element.
1334 bool isCopyableElement(Value *V) const {
1335 assert(valid() && "InstructionsState is invalid.");
1336 if (!HasCopyables)
1337 return false;
1338 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1339 return false;
1340 auto *I = dyn_cast<Instruction>(V);
1341 if (!I)
1342 return !isa<PoisonValue>(V);
1343 if (I->getParent() != MainOp->getParent() &&
1346 return true;
1347 if (I->getOpcode() == MainOp->getOpcode())
1348 return false;
1349 if (!I->isBinaryOp())
1350 return true;
1351 BinOpSameOpcodeHelper Converter(MainOp);
1352 return !Converter.add(I) || !Converter.add(MainOp) ||
1353 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1354 }
1355
1356 /// Checks if the value is non-schedulable.
1357 bool isNonSchedulable(Value *V) const {
1358 assert(valid() && "InstructionsState is invalid.");
1359 auto *I = dyn_cast<Instruction>(V);
1360 if (!HasCopyables)
1361 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1363 // MainOp for copyables is always schedulable to correctly identify
1364 // non-schedulable copyables.
1365 if (getMainOp() == V)
1366 return false;
1367 if (isCopyableElement(V)) {
1368 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1369 auto *I = dyn_cast<Instruction>(V);
1370 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1372 // If the copyable instruction comes after MainOp
1373 // (non-schedulable, but used in the block) - cannot vectorize
1374 // it, will possibly generate use before def.
1375 !MainOp->comesBefore(I));
1376 };
1377
1378 return IsNonSchedulableCopyableElement(V);
1379 }
1380 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1382 }
1383
1384 /// Checks if the state represents copyable instructions.
1385 bool areInstructionsWithCopyableElements() const {
1386 assert(valid() && "InstructionsState is invalid.");
1387 return HasCopyables;
1388 }
1389};
1390
1391std::pair<Instruction *, SmallVector<Value *>>
1392convertTo(Instruction *I, const InstructionsState &S) {
1393 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1394 assert(SelectedOp && "Cannot convert the instruction.");
1395 if (I->isBinaryOp()) {
1396 BinOpSameOpcodeHelper Converter(I);
1397 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1398 }
1399 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1400}
1401
1402} // end anonymous namespace
1403
1404static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1405 const TargetLibraryInfo &TLI);
1406
1407/// Find an instruction with a specific opcode in VL.
1408/// \param VL Array of values to search through. Must contain only Instructions
1409/// and PoisonValues.
1410/// \param Opcode The instruction opcode to search for
1411/// \returns
1412/// - The first instruction found with matching opcode
1413/// - nullptr if no matching instruction is found
1414static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1415 unsigned Opcode) {
1416 for (Value *V : VL) {
1417 if (isa<PoisonValue>(V))
1418 continue;
1419 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1420 auto *Inst = cast<Instruction>(V);
1421 if (Inst->getOpcode() == Opcode)
1422 return Inst;
1423 }
1424 return nullptr;
1425}
1426
1427/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1428/// compatible instructions or constants, or just some other regular values.
1429static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1430 Value *Op1, const TargetLibraryInfo &TLI) {
1431 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1432 (isConstant(BaseOp1) && isConstant(Op1)) ||
1433 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1434 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1435 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1436 getSameOpcode({BaseOp0, Op0}, TLI) ||
1437 getSameOpcode({BaseOp1, Op1}, TLI);
1438}
1439
1440/// \returns true if a compare instruction \p CI has similar "look" and
1441/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1442/// swapped, false otherwise.
1443static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1444 const TargetLibraryInfo &TLI) {
1445 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1446 "Assessing comparisons of different types?");
1447 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1448 CmpInst::Predicate Pred = CI->getPredicate();
1449 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1450
1451 Value *BaseOp0 = BaseCI->getOperand(0);
1452 Value *BaseOp1 = BaseCI->getOperand(1);
1453 Value *Op0 = CI->getOperand(0);
1454 Value *Op1 = CI->getOperand(1);
1455
1456 return (BasePred == Pred &&
1457 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1458 (BasePred == SwappedPred &&
1459 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1460}
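// Example: 'icmp sgt i32 %a, %b' and 'icmp slt i32 %b, %a' are treated as the
// same comparison here, since the second matches the first with both its
// predicate and its operands swapped.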
1461
1462/// \returns an analysis of the Instructions in \p VL, described in
1463/// InstructionsState: the Opcode under which we suppose the whole list
1464/// could be vectorized, even if its structure is diverse.
1465static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1466 const TargetLibraryInfo &TLI) {
1467 // Make sure these are all Instructions.
1468 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1469 return InstructionsState::invalid();
1470
1471 auto *It = find_if(VL, IsaPred<Instruction>);
1472 if (It == VL.end())
1473 return InstructionsState::invalid();
1474
1475 Instruction *MainOp = cast<Instruction>(*It);
1476 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1477 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1478 (VL.size() == 2 && InstCnt < 2))
1479 return InstructionsState::invalid();
1480
1481 bool IsCastOp = isa<CastInst>(MainOp);
1482 bool IsBinOp = isa<BinaryOperator>(MainOp);
1483 bool IsCmpOp = isa<CmpInst>(MainOp);
1484 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1485 : CmpInst::BAD_ICMP_PREDICATE;
1486 Instruction *AltOp = MainOp;
1487 unsigned Opcode = MainOp->getOpcode();
1488 unsigned AltOpcode = Opcode;
1489
1490 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1491 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1492 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1493 UniquePreds.insert(BasePred);
1494 UniqueNonSwappedPreds.insert(BasePred);
1495 for (Value *V : VL) {
1496 auto *I = dyn_cast<CmpInst>(V);
1497 if (!I)
1498 return false;
1499 CmpInst::Predicate CurrentPred = I->getPredicate();
1500 CmpInst::Predicate SwappedCurrentPred =
1501 CmpInst::getSwappedPredicate(CurrentPred);
1502 UniqueNonSwappedPreds.insert(CurrentPred);
1503 if (!UniquePreds.contains(CurrentPred) &&
1504 !UniquePreds.contains(SwappedCurrentPred))
1505 UniquePreds.insert(CurrentPred);
1506 }
1507 // The total number of predicates is > 2, but if swapped predicates are
1508 // considered compatible there are only 2; treat the swappable predicates as
1509 // compatible opcodes, not alternates.
1510 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1511 }();
1512 // Check for one alternate opcode from another BinaryOperator.
1513 // TODO - generalize to support all operators (types, calls etc.).
1514 Intrinsic::ID BaseID = 0;
1515 SmallVector<VFInfo> BaseMappings;
1516 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1517 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1518 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1519 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1520 return InstructionsState::invalid();
1521 }
1522 bool AnyPoison = InstCnt != VL.size();
1523 // Check MainOp too to be sure that it matches the requirements for the
1524 // instructions.
1525 for (Value *V : iterator_range(It, VL.end())) {
1526 auto *I = dyn_cast<Instruction>(V);
1527 if (!I)
1528 continue;
1529
1530 // Cannot combine poison and divisions.
1531 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1532 // intrinsics/functions only.
1533 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1534 return InstructionsState::invalid();
1535 unsigned InstOpcode = I->getOpcode();
1536 if (IsBinOp && isa<BinaryOperator>(I)) {
1537 if (BinOpHelper.add(I))
1538 continue;
1539 } else if (IsCastOp && isa<CastInst>(I)) {
1540 Value *Op0 = MainOp->getOperand(0);
1541 Type *Ty0 = Op0->getType();
1542 Value *Op1 = I->getOperand(0);
1543 Type *Ty1 = Op1->getType();
1544 if (Ty0 == Ty1) {
1545 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1546 continue;
1547 if (Opcode == AltOpcode) {
1548 assert(isValidForAlternation(Opcode) &&
1549 isValidForAlternation(InstOpcode) &&
1550 "Cast isn't safe for alternation, logic needs to be updated!");
1551 AltOpcode = InstOpcode;
1552 AltOp = I;
1553 continue;
1554 }
1555 }
1556 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1557 auto *BaseInst = cast<CmpInst>(MainOp);
1558 Type *Ty0 = BaseInst->getOperand(0)->getType();
1559 Type *Ty1 = Inst->getOperand(0)->getType();
1560 if (Ty0 == Ty1) {
1561 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1562 assert(InstOpcode == AltOpcode &&
1563 "Alternate instructions are only supported by BinaryOperator "
1564 "and CastInst.");
1565 // Check for compatible operands. If the corresponding operands are not
1566 // compatible - need to perform alternate vectorization.
1567 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1568 CmpInst::Predicate SwappedCurrentPred =
1569 CmpInst::getSwappedPredicate(CurrentPred);
1570
1571 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1572 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1573 continue;
1574
1575 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1576 continue;
1577 auto *AltInst = cast<CmpInst>(AltOp);
1578 if (MainOp != AltOp) {
1579 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1580 continue;
1581 } else if (BasePred != CurrentPred) {
1582 assert(
1583 isValidForAlternation(InstOpcode) &&
1584 "CmpInst isn't safe for alternation, logic needs to be updated!");
1585 AltOp = I;
1586 continue;
1587 }
1588 CmpInst::Predicate AltPred = AltInst->getPredicate();
1589 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1590 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1591 continue;
1592 }
1593 } else if (InstOpcode == Opcode) {
1594 assert(InstOpcode == AltOpcode &&
1595 "Alternate instructions are only supported by BinaryOperator and "
1596 "CastInst.");
1597 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1598 if (Gep->getNumOperands() != 2 ||
1599 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1600 return InstructionsState::invalid();
1601 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1602 if (!isVectorLikeInstWithConstOps(EI))
1603 return InstructionsState::invalid();
1604 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1605 auto *BaseLI = cast<LoadInst>(MainOp);
1606 if (!LI->isSimple() || !BaseLI->isSimple())
1607 return InstructionsState::invalid();
1608 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1609 auto *CallBase = cast<CallInst>(MainOp);
1610 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1611 return InstructionsState::invalid();
1612 if (Call->hasOperandBundles() &&
1613 (!CallBase->hasOperandBundles() ||
1614 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1615 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1616 CallBase->op_begin() +
1617 CallBase->getBundleOperandsStartIndex())))
1618 return InstructionsState::invalid();
1619 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1620 if (ID != BaseID)
1621 return InstructionsState::invalid();
1622 if (!ID) {
1623 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1624 if (Mappings.size() != BaseMappings.size() ||
1625 Mappings.front().ISA != BaseMappings.front().ISA ||
1626 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1627 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1628 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1629 Mappings.front().Shape.Parameters !=
1630 BaseMappings.front().Shape.Parameters)
1631 return InstructionsState::invalid();
1632 }
1633 }
1634 continue;
1635 }
1636 return InstructionsState::invalid();
1637 }
1638
1639 if (IsBinOp) {
1640 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1641 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1642 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1643 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1644 }
1645 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1646 "Incorrect implementation of allSameOpcode.");
1647 InstructionsState S(MainOp, AltOp);
1648 assert(all_of(VL,
1649 [&](Value *V) {
1650 return isa<PoisonValue>(V) ||
1651 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1652 }) &&
1653 "Invalid InstructionsState.");
1654 return S;
1655}
1656
1657/// \returns true if all of the values in \p VL have the same type or false
1658/// otherwise.
1659 static bool allSameType(ArrayRef<Value *> VL) {
1660 Type *Ty = VL.consume_front()->getType();
1661 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1662}
1663
1664/// \returns True if in-tree use also needs extract. This refers to
1665/// possible scalar operand in vectorized instruction.
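/// For example, a scalar used as the pointer operand of a scalar store user
/// still needs to be extracted from the vector, while the same scalar used as
/// the stored value does not.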
1666static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1667 TargetLibraryInfo *TLI,
1668 const TargetTransformInfo *TTI) {
1669 if (!UserInst)
1670 return false;
1671 unsigned Opcode = UserInst->getOpcode();
1672 switch (Opcode) {
1673 case Instruction::Load: {
1674 LoadInst *LI = cast<LoadInst>(UserInst);
1675 return (LI->getPointerOperand() == Scalar);
1676 }
1677 case Instruction::Store: {
1678 StoreInst *SI = cast<StoreInst>(UserInst);
1679 return (SI->getPointerOperand() == Scalar);
1680 }
1681 case Instruction::Call: {
1682 CallInst *CI = cast<CallInst>(UserInst);
1683 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1684 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1685 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1686 Arg.value().get() == Scalar;
1687 });
1688 }
1689 default:
1690 return false;
1691 }
1692}
1693
1694 /// \returns the AA location that is being accessed by the instruction.
1695 static MemoryLocation getLocation(Instruction *I) {
1696 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1697 return MemoryLocation::get(SI);
1698 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1699 return MemoryLocation::get(LI);
1700 return MemoryLocation();
1701}
1702
1703/// \returns True if the instruction is not a volatile or atomic load/store.
1704static bool isSimple(Instruction *I) {
1705 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1706 return LI->isSimple();
1707 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1708 return SI->isSimple();
1709 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1710 return !MI->isVolatile();
1711 return true;
1712}
1713
1714/// Shuffles \p Mask in accordance with the given \p SubMask.
1715/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1716/// one but two input vectors.
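/// For example, with Mask = {2, 0, 1} and SubMask = {1, 2, 0} the result is
/// {Mask[1], Mask[2], Mask[0]} = {0, 1, 2}.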
1717static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1718 bool ExtendingManyInputs = false) {
1719 if (SubMask.empty())
1720 return;
1721 assert(
1722 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1723 // Check if input scalars were extended to match the size of other node.
1724 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1725 "SubMask with many inputs support must be larger than the mask.");
1726 if (Mask.empty()) {
1727 Mask.append(SubMask.begin(), SubMask.end());
1728 return;
1729 }
1730 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1731 int TermValue = std::min(Mask.size(), SubMask.size());
1732 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1733 if (SubMask[I] == PoisonMaskElem ||
1734 (!ExtendingManyInputs &&
1735 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1736 continue;
1737 NewMask[I] = Mask[SubMask[I]];
1738 }
1739 Mask.swap(NewMask);
1740}
1741
1742/// Order may have elements assigned special value (size) which is out of
1743 /// bounds. Such indices only appear in places that correspond to undef values
1744 /// (see canReuseExtract for details) and are used to keep undef values from
1745 /// affecting the ordering of the operands.
1746/// The first loop below simply finds all unused indices and then the next loop
1747/// nest assigns these indices for undef values positions.
1748/// As an example below Order has two undef positions and they have assigned
1749/// values 3 and 7 respectively:
1750/// before: 6 9 5 4 9 2 1 0
1751/// after: 6 3 5 4 7 2 1 0
1752 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1753 const size_t Sz = Order.size();
1754 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1755 SmallBitVector MaskedIndices(Sz);
1756 for (unsigned I = 0; I < Sz; ++I) {
1757 if (Order[I] < Sz)
1758 UnusedIndices.reset(Order[I]);
1759 else
1760 MaskedIndices.set(I);
1761 }
1762 if (MaskedIndices.none())
1763 return;
1764 assert(UnusedIndices.count() == MaskedIndices.count() &&
1765 "Non-synced masked/available indices.");
1766 int Idx = UnusedIndices.find_first();
1767 int MIdx = MaskedIndices.find_first();
1768 while (MIdx >= 0) {
1769 assert(Idx >= 0 && "Indices must be synced.");
1770 Order[MIdx] = Idx;
1771 Idx = UnusedIndices.find_next(Idx);
1772 MIdx = MaskedIndices.find_next(MIdx);
1773 }
1774}
1775
1776/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1777/// Opcode1.
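/// For example, for a scalar i32 type and lane opcodes {Opcode0, Opcode1,
/// Opcode0, Opcode1} the resulting bitset is {0, 1, 0, 1}.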
1778 static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1779 unsigned Opcode0, unsigned Opcode1) {
1780 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1781 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1782 for (unsigned Lane : seq<unsigned>(VL.size())) {
1783 if (isa<PoisonValue>(VL[Lane]))
1784 continue;
1785 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1786 OpcodeMask.set(Lane * ScalarTyNumElements,
1787 Lane * ScalarTyNumElements + ScalarTyNumElements);
1788 }
1789 return OpcodeMask;
1790}
1791
1792/// Replicates the given \p Val \p VF times.
1794 unsigned VF) {
1795 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1796 "Expected scalar constants.");
1797 SmallVector<Constant *> NewVal(Val.size() * VF);
1798 for (auto [I, V] : enumerate(Val))
1799 std::fill_n(NewVal.begin() + I * VF, VF, V);
1800 return NewVal;
1801}
1802
1803namespace llvm {
1804
1805 static void inversePermutation(ArrayRef<unsigned> Indices,
1806 SmallVectorImpl<int> &Mask) {
1807 Mask.clear();
1808 const unsigned E = Indices.size();
1809 Mask.resize(E, PoisonMaskElem);
1810 for (unsigned I = 0; I < E; ++I)
1811 Mask[Indices[I]] = I;
1812}
1813
1814/// Reorders the list of scalars in accordance with the given \p Mask.
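/// For example, Scalars = {a, b, c} with Mask = {2, 0, 1} becomes {b, c, a},
/// since each element moves from position I to position Mask[I].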
1815 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1816 ArrayRef<int> Mask) {
1817 assert(!Mask.empty() && "Expected non-empty mask.");
1818 SmallVector<Value *> Prev(Scalars.size(),
1819 PoisonValue::get(Scalars.front()->getType()));
1820 Prev.swap(Scalars);
1821 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1822 if (Mask[I] != PoisonMaskElem)
1823 Scalars[Mask[I]] = Prev[I];
1824}
1825
1826/// Checks if the provided value does not require scheduling. It does not
1827/// require scheduling if this is not an instruction or it is an instruction
1828 /// that does not read/write memory and all of its operands are either not
1829 /// instructions, or are phi nodes, or are instructions from other blocks.
1830 static bool areAllOperandsNonInsts(Value *V) {
1831 auto *I = dyn_cast<Instruction>(V);
1832 if (!I)
1833 return true;
1834 return !mayHaveNonDefUseDependency(*I) &&
1835 all_of(I->operands(), [I](Value *V) {
1836 auto *IO = dyn_cast<Instruction>(V);
1837 if (!IO)
1838 return true;
1839 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1840 });
1841}
1842
1843/// Checks if the provided value does not require scheduling. It does not
1844/// require scheduling if this is not an instruction or it is an instruction
1845/// that does not read/write memory and all users are phi nodes or instructions
1846 /// from other blocks.
1847static bool isUsedOutsideBlock(Value *V) {
1848 auto *I = dyn_cast<Instruction>(V);
1849 if (!I)
1850 return true;
1851 // Limits the number of uses to save compile time.
1852 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1853 all_of(I->users(), [I](User *U) {
1854 auto *IU = dyn_cast<Instruction>(U);
1855 if (!IU)
1856 return true;
1857 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1858 });
1859}
1860
1861/// Checks if the specified value does not require scheduling. It does not
1862/// require scheduling if all operands and all users do not need to be scheduled
1863/// in the current basic block.
1864 static bool doesNotNeedToBeScheduled(Value *V) {
1865 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1866 }
1867
1868/// Checks if the specified array of instructions does not require scheduling.
1869 /// This is so if either all instructions have operands that do not require
1870 /// scheduling, or all their users do not require scheduling because they are
1871 /// phis or reside in other basic blocks.
1872 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1873 return !VL.empty() &&
1874 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1875 }
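// For example, an instruction that does not touch memory, whose operands are
// all function arguments or constants, and whose users all live in other
// blocks has no same-block def-use dependencies, so the scheduler can skip it.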
1876
1877 /// Returns true if the widened type of \p Ty elements with size \p Sz
1878 /// represents a full vector type, i.e. adding an extra element results in
1879 /// extra parts upon type legalization.
1880 static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1881 unsigned Sz) {
1882 if (Sz <= 1)
1883 return false;
1884 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1885 return false;
1886 if (has_single_bit(Sz))
1887 return true;
1888 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1889 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1890 Sz % NumParts == 0;
1891}
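// For example, 8 x i32 is always accepted (power-of-2 size), 12 x i32 is
// accepted only if the target legalizes it into whole registers (e.g. 3 parts
// of 4 elements each), and 7 x i32 is rejected.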
1892
1893 /// Returns the number of parts the type \p VecTy will be split into at the
1894 /// codegen phase. If the type is going to be scalarized or does not use whole
1895 /// registers, returns 1.
1896 static unsigned
1897 getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1898 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1899 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1900 if (NumParts == 0 || NumParts >= Limit)
1901 return 1;
1902 unsigned Sz = getNumElements(VecTy);
1903 if (NumParts >= Sz || Sz % NumParts != 0 ||
1904 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1905 return 1;
1906 return NumParts;
1907}
1908
1909namespace slpvectorizer {
1910
1911/// Bottom Up SLP Vectorizer.
1912class BoUpSLP {
1913 class TreeEntry;
1914 class ScheduleEntity;
1915 class ScheduleData;
1916 class ScheduleCopyableData;
1917 class ScheduleBundle;
1920
1921public:
1922 /// Tracks the state we can represent the loads in the given sequence.
1923 enum class LoadsState {
1924 Gather,
1925 Vectorize,
1926 ScatterVectorize,
1927 StridedVectorize,
1928 CompressVectorize
1929 };
1930
1937
1938 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1939 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1940 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1941 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1942 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1943 AC(AC), DB(DB), DL(DL), ORE(ORE),
1944 Builder(Se->getContext(), TargetFolder(*DL)) {
1945 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1946 // Use the vector register size specified by the target unless overridden
1947 // by a command-line option.
1948 // TODO: It would be better to limit the vectorization factor based on
1949 // data type rather than just register size. For example, x86 AVX has
1950 // 256-bit registers, but it does not support integer operations
1951 // at that width (that requires AVX2).
1952 if (MaxVectorRegSizeOption.getNumOccurrences())
1953 MaxVecRegSize = MaxVectorRegSizeOption;
1954 else
1955 MaxVecRegSize =
1956 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1957 .getFixedValue();
1958
1959 if (MinVectorRegSizeOption.getNumOccurrences())
1960 MinVecRegSize = MinVectorRegSizeOption;
1961 else
1962 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1963 }
1964
1965 /// Vectorize the tree that starts with the elements in \p VL.
1966 /// Returns the vectorized root.
1967 Value *vectorizeTree();
1968
1969 /// Vectorize the tree but with the list of externally used values \p
1970 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1971 /// generated extractvalue instructions.
1972 Value *vectorizeTree(
1973 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1974 Instruction *ReductionRoot = nullptr,
1975 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1976
1977 /// \returns the cost incurred by unwanted spills and fills, caused by
1978 /// holding live values over call sites.
1979 InstructionCost getSpillCost();
1980
1981 /// \returns the vectorization cost of the subtree that starts at \p VL.
1982 /// A negative number means that this is profitable.
1983 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
1984 InstructionCost ReductionCost = TTI::TCC_Free);
1985
1986 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1987 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1988 void buildTree(ArrayRef<Value *> Roots,
1989 const SmallDenseSet<Value *> &UserIgnoreLst);
1990
1991 /// Construct a vectorizable tree that starts at \p Roots.
1992 void buildTree(ArrayRef<Value *> Roots);
1993
1994 /// Return the scalars of the root node.
1995 ArrayRef<Value *> getRootNodeScalars() const {
1996 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1997 return VectorizableTree.front()->Scalars;
1998 }
1999
2000 /// Returns the type/is-signed info for the root node in the graph without
2001 /// casting.
2002 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2003 const TreeEntry &Root = *VectorizableTree.front();
2004 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2005 !Root.Scalars.front()->getType()->isIntegerTy())
2006 return std::nullopt;
2007 auto It = MinBWs.find(&Root);
2008 if (It != MinBWs.end())
2009 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2010 It->second.first),
2011 It->second.second);
2012 if (Root.getOpcode() == Instruction::ZExt ||
2013 Root.getOpcode() == Instruction::SExt)
2014 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2015 Root.getOpcode() == Instruction::SExt);
2016 return std::nullopt;
2017 }
2018
2019 /// Checks if the root graph node can be emitted with narrower bitwidth at
2020 /// codegen and returns its signedness, if so.
2021 bool isSignedMinBitwidthRootNode() const {
2022 return MinBWs.at(VectorizableTree.front().get()).second;
2023 }
2024
2025 /// Returns the reduction type after minbitwidth analysis.
2026 FixedVectorType *getReductionType() const {
2027 if (ReductionBitWidth == 0 ||
2028 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2029 ReductionBitWidth >=
2030 DL->getTypeSizeInBits(
2031 VectorizableTree.front()->Scalars.front()->getType()))
2032 return getWidenedType(
2033 VectorizableTree.front()->Scalars.front()->getType(),
2034 VectorizableTree.front()->getVectorFactor());
2035 return getWidenedType(
2036 IntegerType::get(
2037 VectorizableTree.front()->Scalars.front()->getContext(),
2038 ReductionBitWidth),
2039 VectorizableTree.front()->getVectorFactor());
2040 }
2041
2042 /// Builds external uses of the vectorized scalars, i.e. the list of
2043 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2044 /// ExternallyUsedValues contains additional list of external uses to handle
2045 /// vectorization of reductions.
2046 void
2047 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2048
2049 /// Transforms graph nodes to target specific representations, if profitable.
2050 void transformNodes();
2051
2052 /// Clear the internal data structures that are created by 'buildTree'.
2053 void deleteTree() {
2054 VectorizableTree.clear();
2055 ScalarToTreeEntries.clear();
2056 OperandsToTreeEntry.clear();
2057 ScalarsInSplitNodes.clear();
2058 MustGather.clear();
2059 NonScheduledFirst.clear();
2060 EntryToLastInstruction.clear();
2061 LoadEntriesToVectorize.clear();
2062 IsGraphTransformMode = false;
2063 GatheredLoadsEntriesFirst.reset();
2064 CompressEntryToData.clear();
2065 ExternalUses.clear();
2066 ExternalUsesAsOriginalScalar.clear();
2067 ExternalUsesWithNonUsers.clear();
2068 for (auto &Iter : BlocksSchedules) {
2069 BlockScheduling *BS = Iter.second.get();
2070 BS->clear();
2071 }
2072 MinBWs.clear();
2073 ReductionBitWidth = 0;
2074 BaseGraphSize = 1;
2075 CastMaxMinBWSizes.reset();
2076 ExtraBitWidthNodes.clear();
2077 InstrElementSize.clear();
2078 UserIgnoreList = nullptr;
2079 PostponedGathers.clear();
2080 ValueToGatherNodes.clear();
2081 }
2082
2083 unsigned getTreeSize() const { return VectorizableTree.size(); }
2084
2085 /// Returns the base graph size, before any transformations.
2086 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2087
2088 /// Perform LICM and CSE on the newly generated gather sequences.
2089 void optimizeGatherSequence();
2090
2091 /// Does this non-empty order represent an identity order? Identity
2092 /// should be represented as an empty order, so this is used to
2093 /// decide if we can canonicalize a computed order. Undef elements
2094 /// (represented as size) are ignored.
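/// For example, {0, 1, 2, 3} and {0, Sz, 2, Sz} are identity orders, while
/// {1, 0, 2, 3} is not.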
2095 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2096 assert(!Order.empty() && "expected non-empty order");
2097 const unsigned Sz = Order.size();
2098 return all_of(enumerate(Order), [&](const auto &P) {
2099 return P.value() == P.index() || P.value() == Sz;
2100 });
2101 }
2102
2103 /// Checks if the specified gather tree entry \p TE can be represented as a
2104 /// shuffled vector entry + (possibly) permutation with other gathers. It
2105 /// implements the checks only for possibly ordered scalars (Loads,
2106 /// ExtractElement, ExtractValue), which can be part of the graph.
2107 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2108 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2109 /// node might be ignored.
2110 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2111 bool TopToBottom,
2112 bool IgnoreReorder);
2113
2114 /// Sort loads into increasing pointers offsets to allow greater clustering.
2115 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2116
2117 /// Gets reordering data for the given tree entry. If the entry is vectorized
2118 /// - just return ReorderIndices, otherwise check if the scalars can be
2119 /// reordered and return the most optimal order.
2120 /// \return std::nullopt if ordering is not important, empty order, if
2121 /// identity order is important, or the actual order.
2122 /// \param TopToBottom If true, include the order of vectorized stores and
2123 /// insertelement nodes, otherwise skip them.
2124 /// \param IgnoreReorder true, if the root node order can be ignored.
2125 std::optional<OrdersType>
2126 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2127
2128 /// Checks if it is profitable to reorder the current tree.
2129 /// If the tree does not contain many profitable reorderable nodes, it is
2130 /// better to skip it to save compile time.
2131 bool isProfitableToReorder() const;
2132
2133 /// Reorders the current graph to the most profitable order starting from the
2134 /// root node to the leaf nodes. The best order is chosen only from the nodes
2135 /// of the same size (vectorization factor). Smaller nodes are considered
2136 /// parts of subgraph with smaller VF and they are reordered independently. We
2137 /// can make it because we still need to extend smaller nodes to the wider VF
2138 /// and we can merge reordering shuffles with the widening shuffles.
2139 void reorderTopToBottom();
2140
2141 /// Reorders the current graph to the most profitable order starting from
2142 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2143 /// number of reshuffles if the leaf nodes use the same order. In this case we
2144 /// can merge the orders and just shuffle user node instead of shuffling its
2145 /// operands. Plus, even if the leaf nodes have different orders, it allows
2146 /// sinking the reordering in the graph closer to the root node and merging it
2147 /// later during analysis.
2148 void reorderBottomToTop(bool IgnoreReorder = false);
2149
2150 /// \return The vector element size in bits to use when vectorizing the
2151 /// expression tree ending at \p V. If V is a store, the size is the width of
2152 /// the stored value. Otherwise, the size is the width of the largest loaded
2153 /// value reaching V. This method is used by the vectorizer to calculate
2154 /// vectorization factors.
2155 unsigned getVectorElementSize(Value *V);
2156
2157 /// Compute the minimum type sizes required to represent the entries in a
2158 /// vectorizable tree.
2159 void computeMinimumValueSizes();
2160
2161 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2162 unsigned getMaxVecRegSize() const {
2163 return MaxVecRegSize;
2164 }
2165
2166 // \returns minimum vector register size as set by cl::opt.
2167 unsigned getMinVecRegSize() const {
2168 return MinVecRegSize;
2169 }
2170
2171 unsigned getMinVF(unsigned Sz) const {
2172 return std::max(2U, getMinVecRegSize() / Sz);
2173 }
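// For example, with a 128-bit minimum vector register size and 32-bit wide
// scalars, getMinVF(32) returns 4; the result is never smaller than 2.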
2174
2175 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2176 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2177 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2178 return MaxVF ? MaxVF : UINT_MAX;
2179 }
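// For example, if MaxVFOption is not set on the command line and
// TTI->getMaximumVF() returns 0, the factor is treated as unlimited (UINT_MAX).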
2180
2181 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2182 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2183 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2184 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2185 ///
2186 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2187 unsigned canMapToVector(Type *T) const;
2188
2189 /// \returns True if the VectorizableTree is both tiny and not fully
2190 /// vectorizable. We do not vectorize such trees.
2191 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2192
2193 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2194 /// It may happen, if all gather nodes are loads and they cannot be
2195 /// "clusterized". In this case even subgraphs cannot be vectorized more
2196 /// effectively than the base graph.
2197 bool isTreeNotExtendable() const;
2198
2199 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2200 /// can be load combined in the backend. Load combining may not be allowed in
2201 /// the IR optimizer, so we do not want to alter the pattern. For example,
2202 /// partially transforming a scalar bswap() pattern into vector code is
2203 /// effectively impossible for the backend to undo.
2204 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2205 /// may not be necessary.
2206 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2207
2208 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2209 /// can be load combined in the backend. Load combining may not be allowed in
2210 /// the IR optimizer, so we do not want to alter the pattern. For example,
2211 /// partially transforming a scalar bswap() pattern into vector code is
2212 /// effectively impossible for the backend to undo.
2213 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2214 /// may not be necessary.
2215 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2216
2217 /// Checks if the given array of loads can be represented as a vectorized,
2218 /// scatter or just simple gather.
2219 /// \param VL list of loads.
2220 /// \param VL0 main load value.
2221 /// \param Order returned order of load instructions.
2222 /// \param PointerOps returned list of pointer operands.
2223 /// \param BestVF return best vector factor, if recursive check found better
2224 /// vectorization sequences rather than masked gather.
2225 /// \param TryRecursiveCheck used to check if long masked gather can be
2226 /// represented as a series of loads/insert subvector, if profitable.
2227 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2228 SmallVectorImpl<unsigned> &Order,
2229 SmallVectorImpl<Value *> &PointerOps,
2230 unsigned *BestVF = nullptr,
2231 bool TryRecursiveCheck = true) const;
2232
2233 /// Registers non-vectorizable sequence of loads
2234 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2235 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2236 }
2237
2238 /// Checks if the given loads sequence is known as not vectorizable
2239 template <typename T>
2241 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2242 }
2243
2245
2246 /// This structure holds any data we need about the edges being traversed
2247 /// during buildTreeRec(). We keep track of:
2248 /// (i) the user TreeEntry index, and
2249 /// (ii) the index of the edge.
2250 struct EdgeInfo {
2251 EdgeInfo() = default;
2252 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2253 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2254 /// The user TreeEntry.
2255 TreeEntry *UserTE = nullptr;
2256 /// The operand index of the use.
2257 unsigned EdgeIdx = UINT_MAX;
2258#ifndef NDEBUG
2259 friend inline raw_ostream &operator<<(raw_ostream &OS,
2260 const BoUpSLP::EdgeInfo &EI) {
2261 EI.dump(OS);
2262 return OS;
2263 }
2264 /// Debug print.
2265 void dump(raw_ostream &OS) const {
2266 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2267 << " EdgeIdx:" << EdgeIdx << "}";
2268 }
2269 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2270#endif
2271 bool operator == (const EdgeInfo &Other) const {
2272 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2273 }
2274
2275 operator bool() const { return UserTE != nullptr; }
2276 };
2277 friend struct DenseMapInfo<EdgeInfo>;
2278
2279 /// A helper class used for scoring candidates for two consecutive lanes.
2280 class LookAheadHeuristics {
2281 const TargetLibraryInfo &TLI;
2282 const DataLayout &DL;
2283 ScalarEvolution &SE;
2284 const BoUpSLP &R;
2285 int NumLanes; // Total number of lanes (aka vectorization factor).
2286 int MaxLevel; // The maximum recursion depth for accumulating score.
2287
2288 public:
2289 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2290 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2291 int MaxLevel)
2292 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2293 MaxLevel(MaxLevel) {}
2294
2295 // The hard-coded scores listed here are not very important, though it shall
2296 // be higher for better matches to improve the resulting cost. When
2297 // computing the scores of matching one sub-tree with another, we are
2298 // basically counting the number of values that are matching. So even if all
2299 // scores are set to 1, we would still get a decent matching result.
2300 // However, sometimes we have to break ties. For example we may have to
2301 // choose between matching loads vs matching opcodes. This is what these
2302 // scores are helping us with: they provide the order of preference. Also,
2303 // this is important if the scalar is externally used or used in another
2304 // tree entry node in the different lane.
2305
2306 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2307 static const int ScoreConsecutiveLoads = 4;
2308 /// The same load multiple times. This should have a better score than
2309 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2310 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2311 /// for a vector load plus 1.0 for a broadcast.
2312 static const int ScoreSplatLoads = 3;
2313 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2314 static const int ScoreReversedLoads = 3;
2315 /// A load candidate for masked gather.
2316 static const int ScoreMaskedGatherCandidate = 1;
2317 /// ExtractElementInst from same vector and consecutive indexes.
2318 static const int ScoreConsecutiveExtracts = 4;
2319 /// ExtractElementInst from same vector and reversed indices.
2320 static const int ScoreReversedExtracts = 3;
2321 /// Constants.
2322 static const int ScoreConstants = 2;
2323 /// Instructions with the same opcode.
2324 static const int ScoreSameOpcode = 2;
2325 /// Instructions with alt opcodes (e.g, add + sub).
2326 static const int ScoreAltOpcodes = 1;
2327 /// Identical instructions (a.k.a. splat or broadcast).
2328 static const int ScoreSplat = 1;
2329 /// Matching with an undef is preferable to failing.
2330 static const int ScoreUndef = 1;
2331 /// Score for failing to find a decent match.
2332 static const int ScoreFail = 0;
2333 /// Score if all users are vectorized.
2334 static const int ScoreAllUserVectorized = 1;
2335
2336 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2337 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2338 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2339 /// MainAltOps.
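/// For example, loads from A[i] and A[i+1] score ScoreConsecutiveLoads, two
/// adds score ScoreSameOpcode, and an add paired with a sub scores
/// ScoreAltOpcodes.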
2340 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2341 ArrayRef<Value *> MainAltOps) const {
2342 if (!isValidElementType(V1->getType()) ||
2343 !isValidElementType(V2->getType()))
2344 return LookAheadHeuristics::ScoreFail;
2345
2346 if (V1 == V2) {
2347 if (isa<LoadInst>(V1)) {
2348 // Returns true if the users of V1 and V2 won't need to be extracted.
2349 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2350 // Bail out if we have too many uses to save compilation time.
2351 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2352 return false;
2353
2354 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2355 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2356 return U == U1 || U == U2 || R.isVectorized(U);
2357 });
2358 };
2359 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2360 };
2361 // A broadcast of a load can be cheaper on some targets.
2362 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2363 ElementCount::getFixed(NumLanes)) &&
2364 ((int)V1->getNumUses() == NumLanes ||
2365 AllUsersAreInternal(V1, V2)))
2366 return LookAheadHeuristics::ScoreSplatLoads;
2367 }
2368 return LookAheadHeuristics::ScoreSplat;
2369 }
2370
2371 auto CheckSameEntryOrFail = [&]() {
2372 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2373 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2374 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2375 !TEs2.empty() &&
2376 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2377 return LookAheadHeuristics::ScoreSplatLoads;
2378 }
2379 return LookAheadHeuristics::ScoreFail;
2380 };
2381
2382 auto *LI1 = dyn_cast<LoadInst>(V1);
2383 auto *LI2 = dyn_cast<LoadInst>(V2);
2384 if (LI1 && LI2) {
2385 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2386 !LI2->isSimple())
2387 return CheckSameEntryOrFail();
2388
2389 std::optional<int64_t> Dist = getPointersDiff(
2390 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2391 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2392 if (!Dist || *Dist == 0) {
2393 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2394 getUnderlyingObject(LI2->getPointerOperand()) &&
2395 R.TTI->isLegalMaskedGather(
2396 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2397 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2398 return CheckSameEntryOrFail();
2399 }
2400 // The distance is too large - still may be profitable to use masked
2401 // loads/gathers.
2402 if (std::abs(*Dist) > NumLanes / 2)
2403 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2404 // This still will detect consecutive loads, but we might have "holes"
2405 // in some cases. It is ok for non-power-2 vectorization and may produce
2406 // better results. It should not affect current vectorization.
2407 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2408 : LookAheadHeuristics::ScoreReversedLoads;
2409 }
2410
2411 auto *C1 = dyn_cast<Constant>(V1);
2412 auto *C2 = dyn_cast<Constant>(V2);
2413 if (C1 && C2)
2414 return LookAheadHeuristics::ScoreConstants;
2415
2416 // Consider constants and buildvector compatible.
2417 if ((C1 && isa<InsertElementInst>(V2)) ||
2418 (C2 && isa<InsertElementInst>(V1)))
2419 return LookAheadHeuristics::ScoreConstants;
2420
2421 // Extracts from consecutive indexes of the same vector better score as
2422 // the extracts could be optimized away.
2423 Value *EV1;
2424 ConstantInt *Ex1Idx;
2425 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2426 // Undefs are always profitable for extractelements.
2427 // Compiler can easily combine poison and extractelement <non-poison> or
2428 // undef and extractelement <poison>. But combining undef +
2429 // extractelement <non-poison-but-may-produce-poison> requires some
2430 // extra operations.
2431 if (isa<UndefValue>(V2))
2432 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2433 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2434 : LookAheadHeuristics::ScoreSameOpcode;
2435 Value *EV2 = nullptr;
2436 ConstantInt *Ex2Idx = nullptr;
2437 if (match(V2,
2438 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2439 m_Undef())))) {
2440 // Undefs are always profitable for extractelements.
2441 if (!Ex2Idx)
2442 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2443 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2444 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2445 if (EV2 == EV1) {
2446 int Idx1 = Ex1Idx->getZExtValue();
2447 int Idx2 = Ex2Idx->getZExtValue();
2448 int Dist = Idx2 - Idx1;
2449 // The distance is too large - still may be profitable to use
2450 // shuffles.
2451 if (std::abs(Dist) == 0)
2452 return LookAheadHeuristics::ScoreSplat;
2453 if (std::abs(Dist) > NumLanes / 2)
2454 return LookAheadHeuristics::ScoreSameOpcode;
2455 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2456 : LookAheadHeuristics::ScoreReversedExtracts;
2457 }
2458 return LookAheadHeuristics::ScoreAltOpcodes;
2459 }
2460 return CheckSameEntryOrFail();
2461 }
2462
2463 auto *I1 = dyn_cast<Instruction>(V1);
2464 auto *I2 = dyn_cast<Instruction>(V2);
2465 if (I1 && I2) {
2466 if (I1->getParent() != I2->getParent())
2467 return CheckSameEntryOrFail();
2468 SmallVector<Value *, 4> Ops(MainAltOps);
2469 Ops.push_back(I1);
2470 Ops.push_back(I2);
2471 InstructionsState S = getSameOpcode(Ops, TLI);
2472 // Note: Only consider instructions with <= 2 operands to avoid
2473 // complexity explosion.
2474 if (S &&
2475 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2476 !S.isAltShuffle()) &&
2477 all_of(Ops, [&S](Value *V) {
2478 return isa<PoisonValue>(V) ||
2479 cast<Instruction>(V)->getNumOperands() ==
2480 S.getMainOp()->getNumOperands();
2481 }))
2482 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2483 : LookAheadHeuristics::ScoreSameOpcode;
2484 }
2485
2486 if (I1 && isa<PoisonValue>(V2))
2487 return LookAheadHeuristics::ScoreSameOpcode;
2488
2489 if (isa<UndefValue>(V2))
2490 return LookAheadHeuristics::ScoreUndef;
2491
2492 return CheckSameEntryOrFail();
2493 }
2494
2495 /// Go through the operands of \p LHS and \p RHS recursively until
2496 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2497 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2498 /// of \p U1 and \p U2), except at the beginning of the recursion where
2499 /// these are set to nullptr.
2500 ///
2501 /// For example:
2502 /// \verbatim
2503 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2504 /// \ / \ / \ / \ /
2505 /// + + + +
2506 /// G1 G2 G3 G4
2507 /// \endverbatim
2508 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2509 /// each level recursively, accumulating the score. It starts from matching
2510 /// the additions at level 0, then moves on to the loads (level 1). The
2511 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2512 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2513 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2514 /// Please note that the order of the operands does not matter, as we
2515 /// evaluate the score of all profitable combinations of operands. In
2516 /// other words the score of G1 and G4 is the same as G1 and G2. This
2517 /// heuristic is based on ideas described in:
2518 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2519 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2520 /// Luís F. W. Góes
2521 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2522 Instruction *U2, int CurrLevel,
2523 ArrayRef<Value *> MainAltOps) const {
2524
2525 // Get the shallow score of V1 and V2.
2526 int ShallowScoreAtThisLevel =
2527 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2528
2529 // If reached MaxLevel,
2530 // or if V1 and V2 are not instructions,
2531 // or if they are SPLAT,
2532 // or if they are not consecutive,
2533 // or if profitable to vectorize loads or extractelements, early return
2534 // the current cost.
2535 auto *I1 = dyn_cast<Instruction>(LHS);
2536 auto *I2 = dyn_cast<Instruction>(RHS);
2537 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2538 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2539 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2540 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2541 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2542 ShallowScoreAtThisLevel))
2543 return ShallowScoreAtThisLevel;
2544 assert(I1 && I2 && "Should have early exited.");
2545
2546 // Contains the I2 operand indexes that got matched with I1 operands.
2547 SmallSet<unsigned, 4> Op2Used;
2548
2549 // Recursion towards the operands of I1 and I2. We are trying all possible
2550 // operand pairs, and keeping track of the best score.
2551 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2552 OpIdx1 != NumOperands1; ++OpIdx1) {
2553 // Try to pair op1I with the best operand of I2.
2554 int MaxTmpScore = 0;
2555 unsigned MaxOpIdx2 = 0;
2556 bool FoundBest = false;
2557 // If I2 is commutative try all combinations.
2558 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2559 unsigned ToIdx = isCommutative(I2)
2560 ? I2->getNumOperands()
2561 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2562 assert(FromIdx <= ToIdx && "Bad index");
2563 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2564 // Skip operands already paired with OpIdx1.
2565 if (Op2Used.count(OpIdx2))
2566 continue;
2567 // Recursively calculate the cost at each level
2568 int TmpScore =
2569 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2570 I1, I2, CurrLevel + 1, {});
2571 // Look for the best score.
2572 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2573 TmpScore > MaxTmpScore) {
2574 MaxTmpScore = TmpScore;
2575 MaxOpIdx2 = OpIdx2;
2576 FoundBest = true;
2577 }
2578 }
2579 if (FoundBest) {
2580 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2581 Op2Used.insert(MaxOpIdx2);
2582 ShallowScoreAtThisLevel += MaxTmpScore;
2583 }
2584 }
2585 return ShallowScoreAtThisLevel;
2586 }
2587 };
2588 /// A helper data structure to hold the operands of a vector of instructions.
2589 /// This supports a fixed vector length for all operand vectors.
2590 class VLOperands {
2591 /// For each operand we need (i) the value, and (ii) the opcode that it
2592 /// would be attached to if the expression was in a left-linearized form.
2593 /// This is required to avoid illegal operand reordering.
2594 /// For example:
2595 /// \verbatim
2596 /// 0 Op1
2597 /// |/
2598 /// Op1 Op2 Linearized + Op2
2599 /// \ / ----------> |/
2600 /// - -
2601 ///
2602 /// Op1 - Op2 (0 + Op1) - Op2
2603 /// \endverbatim
2604 ///
2605 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2606 ///
2607 /// Another way to think of this is to track all the operations across the
2608 /// path from the operand all the way to the root of the tree and to
2609 /// calculate the operation that corresponds to this path. For example, the
2610 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2611 /// corresponding operation is a '-' (which matches the one in the
2612 /// linearized tree, as shown above).
2613 ///
2614 /// For lack of a better term, we refer to this operation as Accumulated
2615 /// Path Operation (APO).
2616 struct OperandData {
2617 OperandData() = default;
2618 OperandData(Value *V, bool APO, bool IsUsed)
2619 : V(V), APO(APO), IsUsed(IsUsed) {}
2620 /// The operand value.
2621 Value *V = nullptr;
2622 /// TreeEntries only allow a single opcode, or an alternate sequence of
2623 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2624 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2625 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2626 /// (e.g., Add/Mul)
2627 bool APO = false;
2628 /// Helper data for the reordering function.
2629 bool IsUsed = false;
2630 };
2631
2632 /// During operand reordering, we are trying to select the operand at lane
2633 /// that matches best with the operand at the neighboring lane. Our
2634 /// selection is based on the type of value we are looking for. For example,
2635 /// if the neighboring lane has a load, we need to look for a load that is
2636 /// accessing a consecutive address. These strategies are summarized in the
2637 /// 'ReorderingMode' enumerator.
2638 enum class ReorderingMode {
2639 Load, ///< Matching loads to consecutive memory addresses
2640 Opcode, ///< Matching instructions based on opcode (same or alternate)
2641 Constant, ///< Matching constants
2642 Splat, ///< Matching the same instruction multiple times (broadcast)
2643 Failed, ///< We failed to create a vectorizable group
2644 };
2645
2647
2647 using OperandDataVec = SmallVector<OperandData, 2>;
2648 /// A vector of operand vectors.
2649 SmallVector<OperandDataVec, 4> OpsVec;
2650 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2651 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2652 unsigned ArgSize = 0;
2653
2654 const TargetLibraryInfo &TLI;
2655 const DataLayout &DL;
2656 ScalarEvolution &SE;
2657 const BoUpSLP &R;
2658 const Loop *L = nullptr;
2659
2660 /// \returns the operand data at \p OpIdx and \p Lane.
2661 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2662 return OpsVec[OpIdx][Lane];
2663 }
2664
2665 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2666 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2667 return OpsVec[OpIdx][Lane];
2668 }
2669
2670 /// Clears the used flag for all entries.
2671 void clearUsed() {
2672 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2673 OpIdx != NumOperands; ++OpIdx)
2674 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2675 ++Lane)
2676 OpsVec[OpIdx][Lane].IsUsed = false;
2677 }
2678
2679 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2680 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2681 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2682 }
2683
2684 /// \param Lane lane of the operands under analysis.
2685 /// \param OpIdx operand index in \p Lane lane we're looking the best
2686 /// candidate for.
2687 /// \param Idx operand index of the current candidate value.
2688 /// \returns The additional score due to possible broadcasting of the
2689 /// elements in the lane. It is more profitable to have power-of-2 unique
2690 /// elements in the lane, it will be vectorized with higher probability
2691 /// after removing duplicates. Currently the SLP vectorizer supports only
2692 /// vectorization of the power-of-2 number of unique scalars.
2693 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2694 const SmallBitVector &UsedLanes) const {
2695 Value *IdxLaneV = getData(Idx, Lane).V;
2696 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2697 isa<ExtractElementInst>(IdxLaneV))
2698 return 0;
2699 SmallDenseMap<Value *, unsigned, 4> Uniques;
2700 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2701 if (Ln == Lane)
2702 continue;
2703 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2704 if (!isa<Instruction>(OpIdxLnV))
2705 return 0;
2706 Uniques.try_emplace(OpIdxLnV, Ln);
2707 }
2708 unsigned UniquesCount = Uniques.size();
2709 auto IdxIt = Uniques.find(IdxLaneV);
2710 unsigned UniquesCntWithIdxLaneV =
2711 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2712 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2713 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2714 unsigned UniquesCntWithOpIdxLaneV =
2715 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2716 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2717 return 0;
2718 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2719 UniquesCntWithOpIdxLaneV,
2720 UniquesCntWithOpIdxLaneV -
2721 bit_floor(UniquesCntWithOpIdxLaneV)) -
2722 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2723 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2724 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2725 }
2726
2727 /// \param Lane lane of the operands under analysis.
2728 /// \param OpIdx operand index in \p Lane lane we're looking the best
2729 /// candidate for.
2730 /// \param Idx operand index of the current candidate value.
2731 /// \returns The additional score for the scalar which users are all
2732 /// vectorized.
2733 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2734 Value *IdxLaneV = getData(Idx, Lane).V;
2735 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2736 // Do not care about number of uses for vector-like instructions
2737 // (extractelement/extractvalue with constant indices), they are extracts
2738 // themselves and already externally used. Vectorization of such
2739 // instructions does not add extra extractelement instruction, just may
2740 // remove it.
2741 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2742 isVectorLikeInstWithConstOps(OpIdxLaneV))
2743 return LookAheadHeuristics::ScoreAllUserVectorized;
2744 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2745 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2746 return 0;
2747 return R.areAllUsersVectorized(IdxLaneI)
2748 ? LookAheadHeuristics::ScoreAllUserVectorized
2749 : 0;
2750 }
2751
2752 /// Score scaling factor for fully compatible instructions but with
2753 /// different number of external uses. Allows better selection of the
2754 /// instructions with less external uses.
2755 static const int ScoreScaleFactor = 10;
2756
2757 /// \Returns the look-ahead score, which tells us how much the sub-trees
2758 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2759 /// score. This helps break ties in an informed way when we cannot decide on
2760 /// the order of the operands by just considering the immediate
2761 /// predecessors.
2762 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2763 int Lane, unsigned OpIdx, unsigned Idx,
2764 bool &IsUsed, const SmallBitVector &UsedLanes) {
2765 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2766 LookAheadMaxDepth);
2767 // Keep track of the instruction stack as we recurse into the operands
2768 // during the look-ahead score exploration.
2769 int Score =
2770 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2771 /*CurrLevel=*/1, MainAltOps);
2772 if (Score) {
2773 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2774 if (Score <= -SplatScore) {
2775 // Failed score.
2776 Score = 0;
2777 } else {
2778 Score += SplatScore;
2779 // Scale score to see the difference between different operands
2780 // and similar operands but all vectorized/not all vectorized
2781 // uses. It does not affect actual selection of the best
2782 // compatible operand in general, just allows to select the
2783 // operand with all vectorized uses.
2784 Score *= ScoreScaleFactor;
2785 Score += getExternalUseScore(Lane, OpIdx, Idx);
2786 IsUsed = true;
2787 }
2788 }
2789 return Score;
2790 }
2791
2792 /// Best defined scores per lanes between the passes. Used to choose the
2793 /// best operand (with the highest score) between the passes.
2794 /// The key - {Operand Index, Lane}.
2795 /// The value - the best score between the passes for the lane and the
2796 /// operand.
2797 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2798 BestScoresPerLanes;
2799
2800 // Search all operands in Ops[*][Lane] for the one that matches best
2801 // Ops[OpIdx][LastLane] and return its operand index.
2802 // If no good match can be found, return std::nullopt.
2803 std::optional<unsigned>
2804 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2805 ArrayRef<ReorderingMode> ReorderingModes,
2806 ArrayRef<Value *> MainAltOps,
2807 const SmallBitVector &UsedLanes) {
2808 unsigned NumOperands = getNumOperands();
2809
2810 // The operand of the previous lane at OpIdx.
2811 Value *OpLastLane = getData(OpIdx, LastLane).V;
2812
2813 // Our strategy mode for OpIdx.
2814 ReorderingMode RMode = ReorderingModes[OpIdx];
2815 if (RMode == ReorderingMode::Failed)
2816 return std::nullopt;
2817
2818 // The linearized opcode of the operand at OpIdx, Lane.
2819 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2820
2821 // The best operand index and its score.
2822 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2823 // are using the score to differentiate between the two.
2824 struct BestOpData {
2825 std::optional<unsigned> Idx;
2826 unsigned Score = 0;
2827 } BestOp;
2828 BestOp.Score =
2829 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2830 .first->second;
2831
2832 // Track if the operand must be marked as used. If the operand is set to
2833 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2834 // want to reestimate the operands again on the following iterations).
2835 bool IsUsed = RMode == ReorderingMode::Splat ||
2836 RMode == ReorderingMode::Constant ||
2837 RMode == ReorderingMode::Load;
2838 // Iterate through all unused operands and look for the best.
2839 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2840 // Get the operand at Idx and Lane.
2841 OperandData &OpData = getData(Idx, Lane);
2842 Value *Op = OpData.V;
2843 bool OpAPO = OpData.APO;
2844
2845 // Skip already selected operands.
2846 if (OpData.IsUsed)
2847 continue;
2848
2849 // Skip if we are trying to move the operand to a position with a
2850 // different opcode in the linearized tree form. This would break the
2851 // semantics.
2852 if (OpAPO != OpIdxAPO)
2853 continue;
2854
2855 // Look for an operand that matches the current mode.
2856 switch (RMode) {
2857 case ReorderingMode::Load:
2858 case ReorderingMode::Opcode: {
2859 bool LeftToRight = Lane > LastLane;
2860 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2861 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2862 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2863 OpIdx, Idx, IsUsed, UsedLanes);
2864 if (Score > static_cast<int>(BestOp.Score) ||
2865 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2866 Idx == OpIdx)) {
2867 BestOp.Idx = Idx;
2868 BestOp.Score = Score;
2869 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2870 }
2871 break;
2872 }
2873 case ReorderingMode::Constant:
2874 if (isa<Constant>(Op) ||
2875 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2876 BestOp.Idx = Idx;
2877 if (isa<Constant>(Op)) {
2878 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2879 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2880 LookAheadHeuristics::ScoreConstants;
2881 }
2882 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2883 IsUsed = false;
2884 }
2885 break;
2886 case ReorderingMode::Splat:
2887 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2888 IsUsed = Op == OpLastLane;
2889 if (Op == OpLastLane) {
2890 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2891 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2892 LookAheadHeuristics::ScoreSplat;
2893 }
2894 BestOp.Idx = Idx;
2895 }
2896 break;
2897 case ReorderingMode::Failed:
2898 llvm_unreachable("Not expected Failed reordering mode.");
2899 }
2900 }
2901
2902 if (BestOp.Idx) {
2903 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2904 return BestOp.Idx;
2905 }
2906 // If we could not find a good match return std::nullopt.
2907 return std::nullopt;
2908 }
2909
2910 /// Helper for reorderOperandVecs.
2911 /// \returns the lane that we should start reordering from. This is the one
2912 /// which has the least number of operands that can freely move about or
2913 /// is less profitable to reorder because it already has the most optimal set of operands.
2914 unsigned getBestLaneToStartReordering() const {
2915 unsigned Min = UINT_MAX;
2916 unsigned SameOpNumber = 0;
2917 // std::pair<unsigned, unsigned> is used to implement a simple voting
2918 // algorithm and choose the lane with the least number of operands that
2919 // can freely move about or less profitable because it already has the
2920 // most optimal set of operands. The first unsigned is a counter for
2921 // voting, the second unsigned is the counter of lanes with instructions
2922 // with same/alternate opcodes and same parent basic block.
2923 SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2924 // Try to be closer to the original results, if we have multiple lanes
2925 // with same cost. If 2 lanes have the same cost, use the one with the
2926 // highest index.
2927 for (int I = getNumLanes(); I > 0; --I) {
2928 unsigned Lane = I - 1;
2929 OperandsOrderData NumFreeOpsHash =
2930 getMaxNumOperandsThatCanBeReordered(Lane);
2931 // Compare the number of operands that can move and choose the one with
2932 // the least number.
2933 if (NumFreeOpsHash.NumOfAPOs < Min) {
2934 Min = NumFreeOpsHash.NumOfAPOs;
2935 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2936 HashMap.clear();
2937 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2938 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2939 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2940 // Select the most optimal lane in terms of number of operands that
2941 // should be moved around.
2942 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2943 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2944 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2945 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2946 auto [It, Inserted] =
2947 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2948 if (!Inserted)
2949 ++It->second.first;
2950 }
2951 }
2952 // Select the lane with the minimum counter.
2953 unsigned BestLane = 0;
2954 unsigned CntMin = UINT_MAX;
2955 for (const auto &Data : reverse(HashMap)) {
2956 if (Data.second.first < CntMin) {
2957 CntMin = Data.second.first;
2958 BestLane = Data.second.second;
2959 }
2960 }
2961 return BestLane;
2962 }
2963
2964 /// Data structure that helps to reorder operands.
2965 struct OperandsOrderData {
2966 /// The best number of operands with the same APOs, which can be
2967 /// reordered.
2968 unsigned NumOfAPOs = UINT_MAX;
2969 /// Number of operands with the same/alternate instruction opcode and
2970 /// parent.
2971 unsigned NumOpsWithSameOpcodeParent = 0;
2972 /// Hash for the actual operands ordering.
2973 /// Used to count operands, actually their position id and opcode
2974 /// value. It is used in the voting mechanism to find the lane with the
2975 /// least number of operands that can freely move about or less profitable
2976 /// because it already has the most optimal set of operands. Can be
2977 /// replaced with SmallVector<unsigned> instead but hash code is faster
2978 /// and requires less memory.
2979 unsigned Hash = 0;
2980 };
2981 /// \returns the maximum number of operands that are allowed to be reordered
2982 /// for \p Lane and the number of compatible instructions(with the same
2983 /// parent/opcode). This is used as a heuristic for selecting the first lane
2984 /// to start operand reordering.
2985 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2986 unsigned CntTrue = 0;
2987 unsigned NumOperands = getNumOperands();
2988 // Operands with the same APO can be reordered. We therefore need to count
2989 // how many of them we have for each APO, like this: Cnt[APO] = x.
2990 // Since we only have two APOs, namely true and false, we can avoid using
2991 // a map. Instead we can simply count the number of operands that
2992 // correspond to one of them (in this case the 'true' APO), and calculate
2993 // the other by subtracting it from the total number of operands.
2994 // Operands with the same instruction opcode and parent are more
2995 // profitable since we don't need to move them in many cases, with a high
2996 // probability such lane already can be vectorized effectively.
2997 bool AllUndefs = true;
2998 unsigned NumOpsWithSameOpcodeParent = 0;
2999 Instruction *OpcodeI = nullptr;
3000 BasicBlock *Parent = nullptr;
3001 unsigned Hash = 0;
3002 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3003 const OperandData &OpData = getData(OpIdx, Lane);
3004 if (OpData.APO)
3005 ++CntTrue;
3006 // Use Boyer-Moore majority voting for finding the majority opcode and
3007 // the number of times it occurs.
3008 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3009 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3010 I->getParent() != Parent) {
3011 if (NumOpsWithSameOpcodeParent == 0) {
3012 NumOpsWithSameOpcodeParent = 1;
3013 OpcodeI = I;
3014 Parent = I->getParent();
3015 } else {
3016 --NumOpsWithSameOpcodeParent;
3017 }
3018 } else {
3019 ++NumOpsWithSameOpcodeParent;
3020 }
3021 }
3022 Hash = hash_combine(
3023 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3024 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3025 }
3026 if (AllUndefs)
3027 return {};
3028 OperandsOrderData Data;
3029 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3030 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3031 Data.Hash = Hash;
3032 return Data;
3033 }
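  // For reference, a minimal standalone sketch of the Boyer-Moore majority
  // vote used above (illustrative only; the helper below is hypothetical and
  // not part of this file):
  // \code
  //   unsigned majorityCandidate(ArrayRef<unsigned> Vals) {
  //     unsigned Candidate = 0, Count = 0;
  //     for (unsigned V : Vals) {
  //       if (Count == 0) {
  //         Candidate = V;
  //         Count = 1;
  //       } else if (V == Candidate) {
  //         ++Count;
  //       } else {
  //         --Count;
  //       }
  //     }
  //     // Candidate is the majority element, if one exists; a second pass is
  //     // needed to verify it really occurs in more than half of Vals.
  //     return Candidate;
  //   }
  // \endcode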
3034
3035 /// Go through the instructions in VL and append their operands.
3036 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3037 const InstructionsState &S) {
3038 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3039 assert((empty() || all_of(Operands,
3040 [this](const ValueList &VL) {
3041 return VL.size() == getNumLanes();
3042 })) &&
3043 "Expected same number of lanes");
3044 assert(S.valid() && "InstructionsState is invalid.");
3045 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3046 // arguments to the intrinsic produces the same result.
3047 Instruction *MainOp = S.getMainOp();
3048 unsigned NumOperands = MainOp->getNumOperands();
3050 OpsVec.resize(ArgSize);
3051 unsigned NumLanes = VL.size();
3052 for (OperandDataVec &Ops : OpsVec)
3053 Ops.resize(NumLanes);
3054 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3055 // Our tree has just 3 nodes: the root and two operands.
3056 // It is therefore trivial to get the APO. We only need to check the
3057 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3058 // operand. The LHS operand of both add and sub is never attached to an
3059 // inverse operation in the linearized form, therefore its APO is
3060 // false. The RHS is true only if V is an inverse operation.
3061
3062 // Since operand reordering is performed on groups of commutative
3063 // operations or alternating sequences (e.g., +, -), we can safely tell
3064 // the inverse operations by checking commutativity.
3065 auto *I = dyn_cast<Instruction>(VL[Lane]);
3066 if (!I && isa<PoisonValue>(VL[Lane])) {
3067 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3068 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3069 continue;
3070 }
3071 bool IsInverseOperation = false;
3072 if (S.isCopyableElement(VL[Lane])) {
3073 // The value is a copyable element.
3074 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3075 } else {
3076 assert(I && "Expected instruction");
3077 auto [SelectedOp, Ops] = convertTo(I, S);
3078 // We cannot check commutativity by the converted instruction
3079 // (SelectedOp) because isCommutative also examines def-use
3080 // relationships.
3081 IsInverseOperation = !isCommutative(SelectedOp, I);
3082 }
3083 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3084 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3085 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3086 }
3087 }
3088 }
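  // Illustrative example (simplified, ignoring copyable elements): for the
  // two-lane bundle {A[0] = B[0] + C[0], A[1] = B[1] - C[1]} the operands are
  // laid out as
  //   OpsVec[0] = { {B[0], APO=false}, {B[1], APO=false} }  // LHS column
  //   OpsVec[1] = { {C[0], APO=false}, {C[1], APO=true } }  // RHS column
  // because sub is the non-commutative (inverse) operation of the linearized
  // +/- sequence, so only its RHS operand gets APO = true.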
3089
3090 /// \returns the number of operands.
3091 unsigned getNumOperands() const { return ArgSize; }
3092
3093 /// \returns the number of lanes.
3094 unsigned getNumLanes() const { return OpsVec[0].size(); }
3095
3096 /// \returns the operand value at \p OpIdx and \p Lane.
3097 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3098 return getData(OpIdx, Lane).V;
3099 }
3100
3101 /// \returns true if the data structure is empty.
3102 bool empty() const { return OpsVec.empty(); }
3103
3104 /// Clears the data.
3105 void clear() { OpsVec.clear(); }
3106
3107 /// \returns true if there are enough operands identical to \p Op to fill
3108 /// the whole vector (possibly mixed with constants or loop-invariant values).
3109 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3110 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3111 assert(Op == getValue(OpIdx, Lane) &&
3112 "Op is expected to be getValue(OpIdx, Lane).");
3113 // Small number of loads - try load matching.
3114 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3115 return false;
3116 bool OpAPO = getData(OpIdx, Lane).APO;
3117 bool IsInvariant = L && L->isLoopInvariant(Op);
3118 unsigned Cnt = 0;
3119 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3120 if (Ln == Lane)
3121 continue;
3122 // This is set to true if we found a candidate for broadcast at Lane.
3123 bool FoundCandidate = false;
3124 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3125 OperandData &Data = getData(OpI, Ln);
3126 if (Data.APO != OpAPO || Data.IsUsed)
3127 continue;
3128 Value *OpILane = getValue(OpI, Lane);
3129 bool IsConstantOp = isa<Constant>(OpILane);
3130 // Consider the broadcast candidate if:
3131 // 1. Same value is found in one of the operands.
3132 if (Data.V == Op ||
3133 // 2. The operand in the given lane is not constant but there is a
3134 // constant operand in another lane (which can be moved to the
3135 // given lane). In this case we can represent it as a simple
3136 // permutation of constant and broadcast.
3137 (!IsConstantOp &&
3138 ((Lns > 2 && isa<Constant>(Data.V)) ||
3139 // 2.1. If we have only 2 lanes, need to check that value in the
3140 // next lane does not build same opcode sequence.
3141 (Lns == 2 &&
3142 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3143 isa<Constant>(Data.V)))) ||
3144 // 3. The operand in the current lane is loop invariant (can be
3145 // hoisted out) and another operand is also a loop invariant
3146 // (though not a constant). In this case the whole vector can be
3147 // hoisted out.
3148 // FIXME: need to teach the cost model about this case for better
3149 // estimation.
3150 (IsInvariant && !isa<Constant>(Data.V) &&
3151 !getSameOpcode({Op, Data.V}, TLI) &&
3152 L->isLoopInvariant(Data.V))) {
3153 FoundCandidate = true;
3154 Data.IsUsed = Data.V == Op;
3155 if (Data.V == Op)
3156 ++Cnt;
3157 break;
3158 }
3159 }
3160 if (!FoundCandidate)
3161 return false;
3162 }
3163 return getNumLanes() == 2 || Cnt > 1;
3164 }
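  // Illustrative example (hypothetical values): with 4 lanes and an operand
  // column {X, X, X, C}, where C is a constant, shouldBroadcast(X, OpIdx, 0)
  // finds X itself in lanes 1 and 2 (Cnt == 2) and accepts the constant C in
  // lane 3 via rule 2 (it can be blended into a broadcast of X), so it returns
  // true, assuming no other operand column matches first.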
3165
3166 /// Checks if there is at least one operand in a lane other than \p Lane that
3167 /// is compatible with the operand \p Op.
3168 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3169 assert(Op == getValue(OpIdx, Lane) &&
3170 "Op is expected to be getValue(OpIdx, Lane).");
3171 bool OpAPO = getData(OpIdx, Lane).APO;
3172 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3173 if (Ln == Lane)
3174 continue;
3175 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3176 const OperandData &Data = getData(OpI, Ln);
3177 if (Data.APO != OpAPO || Data.IsUsed)
3178 return true;
3179 Value *OpILn = getValue(OpI, Ln);
3180 return (L && L->isLoopInvariant(OpILn)) ||
3181 (getSameOpcode({Op, OpILn}, TLI) &&
3182 allSameBlock({Op, OpILn}));
3183 }))
3184 return true;
3185 }
3186 return false;
3187 }
3188
3189 public:
3190 /// Initialize with all the operands of the instruction vector \p RootVL.
3192 const InstructionsState &S, const BoUpSLP &R)
3193 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3194 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3195 // Append all the operands of RootVL.
3196 appendOperands(RootVL, Operands, S);
3197 }
3198
3199 /// \returns a value vector with the operands across all lanes for the
3200 /// operand at \p OpIdx.
3201 ValueList getVL(unsigned OpIdx) const {
3202 ValueList OpVL(OpsVec[OpIdx].size());
3203 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3204 "Expected same num of lanes across all operands");
3205 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3206 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3207 return OpVL;
3208 }
3209
3210 // Performs operand reordering for 2 or more operands.
3211 // The original operands are in OpsVec[OpIdx][Lane] and are reordered
3212 // in place.
3213 void reorder() {
3214 unsigned NumOperands = getNumOperands();
3215 unsigned NumLanes = getNumLanes();
3216 // Each operand has its own mode. We are using this mode to help us select
3217 // the instructions for each lane, so that they match best with the ones
3218 // we have selected so far.
3219 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3220
3221 // This is a greedy single-pass algorithm. We are going over each lane
3222 // once and deciding on the best order right away with no back-tracking.
3223 // However, in order to increase its effectiveness, we start with the lane
3224 // that has operands that can move the least. For example, given the
3225 // following lanes:
3226 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3227 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3228 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3229 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3230 // we will start at Lane 1, since the operands of the subtraction cannot
3231 // be reordered. Then we will visit the rest of the lanes in a circular
3232 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3233
3234 // Find the first lane that we will start our search from.
3235 unsigned FirstLane = getBestLaneToStartReordering();
3236
3237 // Initialize the modes.
3238 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3239 Value *OpLane0 = getValue(OpIdx, FirstLane);
3240 // Keep track if we have instructions with all the same opcode on one
3241 // side.
3242 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3243 // Check if OpLane0 should be broadcast.
3244 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3245 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3246 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3247 else if (isa<LoadInst>(OpILane0))
3248 ReorderingModes[OpIdx] = ReorderingMode::Load;
3249 else
3250 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3251 } else if (isa<Constant>(OpLane0)) {
3252 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3253 } else if (isa<Argument>(OpLane0)) {
3254 // Our best hope is a Splat. It may save some cost in some cases.
3255 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3256 } else {
3257 llvm_unreachable("Unexpected value kind.");
3258 }
3259 }
3260
3261 // Check that we don't have the same operands. There is no need to reorder
3262 // if the operands are just a perfect diamond or a shuffled diamond match.
3263 // Skip this only for possible broadcasts or a non-power-of-2 number of
3264 // scalars (just for now).
3265 auto &&SkipReordering = [this]() {
3266 SmallPtrSet<Value *, 4> UniqueValues;
3267 ArrayRef<OperandData> Op0 = OpsVec.front();
3268 for (const OperandData &Data : Op0)
3269 UniqueValues.insert(Data.V);
3271 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3272 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3273 return !UniqueValues.contains(Data.V);
3274 }))
3275 return false;
3276 }
3277 // TODO: Check if we can remove the check for a non-power-of-2 number of
3278 // scalars after full support of non-power-of-2 vectorization.
3279 return UniqueValues.size() != 2 &&
3280 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3281 UniqueValues.size());
3282 };
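      // Illustrative example: if Operand 0 across the lanes is {a, b, c, d}
      // and Operand 1 is {b, a, d, c}, both use the same four values, so the
      // node is a (shuffled) diamond match; with 4 unique power-of-2 scalars
      // the lambda returns true and reordering is skipped.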
3283
3284 // If the initial strategy fails for any of the operand indexes, then we
3285 // perform reordering again in a second pass. This helps avoid assigning
3286 // high priority to the failed strategy, and should improve reordering for
3287 // the non-failed operand indexes.
3288 for (int Pass = 0; Pass != 2; ++Pass) {
3289 // Check if there is no need to reorder the operands: they already form a
3290 // perfect or shuffled diamond match.
3291 // Need to do it to avoid extra external use cost counting for
3292 // shuffled matches, which may cause regressions.
3293 if (SkipReordering())
3294 break;
3295 // Skip the second pass if the first pass did not fail.
3296 bool StrategyFailed = false;
3297 // Mark all operand data as free to use.
3298 clearUsed();
3299 // We keep the original operand order for the FirstLane, so reorder the
3300 // rest of the lanes. We are visiting the nodes in a circular fashion,
3301 // using FirstLane as the center point and increasing the radius
3302 // distance.
3303 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3304 for (unsigned I = 0; I < NumOperands; ++I)
3305 MainAltOps[I].push_back(getData(I, FirstLane).V);
3306
3307 SmallBitVector UsedLanes(NumLanes);
3308 UsedLanes.set(FirstLane);
3309 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3310 // Visit the lane on the right and then the lane on the left.
3311 for (int Direction : {+1, -1}) {
3312 int Lane = FirstLane + Direction * Distance;
3313 if (Lane < 0 || Lane >= (int)NumLanes)
3314 continue;
3315 UsedLanes.set(Lane);
3316 int LastLane = Lane - Direction;
3317 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3318 "Out of bounds");
3319 // Look for a good match for each operand.
3320 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3321 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3322 std::optional<unsigned> BestIdx =
3323 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3324 MainAltOps[OpIdx], UsedLanes);
3325 // By not selecting a value, we allow the operands that follow to
3326 // select a better matching value. We will get a non-null value in
3327 // the next run of getBestOperand().
3328 if (BestIdx) {
3329 // Swap the current operand with the one returned by
3330 // getBestOperand().
3331 swap(OpIdx, *BestIdx, Lane);
3332 } else {
3333 // Enable the second pass.
3334 StrategyFailed = true;
3335 }
3336 // Try to get the alternate opcode and follow it during analysis.
3337 if (MainAltOps[OpIdx].size() != 2) {
3338 OperandData &AltOp = getData(OpIdx, Lane);
3339 InstructionsState OpS =
3340 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3341 if (OpS && OpS.isAltShuffle())
3342 MainAltOps[OpIdx].push_back(AltOp.V);
3343 }
3344 }
3345 }
3346 }
3347 // Skip second pass if the strategy did not fail.
3348 if (!StrategyFailed)
3349 break;
3350 }
3351 }
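  // A minimal standalone sketch (not part of the pass) of the circular lane
  // visitation order used above; the helper name is hypothetical:
  // \code
  //   SmallVector<int> circularOrder(int FirstLane, int NumLanes) {
  //     SmallVector<int> Order{FirstLane};
  //     for (int Distance = 1; Distance != NumLanes; ++Distance)
  //       for (int Direction : {+1, -1}) {
  //         int Lane = FirstLane + Direction * Distance;
  //         if (Lane >= 0 && Lane < NumLanes)
  //           Order.push_back(Lane);
  //       }
  //     return Order;
  //   }
  //   // circularOrder(1, 4) == {1, 2, 0, 3}, matching the Lane 0-3 example
  //   // in the comment above.
  // \endcode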
3352
3353#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3354 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3355 switch (RMode) {
3356 case ReorderingMode::Load:
3357 return "Load";
3358 case ReorderingMode::Opcode:
3359 return "Opcode";
3360 case ReorderingMode::Constant:
3361 return "Constant";
3362 case ReorderingMode::Splat:
3363 return "Splat";
3364 case ReorderingMode::Failed:
3365 return "Failed";
3366 }
3367 llvm_unreachable("Unimplemented Reordering Type");
3368 }
3369
3370 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3371 raw_ostream &OS) {
3372 return OS << getModeStr(RMode);
3373 }
3374
3375 /// Debug print.
3376 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3377 printMode(RMode, dbgs());
3378 }
3379
3380 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3381 return printMode(RMode, OS);
3382 }
3383
3385 const unsigned Indent = 2;
3386 unsigned Cnt = 0;
3387 for (const OperandDataVec &OpDataVec : OpsVec) {
3388 OS << "Operand " << Cnt++ << "\n";
3389 for (const OperandData &OpData : OpDataVec) {
3390 OS.indent(Indent) << "{";
3391 if (Value *V = OpData.V)
3392 OS << *V;
3393 else
3394 OS << "null";
3395 OS << ", APO:" << OpData.APO << "}\n";
3396 }
3397 OS << "\n";
3398 }
3399 return OS;
3400 }
3401
3402 /// Debug print.
3403 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3404#endif
3405 };
3406
3407 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3408 /// of the pair with the highest score, deemed to have the best chance of
3409 /// forming the root of a profitable tree to vectorize. Return std::nullopt if
3410 /// no candidate scored above LookAheadHeuristics::ScoreFail.
3411 /// \param Limit Lower limit of the cost, considered to be a good enough score.
3412 std::optional<int>
3413 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3414 int Limit = LookAheadHeuristics::ScoreFail) const {
3415 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3417 int BestScore = Limit;
3418 std::optional<int> Index;
3419 for (int I : seq<int>(0, Candidates.size())) {
3420 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3421 Candidates[I].second,
3422 /*U1=*/nullptr, /*U2=*/nullptr,
3423 /*CurrLevel=*/1, {});
3424 if (Score > BestScore) {
3425 BestScore = Score;
3426 Index = I;
3427 }
3428 }
3429 return Index;
3430 }
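  // Hypothetical usage sketch (names below are illustrative, not from this
  // file):
  // \code
  //   SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  //   // ... fill Candidates with potential root pairs ...
  //   if (std::optional<int> BestIdx = R.findBestRootPair(Candidates))
  //     tryToBuildTreeFor(Candidates[*BestIdx]); // hypothetical follow-up
  // \endcode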
3431
3432 /// Checks if the instruction is marked for deletion.
3433 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3434
3435 /// Removes an instruction from its block and eventually deletes it.
3436 /// It's like Instruction::eraseFromParent() except that the actual deletion
3437 /// is delayed until BoUpSLP is destructed.
3439 DeletedInstructions.insert(I);
3440 }
3441
3442 /// Remove instructions from the parent function and clear the operands of \p
3443 /// DeadVals instructions, marking trivially dead operands for deletion.
3444 template <typename T>
3446 ArrayRef<T *> DeadVals,
3447 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3449 for (T *V : DeadVals) {
3450 auto *I = cast<Instruction>(V);
3452 }
3453 DenseSet<Value *> Processed;
3454 for (T *V : DeadVals) {
3455 if (!V || !Processed.insert(V).second)
3456 continue;
3457 auto *I = cast<Instruction>(V);
3459 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3460 for (Use &U : I->operands()) {
3461 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3462 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3464 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3465 return Entry->VectorizedValue == OpI;
3466 })))
3467 DeadInsts.push_back(OpI);
3468 }
3469 I->dropAllReferences();
3470 }
3471 for (T *V : DeadVals) {
3472 auto *I = cast<Instruction>(V);
3473 if (!I->getParent())
3474 continue;
3475 assert((I->use_empty() || all_of(I->uses(),
3476 [&](Use &U) {
3477 return isDeleted(
3478 cast<Instruction>(U.getUser()));
3479 })) &&
3480 "trying to erase instruction with users.");
3481 I->removeFromParent();
3482 SE->forgetValue(I);
3483 }
3484 // Process the dead instruction list until empty.
3485 while (!DeadInsts.empty()) {
3486 Value *V = DeadInsts.pop_back_val();
3487 Instruction *VI = cast_or_null<Instruction>(V);
3488 if (!VI || !VI->getParent())
3489 continue;
3491 "Live instruction found in dead worklist!");
3492 assert(VI->use_empty() && "Instructions with uses are not dead.");
3493
3494 // Don't lose the debug info while deleting the instructions.
3495 salvageDebugInfo(*VI);
3496
3497 // Null out all of the instruction's operands to see if any operand
3498 // becomes dead as we go.
3499 for (Use &OpU : VI->operands()) {
3500 Value *OpV = OpU.get();
3501 if (!OpV)
3502 continue;
3503 OpU.set(nullptr);
3504
3505 if (!OpV->use_empty())
3506 continue;
3507
3508 // If the operand is an instruction that became dead as we nulled out
3509 // the operand, and if it is 'trivially' dead, delete it in a future
3510 // loop iteration.
3511 if (auto *OpI = dyn_cast<Instruction>(OpV))
3512 if (!DeletedInstructions.contains(OpI) &&
3513 (!OpI->getType()->isVectorTy() ||
3514 none_of(VectorValuesAndScales,
3515 [&](const std::tuple<Value *, unsigned, bool> &V) {
3516 return std::get<0>(V) == OpI;
3517 })) &&
3519 DeadInsts.push_back(OpI);
3520 }
3521
3522 VI->removeFromParent();
3523 eraseInstruction(VI);
3524 SE->forgetValue(VI);
3525 }
3526 }
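  // The cascade above is a deletion-delayed variant of the usual
  // trivially-dead worklist pattern (cf. RecursivelyDeleteTriviallyDeadInstructions
  // in llvm/Transforms/Utils/Local.h). A simplified sketch of that pattern:
  // \code
  //   SmallVector<Instruction *> Worklist; // seeded with known-dead roots
  //   while (!Worklist.empty()) {
  //     Instruction *I = Worklist.pop_back_val();
  //     for (Use &U : I->operands()) {
  //       auto *Op = dyn_cast_or_null<Instruction>(U.get());
  //       U.set(nullptr);
  //       if (Op && Op->use_empty() && isInstructionTriviallyDead(Op, TLI))
  //         Worklist.push_back(Op);
  //     }
  //     I->eraseFromParent();
  //   }
  // \endcode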
3527
3528 /// Checks if the instruction was already analyzed for being a possible
3529 /// reduction root.
3531 return AnalyzedReductionsRoots.count(I);
3532 }
3533 /// Register the given instruction as already analyzed for being a possible
3534 /// reduction root.
3536 AnalyzedReductionsRoots.insert(I);
3537 }
3538 /// Checks if the provided list of reduced values was checked already for
3539 /// vectorization.
3541 return AnalyzedReductionVals.contains(hash_value(VL));
3542 }
3543 /// Adds the list of reduced values to the list of values already checked for
3544 /// vectorization.
3546 AnalyzedReductionVals.insert(hash_value(VL));
3547 }
3548 /// Clear the list of the analyzed reduction root instructions.
3550 AnalyzedReductionsRoots.clear();
3551 AnalyzedReductionVals.clear();
3552 AnalyzedMinBWVals.clear();
3553 }
3554 /// Checks if the given value is gathered in one of the nodes.
3555 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3556 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3557 }
3558 /// Checks if the given value is gathered in one of the nodes.
3559 bool isGathered(const Value *V) const {
3560 return MustGather.contains(V);
3561 }
3562 /// Checks if the specified value was not scheduled.
3563 bool isNotScheduled(const Value *V) const {
3564 return NonScheduledFirst.contains(V);
3565 }
3566
3567 /// Check if the value is vectorized in the tree.
3568 bool isVectorized(const Value *V) const {
3569 assert(V && "V cannot be nullptr.");
3570 return ScalarToTreeEntries.contains(V);
3571 }
3572
3573 ~BoUpSLP();
3574
3575private:
3576 /// Determine if a node \p E can be demoted to a smaller type with a
3577 /// truncation. We collect the entries that will be demoted in ToDemote.
3578 /// \param E Node for analysis
3579 /// \param ToDemote indices of the nodes to be demoted.
3580 bool collectValuesToDemote(
3581 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3583 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3584 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3585
3586 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3587 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3588 /// they have only one user and are reorderable).
3589 /// \param ReorderableGathers List of all gather nodes that require reordering
3590 /// (e.g., gather of extractelements or partially vectorizable loads).
3591 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3592 /// reordering, subset of \p NonVectorized.
3593 void buildReorderableOperands(
3594 TreeEntry *UserTE,
3595 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3596 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3597 SmallVectorImpl<TreeEntry *> &GatherOps);
3598
3599 /// Checks if the given \p TE is a gather node with clustered reused scalars
3600 /// and reorders it per given \p Mask.
3601 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3602
3603 /// Checks if all users of \p I are the part of the vectorization tree.
3604 bool areAllUsersVectorized(
3605 Instruction *I,
3606 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3607
3608 /// Return information about the vector formed for the specified index
3609 /// of a vector of (the same) instruction.
3611
3612 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3613 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3614 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3615 return const_cast<TreeEntry *>(
3616 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3617 }
3618
3619 /// Gets the root instruction for the given node. If the node is a strided
3620 /// load/store node with the reverse order, the root instruction is the last
3621 /// one.
3622 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3623
3624 /// \returns Cast context for the given graph node.
3626 getCastContextHint(const TreeEntry &TE) const;
3627
3628 /// \returns the cost of the vectorizable entry.
3629 InstructionCost getEntryCost(const TreeEntry *E,
3630 ArrayRef<Value *> VectorizedVals,
3631 SmallPtrSetImpl<Value *> &CheckedExtracts);
3632
3633 /// Checks if it is legal and profitable to build a SplitVectorize node for the
3634 /// given \p VL.
3635 /// \param Op1 first homogeneous scalars.
3636 /// \param Op2 second homogeneous scalars.
3637 /// \param ReorderIndices indices to reorder the scalars.
3638 /// \returns true if the node was successfully built.
3639 bool canBuildSplitNode(ArrayRef<Value *> VL,
3640 const InstructionsState &LocalState,
3643 OrdersType &ReorderIndices) const;
3644
3645 /// This is the recursive part of buildTree.
3646 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3647 unsigned InterleaveFactor = 0);
3648
3649 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3650 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3651 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3652 /// returns false, setting \p CurrentOrder to either an empty vector or a
3653 /// non-identity permutation that allows reusing the extract instructions.
3654 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3655 /// extract order.
3656 bool canReuseExtract(ArrayRef<Value *> VL,
3657 SmallVectorImpl<unsigned> &CurrentOrder,
3658 bool ResizeAllowed = false) const;
3659
3660 /// Vectorize a single entry in the tree.
3661 Value *vectorizeTree(TreeEntry *E);
3662
3663 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3664 /// \p E.
3665 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3666
3667 /// Create a new vector from a list of scalar values. Produces a sequence
3668 /// which exploits values reused across lanes, and arranges the inserts
3669 /// for ease of later optimization.
3670 template <typename BVTy, typename ResTy, typename... Args>
3671 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3672
3673 /// Create a new vector from a list of scalar values. Produces a sequence
3674 /// which exploits values reused across lanes, and arranges the inserts
3675 /// for ease of later optimization.
3676 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3677
3678 /// Returns the instruction in the bundle, which can be used as a base point
3679 /// for scheduling. Usually it is the last instruction in the bundle, except
3680 /// for the case when all operands are external (in this case, it is the first
3681 /// instruction in the list).
3682 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3683
3684 /// Tries to find extractelement instructions with constant indices from fixed
3685 /// vector type and gather such instructions into a bunch, which most likely
3686 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt
3687 /// was successful, the matched scalars are replaced by poison values in \p VL
3688 /// for future analysis.
3689 std::optional<TargetTransformInfo::ShuffleKind>
3690 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3691 SmallVectorImpl<int> &Mask) const;
3692
3693 /// Tries to find extractelement instructions with constant indices from fixed
3694 /// vector type and gather such instructions into a bunch, which most likely
3695 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt
3696 /// was successful, the matched scalars are replaced by poison values in \p VL
3697 /// for future analysis.
3699 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3701 unsigned NumParts) const;
3702
3703 /// Checks if the gathered \p VL can be represented as a single register
3704 /// shuffle(s) of previous tree entries.
3705 /// \param TE Tree entry checked for permutation.
3706 /// \param VL List of scalars (a subset of the TE scalars), checked for
3707 /// permutations. Must form single-register vector.
3708 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3709 /// commands to build the mask using the original vector value, without
3710 /// relying on the potential reordering.
3711 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3712 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3713 std::optional<TargetTransformInfo::ShuffleKind>
3714 isGatherShuffledSingleRegisterEntry(
3715 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3716 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3717 bool ForOrder);
3718
3719 /// Checks if the gathered \p VL can be represented as multi-register
3720 /// shuffle(s) of previous tree entries.
3721 /// \param TE Tree entry checked for permutation.
3722 /// \param VL List of scalars (a subset of the TE scalars), checked for
3723 /// permutations.
3724 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3725 /// commands to build the mask using the original vector value, without
3726 /// relying on the potential reordering.
3727 /// \returns per-register series of ShuffleKind, if gathered values can be
3728 /// represented as shuffles of previous tree entries. \p Mask is filled with
3729 /// the shuffle mask (also on per-register base).
3731 isGatherShuffledEntry(
3732 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3734 unsigned NumParts, bool ForOrder = false);
3735
3736 /// \returns the cost of gathering (inserting) the values in \p VL into a
3737 /// vector.
3738 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3739 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3740 Type *ScalarTy) const;
3741
3742 /// Set the Builder insert point to one after the last instruction in
3743 /// the bundle
3744 void setInsertPointAfterBundle(const TreeEntry *E);
3745
3746 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3747 /// specified, the starting vector value is poison.
3748 Value *
3749 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3750 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3751
3752 /// \returns whether the VectorizableTree is fully vectorizable and will
3753 /// be beneficial even if the tree height is tiny.
3754 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3755
3756 /// Run through the list of all gathered loads in the graph and try to find
3757 /// vector loads/masked gathers instead of regular gathers. Later these loads
3758 /// are reshuffled to build the final gathered nodes.
3759 void tryToVectorizeGatheredLoads(
3760 const SmallMapVector<
3761 std::tuple<BasicBlock *, Value *, Type *>,
3762 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3763 &GatheredLoads);
3764
3765 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3766 /// users of \p TE and collects the stores. It returns the map from the store
3767 /// pointers to the collected stores.
3769 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3770
3771 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3772 /// stores in \p StoresVec can form a vector instruction. If so it returns
3773 /// true and populates \p ReorderIndices with the shuffle indices of the
3774 /// stores when compared to the sorted vector.
3775 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3776 OrdersType &ReorderIndices) const;
3777
3778 /// Iterates through the users of \p TE, looking for scalar stores that can be
3779 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3780 /// their order and builds an order index vector for each store bundle. It
3781 /// returns all these order vectors found.
3782 /// We run this after the tree has formed, otherwise we may come across user
3783 /// instructions that are not yet in the tree.
3785 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3786
3787 /// Tries to reorder the gathering node for better vectorization
3788 /// opportunities.
3789 void reorderGatherNode(TreeEntry &TE);
3790
3791 class TreeEntry {
3792 public:
3793 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3794 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3795
3796 /// \returns Common mask for reorder indices and reused scalars.
3797 SmallVector<int> getCommonMask() const {
3798 if (State == TreeEntry::SplitVectorize)
3799 return {};
3801 inversePermutation(ReorderIndices, Mask);
3802 ::addMask(Mask, ReuseShuffleIndices);
3803 return Mask;
3804 }
3805
3806 /// \returns The mask for split nodes.
3807 SmallVector<int> getSplitMask() const {
3808 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3809 "Expected only split vectorize node.");
3810 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3811 unsigned CommonVF = std::max<unsigned>(
3812 CombinedEntriesWithIndices.back().second,
3813 Scalars.size() - CombinedEntriesWithIndices.back().second);
3814 for (auto [Idx, I] : enumerate(ReorderIndices))
3815 Mask[I] =
3816 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3817 ? CommonVF - CombinedEntriesWithIndices.back().second
3818 : 0);
3819 return Mask;
3820 }
3821
3822 /// Updates (reorders) SplitVectorize node according to the given mask \p
3823 /// Mask and order \p MaskOrder.
3824 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3825 ArrayRef<int> MaskOrder);
3826
3827 /// \returns true if the scalars in VL are equal to this entry.
3828 bool isSame(ArrayRef<Value *> VL) const {
3829 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3830 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3831 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3832 return VL.size() == Mask.size() &&
3833 std::equal(VL.begin(), VL.end(), Mask.begin(),
3834 [Scalars](Value *V, int Idx) {
3835 return (isa<UndefValue>(V) &&
3836 Idx == PoisonMaskElem) ||
3837 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3838 });
3839 };
3840 if (!ReorderIndices.empty()) {
3841 // TODO: implement matching if the nodes are just reordered, still can
3842 // treat the vector as the same if the list of scalars matches VL
3843 // directly, without reordering.
3845 inversePermutation(ReorderIndices, Mask);
3846 if (VL.size() == Scalars.size())
3847 return IsSame(Scalars, Mask);
3848 if (VL.size() == ReuseShuffleIndices.size()) {
3849 ::addMask(Mask, ReuseShuffleIndices);
3850 return IsSame(Scalars, Mask);
3851 }
3852 return false;
3853 }
3854 return IsSame(Scalars, ReuseShuffleIndices);
3855 }
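    // Illustrative example: with Scalars = {a, b, c} and ReorderIndices =
    // {2, 0, 1}, inversePermutation (assumed to build Mask[ReorderIndices[I]]
    // = I, as elsewhere in this file) yields Mask = {1, 2, 0}, so
    // isSame({b, c, a}) compares each VL[I] against Scalars[Mask[I]] and
    // returns true.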
3856
3857 /// \returns true if current entry has same operands as \p TE.
3858 bool hasEqualOperands(const TreeEntry &TE) const {
3859 if (TE.getNumOperands() != getNumOperands())
3860 return false;
3861 SmallBitVector Used(getNumOperands());
3862 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3863 unsigned PrevCount = Used.count();
3864 for (unsigned K = 0; K < E; ++K) {
3865 if (Used.test(K))
3866 continue;
3867 if (getOperand(K) == TE.getOperand(I)) {
3868 Used.set(K);
3869 break;
3870 }
3871 }
3872 // Check if we actually found the matching operand.
3873 if (PrevCount == Used.count())
3874 return false;
3875 }
3876 return true;
3877 }
3878
3879 /// \return Final vectorization factor for the node. Defined by the total
3880 /// number of vectorized scalars, including those used several times in the
3881 /// entry and counted in the \a ReuseShuffleIndices, if any.
3882 unsigned getVectorFactor() const {
3883 if (!ReuseShuffleIndices.empty())
3884 return ReuseShuffleIndices.size();
3885 return Scalars.size();
3886 };
3887
3888 /// Checks if the current node is a gather node.
3889 bool isGather() const { return State == NeedToGather; }
3890
3891 /// A vector of scalars.
3892 ValueList Scalars;
3893
3894 /// The Scalars are vectorized into this value. It is initialized to Null.
3895 WeakTrackingVH VectorizedValue = nullptr;
3896
3897 /// Do we need to gather this sequence or vectorize it
3898 /// (either with vector instruction or with scatter/gather
3899 /// intrinsics for store/load)?
3900 enum EntryState {
3901 Vectorize, ///< The node is regularly vectorized.
3902 ScatterVectorize, ///< Masked scatter/gather node.
3903 StridedVectorize, ///< Strided loads (and stores)
3904 CompressVectorize, ///< (Masked) load with compress.
3905 NeedToGather, ///< Gather/buildvector node.
3906 CombinedVectorize, ///< Vectorized node, combined with its user into more
3907 ///< complex node like select/cmp to minmax, mul/add to
3908 ///< fma, etc. Must be used for the following nodes in
3909 ///< the pattern, not the very first one.
3910 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3911 ///< independently and then combines back.
3912 };
3913 EntryState State;
3914
3915 /// List of combined opcodes supported by the vectorizer.
3916 enum CombinedOpcode {
3917 NotCombinedOp = -1,
3918 MinMax = Instruction::OtherOpsEnd + 1,
3919 FMulAdd,
3920 };
3921 CombinedOpcode CombinedOp = NotCombinedOp;
3922
3923 /// Does this sequence require some shuffling?
3924 SmallVector<int, 4> ReuseShuffleIndices;
3925
3926 /// Does this entry require reordering?
3927 SmallVector<unsigned, 4> ReorderIndices;
3928
3929 /// Points back to the VectorizableTree.
3930 ///
3931 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3932 /// to be a pointer and needs to be able to initialize the child iterator.
3933 /// Thus we need a reference back to the container to translate the indices
3934 /// to entries.
3935 VecTreeTy &Container;
3936
3937 /// The TreeEntry index containing the user of this entry.
3938 EdgeInfo UserTreeIndex;
3939
3940 /// The index of this treeEntry in VectorizableTree.
3941 unsigned Idx = 0;
3942
3943 /// For gather/buildvector/alt opcode nodes, which are combined from
3944 /// other nodes as a series of insertvector instructions.
3945 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3946
3947 private:
3948 /// The operands of each instruction in each lane Operands[op_index][lane].
3949 /// Note: This helps avoid the replication of the code that performs the
3950 /// reordering of operands during buildTreeRec() and vectorizeTree().
3952
3953 /// Copyable elements of the entry node.
3954 SmallPtrSet<const Value *, 4> CopyableElements;
3955
3956 /// MainOp and AltOp are recorded inside. S should be obtained from
3957 /// newTreeEntry.
3958 InstructionsState S = InstructionsState::invalid();
3959
3960 /// Interleaving factor for interleaved loads Vectorize nodes.
3961 unsigned InterleaveFactor = 0;
3962
3963 /// True if the node does not require scheduling.
3964 bool DoesNotNeedToSchedule = false;
3965
3966 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3967 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3968 if (Operands.size() < OpIdx + 1)
3969 Operands.resize(OpIdx + 1);
3970 assert(Operands[OpIdx].empty() && "Already resized?");
3971 assert(OpVL.size() <= Scalars.size() &&
3972 "Number of operands is greater than the number of scalars.");
3973 Operands[OpIdx].resize(OpVL.size());
3974 copy(OpVL, Operands[OpIdx].begin());
3975 }
3976
3977 public:
3978 /// Returns interleave factor for interleave nodes.
3979 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3980 /// Sets interleaving factor for the interleaving nodes.
3981 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3982
3983 /// Marks the node as one that does not require scheduling.
3984 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
3985 /// Returns true if the node is marked as one that does not require
3986 /// scheduling.
3987 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
3988
3989 /// Set this bundle's operands from \p Operands.
3990 void setOperands(ArrayRef<ValueList> Operands) {
3991 for (unsigned I : seq<unsigned>(Operands.size()))
3992 setOperand(I, Operands[I]);
3993 }
3994
3995 /// Reorders operands of the node to the given mask \p Mask.
3996 void reorderOperands(ArrayRef<int> Mask) {
3997 for (ValueList &Operand : Operands)
3998 reorderScalars(Operand, Mask);
3999 }
4000
4001 /// \returns the \p OpIdx operand of this TreeEntry.
4002 ValueList &getOperand(unsigned OpIdx) {
4003 assert(OpIdx < Operands.size() && "Off bounds");
4004 return Operands[OpIdx];
4005 }
4006
4007 /// \returns the \p OpIdx operand of this TreeEntry.
4008 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4009 assert(OpIdx < Operands.size() && "Off bounds");
4010 return Operands[OpIdx];
4011 }
4012
4013 /// \returns the number of operands.
4014 unsigned getNumOperands() const { return Operands.size(); }
4015
4016 /// \return the single \p OpIdx operand.
4017 Value *getSingleOperand(unsigned OpIdx) const {
4018 assert(OpIdx < Operands.size() && "Off bounds");
4019 assert(!Operands[OpIdx].empty() && "No operand available");
4020 return Operands[OpIdx][0];
4021 }
4022
4023 /// Some of the instructions in the list have alternate opcodes.
4024 bool isAltShuffle() const { return S.isAltShuffle(); }
4025
4026 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4027 return S.getMatchingMainOpOrAltOp(I);
4028 }
4029
4030 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4031 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
4032 /// key is the main operation of this entry.
4033 Value *isOneOf(Value *Op) const {
4034 auto *I = dyn_cast<Instruction>(Op);
4035 if (I && getMatchingMainOpOrAltOp(I))
4036 return Op;
4037 return S.getMainOp();
4038 }
4039
4040 void setOperations(const InstructionsState &S) {
4041 assert(S && "InstructionsState is invalid.");
4042 this->S = S;
4043 }
4044
4045 Instruction *getMainOp() const { return S.getMainOp(); }
4046
4047 Instruction *getAltOp() const { return S.getAltOp(); }
4048
4049 /// The main/alternate opcodes for the list of instructions.
4050 unsigned getOpcode() const { return S.getOpcode(); }
4051
4052 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4053
4054 bool hasState() const { return S.valid(); }
4055
4056 /// Add \p V to the list of copyable elements.
4057 void addCopyableElement(Value *V) {
4058 assert(S.isCopyableElement(V) && "Not a copyable element.");
4059 CopyableElements.insert(V);
4060 }
4061
4062 /// Returns true if \p V is a copyable element.
4063 bool isCopyableElement(Value *V) const {
4064 return CopyableElements.contains(V);
4065 }
4066
4067 /// Returns true if any scalar in the list is a copyable element.
4068 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4069
4070 /// Returns the state of the operations.
4071 const InstructionsState &getOperations() const { return S; }
4072
4073 /// When ReuseShuffleIndices is empty it just returns the position of \p V
4074 /// within the vector of Scalars. Otherwise, tries to remap to its reuse index.
4075 unsigned findLaneForValue(Value *V) const {
4076 unsigned FoundLane = getVectorFactor();
4077 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4078 std::advance(It, 1)) {
4079 if (*It != V)
4080 continue;
4081 FoundLane = std::distance(Scalars.begin(), It);
4082 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4083 if (!ReorderIndices.empty())
4084 FoundLane = ReorderIndices[FoundLane];
4085 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4086 if (ReuseShuffleIndices.empty())
4087 break;
4088 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4089 RIt != ReuseShuffleIndices.end()) {
4090 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4091 break;
4092 }
4093 }
4094 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4095 return FoundLane;
4096 }
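    // Illustrative example: with Scalars = {a, b}, ReorderIndices = {1, 0} and
    // ReuseShuffleIndices = {1, 0, 1, 0}, findLaneForValue(b) first finds b at
    // position 1, remaps it through ReorderIndices to 0, and finally returns 1,
    // the first position where 0 appears in ReuseShuffleIndices.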
4097
4098 /// Build a shuffle mask for graph entry which represents a merge of main
4099 /// and alternate operations.
4100 void
4101 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4103 SmallVectorImpl<Value *> *OpScalars = nullptr,
4104 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4105
4106 /// Return true if this is a non-power-of-2 node.
4107 bool isNonPowOf2Vec() const {
4108 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4109 return IsNonPowerOf2;
4110 }
4111
4112 /// Return true if the number of elements in this node neither forms whole
4113 /// vector registers nor is a power of 2.
4114 bool
4115 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4116 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4117 TTI, getValueType(Scalars.front()), Scalars.size());
4118 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4119 "Reshuffling not supported with non-power-of-2 vectors yet.");
4120 return IsNonPowerOf2;
4121 }
4122
4123 Value *getOrdered(unsigned Idx) const {
4124 assert(isGather() && "Must be used only for buildvectors/gathers.");
4125 if (ReorderIndices.empty())
4126 return Scalars[Idx];
4128 inversePermutation(ReorderIndices, Mask);
4129 return Scalars[Mask[Idx]];
4130 }
4131
4132#ifndef NDEBUG
4133 /// Debug printer.
4134 LLVM_DUMP_METHOD void dump() const {
4135 dbgs() << Idx << ".\n";
4136 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4137 dbgs() << "Operand " << OpI << ":\n";
4138 for (const Value *V : Operands[OpI])
4139 dbgs().indent(2) << *V << "\n";
4140 }
4141 dbgs() << "Scalars: \n";
4142 for (Value *V : Scalars)
4143 dbgs().indent(2) << *V << "\n";
4144 dbgs() << "State: ";
4145 if (S && hasCopyableElements())
4146 dbgs() << "[[Copyable]] ";
4147 switch (State) {
4148 case Vectorize:
4149 if (InterleaveFactor > 0) {
4150 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4151 << "\n";
4152 } else {
4153 dbgs() << "Vectorize\n";
4154 }
4155 break;
4156 case ScatterVectorize:
4157 dbgs() << "ScatterVectorize\n";
4158 break;
4159 case StridedVectorize:
4160 dbgs() << "StridedVectorize\n";
4161 break;
4162 case CompressVectorize:
4163 dbgs() << "CompressVectorize\n";
4164 break;
4165 case NeedToGather:
4166 dbgs() << "NeedToGather\n";
4167 break;
4168 case CombinedVectorize:
4169 dbgs() << "CombinedVectorize\n";
4170 break;
4171 case SplitVectorize:
4172 dbgs() << "SplitVectorize\n";
4173 break;
4174 }
4175 if (S) {
4176 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4177 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4178 } else {
4179 dbgs() << "MainOp: NULL\n";
4180 dbgs() << "AltOp: NULL\n";
4181 }
4182 dbgs() << "VectorizedValue: ";
4183 if (VectorizedValue)
4184 dbgs() << *VectorizedValue << "\n";
4185 else
4186 dbgs() << "NULL\n";
4187 dbgs() << "ReuseShuffleIndices: ";
4188 if (ReuseShuffleIndices.empty())
4189 dbgs() << "Empty";
4190 else
4191 for (int ReuseIdx : ReuseShuffleIndices)
4192 dbgs() << ReuseIdx << ", ";
4193 dbgs() << "\n";
4194 dbgs() << "ReorderIndices: ";
4195 for (unsigned ReorderIdx : ReorderIndices)
4196 dbgs() << ReorderIdx << ", ";
4197 dbgs() << "\n";
4198 dbgs() << "UserTreeIndex: ";
4199 if (UserTreeIndex)
4200 dbgs() << UserTreeIndex;
4201 else
4202 dbgs() << "<invalid>";
4203 dbgs() << "\n";
4204 if (!CombinedEntriesWithIndices.empty()) {
4205 dbgs() << "Combined entries: ";
4206 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4207 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4208 });
4209 dbgs() << "\n";
4210 }
4211 }
4212#endif
4213 };
4214
4215#ifndef NDEBUG
4216 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4217 InstructionCost VecCost, InstructionCost ScalarCost,
4218 StringRef Banner) const {
4219 dbgs() << "SLP: " << Banner << ":\n";
4220 E->dump();
4221 dbgs() << "SLP: Costs:\n";
4222 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4223 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4224 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4225 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4226 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4227 }
4228#endif
4229
4230 /// Create a new gather TreeEntry
4231 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4232 const InstructionsState &S,
4233 const EdgeInfo &UserTreeIdx,
4234 ArrayRef<int> ReuseShuffleIndices = {}) {
4235 auto Invalid = ScheduleBundle::invalid();
4236 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4237 }
4238
4239 /// Create a new VectorizableTree entry.
4240 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4241 const InstructionsState &S,
4242 const EdgeInfo &UserTreeIdx,
4243 ArrayRef<int> ReuseShuffleIndices = {},
4244 ArrayRef<unsigned> ReorderIndices = {},
4245 unsigned InterleaveFactor = 0) {
4246 TreeEntry::EntryState EntryState =
4247 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4248 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4249 ReuseShuffleIndices, ReorderIndices);
4250 if (E && InterleaveFactor > 0)
4251 E->setInterleave(InterleaveFactor);
4252 return E;
4253 }
4254
4255 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4256 TreeEntry::EntryState EntryState,
4257 ScheduleBundle &Bundle, const InstructionsState &S,
4258 const EdgeInfo &UserTreeIdx,
4259 ArrayRef<int> ReuseShuffleIndices = {},
4260 ArrayRef<unsigned> ReorderIndices = {}) {
4261 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4262 EntryState == TreeEntry::SplitVectorize)) ||
4263 (Bundle && EntryState != TreeEntry::NeedToGather &&
4264 EntryState != TreeEntry::SplitVectorize)) &&
4265 "Need to vectorize gather entry?");
4266 // Gathered loads still gathered? Do not create entry, use the original one.
4267 if (GatheredLoadsEntriesFirst.has_value() &&
4268 EntryState == TreeEntry::NeedToGather && S &&
4269 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4270 !UserTreeIdx.UserTE)
4271 return nullptr;
4272 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4273 TreeEntry *Last = VectorizableTree.back().get();
4274 Last->Idx = VectorizableTree.size() - 1;
4275 Last->State = EntryState;
4276 if (UserTreeIdx.UserTE)
4277 OperandsToTreeEntry.try_emplace(
4278 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4279 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4280 // for non-power-of-two vectors.
4281 assert(
4283 ReuseShuffleIndices.empty()) &&
4284 "Reshuffling scalars not yet supported for nodes with padding");
4285 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4286 ReuseShuffleIndices.end());
4287 if (ReorderIndices.empty()) {
4288 Last->Scalars.assign(VL.begin(), VL.end());
4289 if (S)
4290 Last->setOperations(S);
4291 } else {
4292 // Reorder scalars and build final mask.
4293 Last->Scalars.assign(VL.size(), nullptr);
4294 transform(ReorderIndices, Last->Scalars.begin(),
4295 [VL](unsigned Idx) -> Value * {
4296 if (Idx >= VL.size())
4297 return UndefValue::get(VL.front()->getType());
4298 return VL[Idx];
4299 });
4300 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4301 if (S)
4302 Last->setOperations(S);
4303 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4304 }
4305 if (EntryState == TreeEntry::SplitVectorize) {
4306 assert(S && "Split nodes must have operations.");
4307 Last->setOperations(S);
4308 SmallPtrSet<Value *, 4> Processed;
4309 for (Value *V : VL) {
4310 auto *I = dyn_cast<Instruction>(V);
4311 if (!I)
4312 continue;
4313 auto It = ScalarsInSplitNodes.find(V);
4314 if (It == ScalarsInSplitNodes.end()) {
4315 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4316 (void)Processed.insert(V);
4317 } else if (Processed.insert(V).second) {
4318 assert(!is_contained(It->getSecond(), Last) &&
4319 "Value already associated with the node.");
4320 It->getSecond().push_back(Last);
4321 }
4322 }
4323 } else if (!Last->isGather()) {
4324 if (isa<PHINode>(S.getMainOp()) ||
4325 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4326 (!S.areInstructionsWithCopyableElements() &&
4327 doesNotNeedToSchedule(VL)) ||
4328 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4329 Last->setDoesNotNeedToSchedule();
4330 SmallPtrSet<Value *, 4> Processed;
4331 for (Value *V : VL) {
4332 if (isa<PoisonValue>(V))
4333 continue;
4334 if (S.isCopyableElement(V)) {
4335 Last->addCopyableElement(V);
4336 continue;
4337 }
4338 auto It = ScalarToTreeEntries.find(V);
4339 if (It == ScalarToTreeEntries.end()) {
4340 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4341 (void)Processed.insert(V);
4342 } else if (Processed.insert(V).second) {
4343 assert(!is_contained(It->getSecond(), Last) &&
4344 "Value already associated with the node.");
4345 It->getSecond().push_back(Last);
4346 }
4347 }
4348 // Update the scheduler bundle to point to this TreeEntry.
4349 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4350 "Bundle and VL out of sync");
4351 if (!Bundle.getBundle().empty()) {
4352#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4353 auto *BundleMember = Bundle.getBundle().begin();
4354 SmallPtrSet<Value *, 4> Processed;
4355 for (Value *V : VL) {
4356 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4357 continue;
4358 ++BundleMember;
4359 }
4360 assert(BundleMember == Bundle.getBundle().end() &&
4361 "Bundle and VL out of sync");
4362#endif
4363 Bundle.setTreeEntry(Last);
4364 }
4365 } else {
4366 // Build a map for gathered scalars to the nodes where they are used.
4367 bool AllConstsOrCasts = true;
4368 for (Value *V : VL) {
4369 if (S && S.areInstructionsWithCopyableElements() &&
4370 S.isCopyableElement(V))
4371 Last->addCopyableElement(V);
4372 if (!isConstant(V)) {
4373 auto *I = dyn_cast<CastInst>(V);
4374 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4375 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4376 !UserTreeIdx.UserTE->isGather())
4377 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4378 }
4379 }
4380 if (AllConstsOrCasts)
4381 CastMaxMinBWSizes =
4382 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4383 MustGather.insert_range(VL);
4384 }
4385
4386 if (UserTreeIdx.UserTE)
4387 Last->UserTreeIndex = UserTreeIdx;
4388 return Last;
4389 }
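  // Illustrative example of the ReorderIndices handling above: for
  // VL = {a, b, c, d} and ReorderIndices = {1, 0, 3, 2}, the new entry gets
  // Scalars = {b, a, d, c} (with UndefValue padding for any index >= VL.size()),
  // and the InstructionsState is recomputed for the reordered scalars.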
4390
4391 /// -- Vectorization State --
4392 /// Holds all of the tree entries.
4393 TreeEntry::VecTreeTy VectorizableTree;
4394
4395#ifndef NDEBUG
4396 /// Debug printer.
4397 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4398 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4399 VectorizableTree[Id]->dump();
4400 dbgs() << "\n";
4401 }
4402 }
4403#endif
4404
4405 /// Get list of vector entries, associated with the value \p V.
4406 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4407 assert(V && "V cannot be nullptr.");
4408 auto It = ScalarToTreeEntries.find(V);
4409 if (It == ScalarToTreeEntries.end())
4410 return {};
4411 return It->getSecond();
4412 }
4413
4414 /// Get list of split vector entries, associated with the value \p V.
4415 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4416 assert(V && "V cannot be nullptr.");
4417 auto It = ScalarsInSplitNodes.find(V);
4418 if (It == ScalarsInSplitNodes.end())
4419 return {};
4420 return It->getSecond();
4421 }
4422
4423 /// Returns first vector node for value \p V, matching values \p VL.
4424 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4425 bool SameVF = false) const {
4426 assert(V && "V cannot be nullptr.");
4427 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4428 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4429 return TE;
4430 return nullptr;
4431 }
4432
4433 /// Check that the operand node of the alternate node does not generate a
4434 /// buildvector sequence. If it does, it is probably not worth building an
4435 /// alternate shuffle when the number of buildvector operands plus the
4436 /// alternate instruction exceeds the number of buildvector instructions.
4437 /// \param S the instructions state of the analyzed values.
4438 /// \param VL list of the instructions with alternate opcodes.
4439 bool areAltOperandsProfitable(const InstructionsState &S,
4440 ArrayRef<Value *> VL) const;
4441
4442 /// Contains all the outputs of legality analysis for a list of values to
4443 /// vectorize.
4444 class ScalarsVectorizationLegality {
4445 InstructionsState S;
4446 bool IsLegal;
4447 bool TryToFindDuplicates;
4448 bool TrySplitVectorize;
4449
4450 public:
4451 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4452 bool TryToFindDuplicates = true,
4453 bool TrySplitVectorize = false)
4454 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4455 TrySplitVectorize(TrySplitVectorize) {
4456 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4457 "Inconsistent state");
4458 }
4459 const InstructionsState &getInstructionsState() const { return S; };
4460 bool isLegal() const { return IsLegal; }
4461 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4462 bool trySplitVectorize() const { return TrySplitVectorize; }
4463 };
4464
4465 /// Checks if the specified list of the instructions/values can be vectorized
4466 /// in general.
4467 ScalarsVectorizationLegality
4468 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4469 const EdgeInfo &UserTreeIdx,
4470 bool TryCopyableElementsVectorization) const;
4471
4472 /// Checks if the specified list of the instructions/values can be vectorized
4473 /// and fills required data before actual scheduling of the instructions.
4474 TreeEntry::EntryState
4475 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
4476 bool IsScatterVectorizeUserTE,
4477 OrdersType &CurrentOrder,
4478 SmallVectorImpl<Value *> &PointerOps);
4479
4480 /// Maps a specific scalar to its tree entry(ies).
4482
4483 /// Maps the operand index and entry to the corresponding tree entry.
4485 OperandsToTreeEntry;
4486
4487 /// Scalars, used in split vectorize nodes.
4489
4490 /// Maps a value to the proposed vectorizable size.
4491 SmallDenseMap<Value *, unsigned> InstrElementSize;
4492
4493 /// A list of scalars that we found that we need to keep as scalars.
4494 ValueSet MustGather;
4495
4496 /// A set of first non-schedulable values.
4497 ValueSet NonScheduledFirst;
4498
4499 /// A map between the vectorized entries and the last instructions in the
4500 /// bundles. The bundles are built in use order, not in the def order of the
4501 /// instructions. So, we cannot rely directly on the last instruction in the
4502 /// bundle being the last instruction in program order during the
4503 /// vectorization process, since the basic blocks are modified; the last
4504 /// instructions need to be pre-gathered beforehand.
4506
4507 /// List of gather nodes that depend on other gather/vector nodes and should
4508 /// be emitted after the vector instruction emission process to correctly
4509 /// handle the order of the vector instructions and shuffles.
4510 SetVector<const TreeEntry *> PostponedGathers;
4511
4512 using ValueToGatherNodesMap =
4514 ValueToGatherNodesMap ValueToGatherNodes;
4515
4516 /// A list of the load entries (node indices) that can be vectorized using a
4517 /// strided or masked gather approach, but which we first attempt to represent
4518 /// as contiguous loads.
4519 SetVector<unsigned> LoadEntriesToVectorize;
4520
4521 /// true if graph nodes transforming mode is on.
4522 bool IsGraphTransformMode = false;
4523
4524 /// The index of the first gathered load entry in the VectorizeTree.
4525 std::optional<unsigned> GatheredLoadsEntriesFirst;
4526
4527 /// Maps compress entries to their mask data for the final codegen.
4528 SmallDenseMap<const TreeEntry *,
4529 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4530 CompressEntryToData;
4531
4532 /// This POD struct describes one external user in the vectorized tree.
4533 struct ExternalUser {
4534 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4535 : Scalar(S), User(U), E(E), Lane(L) {}
4536
4537 /// Which scalar in our function.
4538 Value *Scalar = nullptr;
4539
4540 /// Which user that uses the scalar.
4541 llvm::User *User = nullptr;
4542
4543 /// Vector node, the value is part of.
4544 const TreeEntry &E;
4545
4546 /// Which lane does the scalar belong to.
4547 unsigned Lane;
4548 };
4549 using UserList = SmallVector<ExternalUser, 16>;
4550
4551 /// Checks if two instructions may access the same memory.
4552 ///
4553 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4554 /// is invariant in the calling loop.
4555 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4556 Instruction *Inst2) {
4557 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4558 // First check if the result is already in the cache.
4559 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4560 auto Res = AliasCache.try_emplace(Key);
4561 if (!Res.second)
4562 return Res.first->second;
4563 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4564 // Store the result in the cache.
4565 Res.first->getSecond() = Aliased;
4566 return Aliased;
4567 }
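  // Illustrative note (added, not in the original source): because
  // AliasCacheKey is an ordered pair, a repeated query for the same
  // (Inst1, Inst2) pair returns the cached bool without invoking BatchAA
  // again, while the reversed pair (Inst2, Inst1) is cached as a separate
  // entry.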
4568
4569 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4570
4571 /// Cache for alias results.
4572 /// TODO: consider moving this to the AliasAnalysis itself.
4574
4575 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4576 // globally through SLP because we don't perform any action which
4577 // invalidates capture results.
4578 BatchAAResults BatchAA;
4579
4580 /// Temporary store for deleted instructions. Instructions will be deleted
4581 /// eventually when the BoUpSLP is destructed. The deferral is required to
4582 /// ensure that there are no incorrect collisions in the AliasCache, which
4583 /// can happen if a new instruction is allocated at the same address as a
4584 /// previously deleted instruction.
4585 DenseSet<Instruction *> DeletedInstructions;
4586
4587 /// Set of the instructions already analyzed for reductions.
4588 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4589
4590 /// Set of hashes for the list of reduction values already being analyzed.
4591 DenseSet<size_t> AnalyzedReductionVals;
4592
4593 /// Values that have already been analyzed for minimal bitwidth and found to
4594 /// be non-profitable.
4595 DenseSet<Value *> AnalyzedMinBWVals;
4596
4597 /// A list of values that need to be extracted out of the tree.
4598 /// This list holds pairs of (Internal Scalar : External User). External User
4599 /// can be nullptr, which means that this Internal Scalar will be used later,
4600 /// after vectorization.
4601 UserList ExternalUses;
4602
4603 /// A list of GEPs which can be replaced by scalar GEPs instead of
4604 /// extractelement instructions.
4605 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4606
4607 /// A list of scalars to be extracted without a specific user because of too
4608 /// many uses.
4609 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4610
4611 /// Values used only by @llvm.assume calls.
4613
4614 /// Holds all of the instructions that we gathered, shuffle instructions and
4615 /// extractelements.
4616 SetVector<Instruction *> GatherShuffleExtractSeq;
4617
4618 /// A list of blocks that we are going to CSE.
4619 DenseSet<BasicBlock *> CSEBlocks;
4620
4621 /// List of hashes of vectors of loads which are known to be non-vectorizable.
4622 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4623
4624 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4625 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a single
4626 /// instruction, while ScheduleBundle represents a batch of instructions that
4627 /// are going to be grouped together. ScheduleCopyableData models an extra user
4628 /// for "copyable" instructions.
4629 class ScheduleEntity {
4630 friend class ScheduleBundle;
4631 friend class ScheduleData;
4632 friend class ScheduleCopyableData;
4633
4634 protected:
4635 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4636 Kind getKind() const { return K; }
4637 ScheduleEntity(Kind K) : K(K) {}
4638
4639 private:
4640 /// Used for getting a "good" final ordering of instructions.
4641 int SchedulingPriority = 0;
4642 /// True if this instruction (or bundle) is scheduled (or considered as
4643 /// scheduled in the dry-run).
4644 bool IsScheduled = false;
4645 /// The kind of the ScheduleEntity.
4646 const Kind K = Kind::ScheduleData;
4647
4648 public:
4649 ScheduleEntity() = delete;
4650 /// Gets/sets the scheduling priority.
4651 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4652 int getSchedulingPriority() const { return SchedulingPriority; }
4653 bool isReady() const {
4654 if (const auto *SD = dyn_cast<ScheduleData>(this))
4655 return SD->isReady();
4656 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4657 return CD->isReady();
4658 return cast<ScheduleBundle>(this)->isReady();
4659 }
4660 /// Returns true if the dependency information has been calculated.
4661 /// Note that dependency validity can vary between instructions within
4662 /// a single bundle.
4663 bool hasValidDependencies() const {
4664 if (const auto *SD = dyn_cast<ScheduleData>(this))
4665 return SD->hasValidDependencies();
4666 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4667 return CD->hasValidDependencies();
4668 return cast<ScheduleBundle>(this)->hasValidDependencies();
4669 }
4670 /// Gets the number of unscheduled dependencies.
4671 int getUnscheduledDeps() const {
4672 if (const auto *SD = dyn_cast<ScheduleData>(this))
4673 return SD->getUnscheduledDeps();
4674 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4675 return CD->getUnscheduledDeps();
4676 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4677 }
4678 /// Increments the number of unscheduled dependencies.
4679 int incrementUnscheduledDeps(int Incr) {
4680 if (auto *SD = dyn_cast<ScheduleData>(this))
4681 return SD->incrementUnscheduledDeps(Incr);
4682 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4683 }
4684 /// Gets the number of dependencies.
4685 int getDependencies() const {
4686 if (const auto *SD = dyn_cast<ScheduleData>(this))
4687 return SD->getDependencies();
4688 return cast<ScheduleCopyableData>(this)->getDependencies();
4689 }
4690 /// Gets the instruction.
4691 Instruction *getInst() const {
4692 if (const auto *SD = dyn_cast<ScheduleData>(this))
4693 return SD->getInst();
4694 return cast<ScheduleCopyableData>(this)->getInst();
4695 }
4696
4697 /// Gets/sets if the bundle is scheduled.
4698 bool isScheduled() const { return IsScheduled; }
4699 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4700
4701 static bool classof(const ScheduleEntity *) { return true; }
4702
4703#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4704 void dump(raw_ostream &OS) const {
4705 if (const auto *SD = dyn_cast<ScheduleData>(this))
4706 return SD->dump(OS);
4707 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4708 return CD->dump(OS);
4709 return cast<ScheduleBundle>(this)->dump(OS);
4710 }
4711
4712 LLVM_DUMP_METHOD void dump() const {
4713 dump(dbgs());
4714 dbgs() << '\n';
4715 }
4716#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4717 };
4718
4719#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4721 const BoUpSLP::ScheduleEntity &SE) {
4722 SE.dump(OS);
4723 return OS;
4724 }
4725#endif
4726
4727 /// Contains all scheduling relevant data for an instruction.
4728 /// A ScheduleData either represents a single instruction or a member of an
4729 /// instruction bundle (= a group of instructions which is combined into a
4730 /// vector instruction).
4731 class ScheduleData final : public ScheduleEntity {
4732 public:
4733 // The initial value for the dependency counters. It means that the
4734 // dependencies are not calculated yet.
4735 enum { InvalidDeps = -1 };
4736
4737 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4738 static bool classof(const ScheduleEntity *Entity) {
4739 return Entity->getKind() == Kind::ScheduleData;
4740 }
4741
4742 void init(int BlockSchedulingRegionID, Instruction *I) {
4743 NextLoadStore = nullptr;
4744 IsScheduled = false;
4745 SchedulingRegionID = BlockSchedulingRegionID;
4746 clearDependencies();
4747 Inst = I;
4748 }
4749
4750 /// Verify basic self consistency properties
4751 void verify() {
4752 if (hasValidDependencies()) {
4753 assert(UnscheduledDeps <= Dependencies && "invariant");
4754 } else {
4755 assert(UnscheduledDeps == Dependencies && "invariant");
4756 }
4757
4758 if (IsScheduled) {
4759 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4760 "unexpected scheduled state");
4761 }
4762 }
4763
4764 /// Returns true if the dependency information has been calculated.
4765 /// Note that dependency validity can vary between instructions within
4766 /// a single bundle.
4767 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4768
4769 /// Returns true if it is ready for scheduling, i.e. it has no more
4770 /// unscheduled depending instructions/bundles.
4771 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4772
4773 /// Modifies the number of unscheduled dependencies for this instruction,
4774 /// and returns the number of remaining dependencies for the containing
4775 /// bundle.
4776 int incrementUnscheduledDeps(int Incr) {
4777 assert(hasValidDependencies() &&
4778 "increment of unscheduled deps would be meaningless");
4779 UnscheduledDeps += Incr;
4780 return UnscheduledDeps;
4781 }
4782
4783 /// Sets the number of unscheduled dependencies to the number of
4784 /// dependencies.
4785 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4786
4787 /// Clears all dependency information.
4788 void clearDependencies() {
4789 clearDirectDependencies();
4790 MemoryDependencies.clear();
4791 ControlDependencies.clear();
4792 }
4793
4794 /// Clears only the direct dependencies, keeping the control and memory
4795 /// dependencies.
4796 /// Required for copyable elements to correctly handle control/memory deps
4797 /// and avoid extra recalculation of such deps.
4798 void clearDirectDependencies() {
4799 Dependencies = InvalidDeps;
4800 resetUnscheduledDeps();
4801 IsScheduled = false;
4802 }
4803
4804 /// Gets the number of unscheduled dependencies.
4805 int getUnscheduledDeps() const { return UnscheduledDeps; }
4806 /// Gets the number of dependencies.
4807 int getDependencies() const { return Dependencies; }
4808 /// Initializes the number of dependencies.
4809 void initDependencies() { Dependencies = 0; }
4810 /// Increments the number of dependencies.
4811 void incDependencies() { Dependencies++; }
4812
4813 /// Gets scheduling region ID.
4814 int getSchedulingRegionID() const { return SchedulingRegionID; }
4815
4816 /// Gets the instruction.
4817 Instruction *getInst() const { return Inst; }
4818
4819 /// Gets the list of memory dependencies.
4820 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4821 return MemoryDependencies;
4822 }
4823 /// Adds a memory dependency.
4824 void addMemoryDependency(ScheduleData *Dep) {
4825 MemoryDependencies.push_back(Dep);
4826 }
4827 /// Gets the list of control dependencies.
4828 ArrayRef<ScheduleData *> getControlDependencies() const {
4829 return ControlDependencies;
4830 }
4831 /// Adds a control dependency.
4832 void addControlDependency(ScheduleData *Dep) {
4833 ControlDependencies.push_back(Dep);
4834 }
4835 /// Gets/sets the next load/store instruction in the block.
4836 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4837 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4838
4839 void dump(raw_ostream &OS) const { OS << *Inst; }
4840
4841 LLVM_DUMP_METHOD void dump() const {
4842 dump(dbgs());
4843 dbgs() << '\n';
4844 }
4845
4846 private:
4847 Instruction *Inst = nullptr;
4848
4849 /// Singly linked list of all memory instructions (e.g. load, store, call)
4850 /// in the block - until the end of the scheduling region.
4851 ScheduleData *NextLoadStore = nullptr;
4852
4853 /// The dependent memory instructions.
4854 /// This list is derived on demand in calculateDependencies().
4855 SmallVector<ScheduleData *> MemoryDependencies;
4856
4857 /// List of instructions which this instruction could be control dependent
4858 /// on. Allowing such nodes to be scheduled below this one could introduce
4859 /// a runtime fault which didn't exist in the original program.
4860 /// ex: this is a load or udiv following a readonly call which infinitely loops
4861 SmallVector<ScheduleData *> ControlDependencies;
4862
4863 /// This ScheduleData is in the current scheduling region if this matches
4864 /// the current SchedulingRegionID of BlockScheduling.
4865 int SchedulingRegionID = 0;
4866
4867 /// The number of dependencies. Consists of the number of users of the
4868 /// instruction plus the number of dependent memory instructions (if any).
4869 /// This value is calculated on demand.
4870 /// If InvalidDeps, the number of dependencies is not calculated yet.
4871 int Dependencies = InvalidDeps;
4872
4873 /// The number of dependencies minus the number of dependencies of scheduled
4874 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4875 /// for scheduling.
4876 /// Note that this is negative as long as Dependencies is not calculated.
4877 int UnscheduledDeps = InvalidDeps;
4878 };
4879
4880#ifndef NDEBUG
4882 const BoUpSLP::ScheduleData &SD) {
4883 SD.dump(OS);
4884 return OS;
4885 }
4886#endif
4887
4888 class ScheduleBundle final : public ScheduleEntity {
4889 /// The schedule data for the instructions in the bundle.
4891 /// True if this bundle is valid.
4892 bool IsValid = true;
4893 /// The TreeEntry that this instruction corresponds to.
4894 TreeEntry *TE = nullptr;
4895 ScheduleBundle(bool IsValid)
4896 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4897
4898 public:
4899 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4900 static bool classof(const ScheduleEntity *Entity) {
4901 return Entity->getKind() == Kind::ScheduleBundle;
4902 }
4903
4904 /// Verify basic self consistency properties
4905 void verify() const {
4906 for (const ScheduleEntity *SD : Bundle) {
4907 if (SD->hasValidDependencies()) {
4908 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4909 "invariant");
4910 } else {
4911 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4912 "invariant");
4913 }
4914
4915 if (isScheduled()) {
4916 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4917 "unexpected scheduled state");
4918 }
4919 }
4920 }
4921
4922 /// Returns the number of unscheduled dependencies in the bundle.
4923 int unscheduledDepsInBundle() const {
4924 assert(*this && "bundle must not be empty");
4925 int Sum = 0;
4926 for (const ScheduleEntity *BundleMember : Bundle) {
4927 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4928 return ScheduleData::InvalidDeps;
4929 Sum += BundleMember->getUnscheduledDeps();
4930 }
4931 return Sum;
4932 }
4933
4934 /// Returns true if the dependency information has been calculated.
4935 /// Note that dependency validity can vary between instructions within
4936 /// a single bundle.
4937 bool hasValidDependencies() const {
4938 return all_of(Bundle, [](const ScheduleEntity *SD) {
4939 return SD->hasValidDependencies();
4940 });
4941 }
4942
4943 /// Returns true if it is ready for scheduling, i.e. it has no more
4944 /// unscheduled depending instructions/bundles.
4945 bool isReady() const {
4946 assert(*this && "bundle must not be empty");
4947 return unscheduledDepsInBundle() == 0 && !isScheduled();
4948 }
4949
4950 /// Returns the bundle of scheduling data, associated with the current
4951 /// instruction.
4952 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
4953 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
4954 /// Adds an instruction to the bundle.
4955 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
4956
4957 /// Gets/sets the associated tree entry.
4958 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4959 TreeEntry *getTreeEntry() const { return TE; }
4960
4961 static ScheduleBundle invalid() { return {false}; }
4962
4963 operator bool() const { return IsValid; }
4964
4965#ifndef NDEBUG
4966 void dump(raw_ostream &OS) const {
4967 if (!*this) {
4968 OS << "[]";
4969 return;
4970 }
4971 OS << '[';
4972 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
4973 if (isa<ScheduleCopyableData>(SD))
4974 OS << "<Copyable>";
4975 OS << *SD->getInst();
4976 });
4977 OS << ']';
4978 }
4979
4980 LLVM_DUMP_METHOD void dump() const {
4981 dump(dbgs());
4982 dbgs() << '\n';
4983 }
4984#endif // NDEBUG
4985 };
4986
4987#ifndef NDEBUG
4989 const BoUpSLP::ScheduleBundle &Bundle) {
4990 Bundle.dump(OS);
4991 return OS;
4992 }
4993#endif
4994
4995 /// Contains all scheduling relevant data for the copyable instruction.
4996 /// It models the virtual instructions that are supposed to replace the original
4997 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
4998 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
4999 /// instruction %virt = add %0, 0.
5000 class ScheduleCopyableData final : public ScheduleEntity {
5001 /// The source schedule data for the instruction.
5002 Instruction *Inst = nullptr;
5003 /// The edge information for the instruction.
5004 const EdgeInfo EI;
5005 /// This ScheduleData is in the current scheduling region if this matches
5006 /// the current SchedulingRegionID of BlockScheduling.
5007 int SchedulingRegionID = 0;
5008 /// Bundle, this data is part of.
5009 ScheduleBundle &Bundle;
5010
5011 public:
5012 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5013 const EdgeInfo &EI, ScheduleBundle &Bundle)
5014 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5015 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5016 static bool classof(const ScheduleEntity *Entity) {
5017 return Entity->getKind() == Kind::ScheduleCopyableData;
5018 }
5019
5020 /// Verify basic self consistency properties
5021 void verify() {
5022 if (hasValidDependencies()) {
5023 assert(UnscheduledDeps <= Dependencies && "invariant");
5024 } else {
5025 assert(UnscheduledDeps == Dependencies && "invariant");
5026 }
5027
5028 if (IsScheduled) {
5029 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5030 "unexpected scheduled state");
5031 }
5032 }
5033
5034 /// Returns true if the dependency information has been calculated.
5035 /// Note that dependency validity can vary between instructions within
5036 /// a single bundle.
5037 bool hasValidDependencies() const {
5038 return Dependencies != ScheduleData::InvalidDeps;
5039 }
5040
5041 /// Returns true if it is ready for scheduling, i.e. it has no more
5042 /// unscheduled depending instructions/bundles.
5043 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5044
5045 /// Modifies the number of unscheduled dependencies for this instruction,
5046 /// and returns the number of remaining dependencies for the containing
5047 /// bundle.
5048 int incrementUnscheduledDeps(int Incr) {
5049 assert(hasValidDependencies() &&
5050 "increment of unscheduled deps would be meaningless");
5051 UnscheduledDeps += Incr;
5052 assert(UnscheduledDeps >= 0 && "invariant");
5053 return UnscheduledDeps;
5054 }
5055
5056 /// Sets the number of unscheduled dependencies to the number of
5057 /// dependencies.
5058 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5059
5060 /// Gets the number of unscheduled dependencies.
5061 int getUnscheduledDeps() const { return UnscheduledDeps; }
5062 /// Gets the number of dependencies.
5063 int getDependencies() const { return Dependencies; }
5064 /// Initializes the number of dependencies.
5065 void initDependencies() { Dependencies = 0; }
5066 /// Increments the number of dependencies.
5067 void incDependencies() { Dependencies++; }
5068
5069 /// Gets scheduling region ID.
5070 int getSchedulingRegionID() const { return SchedulingRegionID; }
5071
5072 /// Gets the instruction.
5073 Instruction *getInst() const { return Inst; }
5074
5075 /// Clears all dependency information.
5076 void clearDependencies() {
5077 Dependencies = ScheduleData::InvalidDeps;
5078 UnscheduledDeps = ScheduleData::InvalidDeps;
5079 IsScheduled = false;
5080 }
5081
5082 /// Gets the edge information.
5083 const EdgeInfo &getEdgeInfo() const { return EI; }
5084
5085 /// Gets the bundle.
5086 ScheduleBundle &getBundle() { return Bundle; }
5087 const ScheduleBundle &getBundle() const { return Bundle; }
5088
5089#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5090 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5091
5092 LLVM_DUMP_METHOD void dump() const {
5093 dump(dbgs());
5094 dbgs() << '\n';
5095 }
5096#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5097
5098 private:
5099 /// The number of dependencies; InvalidDeps if not yet calculated. These nodes
5100 /// always have only a single dependency.
5101 int Dependencies = ScheduleData::InvalidDeps;
5102
5103 /// The number of dependencies minus the number of dependencies of scheduled
5104 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5105 /// for scheduling.
5106 /// Note that this is negative as long as Dependencies is not calculated.
5107 int UnscheduledDeps = ScheduleData::InvalidDeps;
5108 };
5109
5110#ifndef NDEBUG
5111 friend inline raw_ostream &
5112 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5113 SD.dump(OS);
5114 return OS;
5115 }
5116#endif
5117
5118 friend struct GraphTraits<BoUpSLP *>;
5119 friend struct DOTGraphTraits<BoUpSLP *>;
5120
5121 /// Contains all scheduling data for a basic block.
5122 /// It does not schedule instructions that are not memory read/write
5123 /// instructions and whose operands are either constants, or arguments, or
5124 /// phis, or instructions from other blocks, or whose users are phis or are in
5125 /// other blocks. The resulting vector instructions can be placed at the
5126 /// beginning of the basic block without scheduling (if the operands do not
5127 /// need to be scheduled) or at the end of the block (if the users are outside
5128 /// of the block). This saves some compile time and memory used by the
5129 /// compiler.
5130 /// ScheduleData is assigned to each instruction between the boundaries of
5131 /// the tree entry, even to those that are not part of the graph. It is
5132 /// required to correctly follow the dependencies between the instructions and
5133 /// to schedule them correctly. ScheduleData is not allocated for
5134 /// instructions that do not require scheduling, such as phis, nodes with
5135 /// extractelements/insertelements only, or nodes with instructions whose
5136 /// uses/operands are outside of the block.
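  ///
  /// Illustrative sketch (hypothetical IR, added for exposition and following
  /// the rules stated above): in a block such as
  ///   %p = phi i32 [ %a, %bb0 ], [ %b, %bb1 ]
  ///   %x = add i32 %arg0, %arg1
  ///   %l = load i32, ptr %q
  /// the phi %p and the add %x on function arguments would not need scheduling,
  /// while the load %l would, because it is a memory-reading instruction.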
5137 struct BlockScheduling {
5138 BlockScheduling(BasicBlock *BB)
5139 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5140
5141 void clear() {
5142 ScheduledBundles.clear();
5143 ScheduledBundlesList.clear();
5144 ScheduleCopyableDataMap.clear();
5145 ScheduleCopyableDataMapByInst.clear();
5146 ScheduleCopyableDataMapByInstUser.clear();
5147 ScheduleCopyableDataMapByUsers.clear();
5148 ReadyInsts.clear();
5149 ScheduleStart = nullptr;
5150 ScheduleEnd = nullptr;
5151 FirstLoadStoreInRegion = nullptr;
5152 LastLoadStoreInRegion = nullptr;
5153 RegionHasStackSave = false;
5154
5155 // Reduce the maximum schedule region size by the size of the
5156 // previous scheduling run.
5157 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5158 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5159 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5160 ScheduleRegionSize = 0;
5161
5162 // Make a new scheduling region, i.e. all existing ScheduleData is not
5163 // in the new region yet.
5164 ++SchedulingRegionID;
5165 }
5166
5167 ScheduleData *getScheduleData(Instruction *I) {
5168 if (!I)
5169 return nullptr;
5170 if (BB != I->getParent())
5171 // Avoid lookup if can't possibly be in map.
5172 return nullptr;
5173 ScheduleData *SD = ScheduleDataMap.lookup(I);
5174 if (SD && isInSchedulingRegion(*SD))
5175 return SD;
5176 return nullptr;
5177 }
5178
5179 ScheduleData *getScheduleData(Value *V) {
5180 return getScheduleData(dyn_cast<Instruction>(V));
5181 }
5182
5183 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5184 /// operand number) and value.
5185 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5186 const Value *V) const {
5187 if (ScheduleCopyableDataMap.empty())
5188 return nullptr;
5189 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5190 if (It == ScheduleCopyableDataMap.end())
5191 return nullptr;
5192 ScheduleCopyableData *SD = It->getSecond().get();
5193 if (!isInSchedulingRegion(*SD))
5194 return nullptr;
5195 return SD;
5196 }
5197
5198 /// Returns the ScheduleCopyableData for the given user \p User, operand
5199 /// number and operand \p V.
5201 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5202 const Value *V) {
5203 if (ScheduleCopyableDataMapByInstUser.empty())
5204 return {};
5205 const auto It = ScheduleCopyableDataMapByInstUser.find(
5206 std::make_pair(std::make_pair(User, OperandIdx), V));
5207 if (It == ScheduleCopyableDataMapByInstUser.end())
5208 return {};
5210 for (ScheduleCopyableData *SD : It->getSecond()) {
5211 if (isInSchedulingRegion(*SD))
5212 Res.push_back(SD);
5213 }
5214 return Res;
5215 }
5216
5217 /// Returns true if all operands of the given instruction \p User are
5218 /// replaced by copyable data.
5219 /// \param User The user instruction.
5220 /// \param Op The operand, which might be replaced by the copyable data.
5221 /// \param SLP The SLP tree.
5222 /// \param NumOps The number of operands used. If the instruction uses the
5223 /// same operand several times, check for the first use, then the second,
5224 /// etc.
5225 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5226 Instruction *Op, BoUpSLP &SLP,
5227 unsigned NumOps) const {
5228 assert(NumOps > 0 && "No operands");
5229 if (ScheduleCopyableDataMap.empty())
5230 return false;
5231 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5233 for (const Use &U : User->operands()) {
5234 if (U.get() != Op)
5235 continue;
5236 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5237 if (Entries.empty())
5238 return false;
5239 // Check all tree entries, if they have operands replaced by copyable
5240 // data.
5241 for (TreeEntry *TE : Entries) {
5242 // Check if the user is commutative.
5243 // The commutatives are handled later, as their operands can be
5244 // reordered.
5245 // Same applies even for non-commutative cmps, because we can invert
5246 // their predicate potentially and, thus, reorder the operands.
5247 bool IsCommutativeUser =
5248 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5249 EdgeInfo EI(TE, U.getOperandNo());
5250 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5251 unsigned &OpCnt =
5252 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5253 if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
5254 return false;
5255 // Found copyable operand - continue.
5256 ++OpCnt;
5257 continue;
5258 }
5259 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5260 .first->getSecond();
5261 }
5262 }
5263 // Check the commutative/cmp entries.
5264 if (!PotentiallyReorderedEntriesCount.empty()) {
5265 for (auto &P : PotentiallyReorderedEntriesCount) {
5266 auto *It = find(P.first->Scalars, User);
5267 assert(It != P.first->Scalars.end() &&
5268 "User is not in the tree entry");
5269 int Lane = std::distance(P.first->Scalars.begin(), It);
5270 assert(Lane >= 0 && "Lane is not found");
5271 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5272 Lane = P.first->ReorderIndices[Lane];
5273 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5274 "Couldn't find extract lane");
5275 SmallVector<unsigned> OpIndices;
5276 for (unsigned OpIdx :
5278 P.first->getMainOp()))) {
5279 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5280 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5281 --P.getSecond();
5282 }
5283 }
5284 return all_of(PotentiallyReorderedEntriesCount,
5285 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5286 return P.second == NumOps - 1;
5287 });
5288 }
5289 return true;
5290 }
5291
5293 getScheduleCopyableData(const Instruction *I) const {
5294 if (ScheduleCopyableDataMapByInst.empty())
5295 return {};
5296 const auto It = ScheduleCopyableDataMapByInst.find(I);
5297 if (It == ScheduleCopyableDataMapByInst.end())
5298 return {};
5300 for (ScheduleCopyableData *SD : It->getSecond()) {
5301 if (isInSchedulingRegion(*SD))
5302 Res.push_back(SD);
5303 }
5304 return Res;
5305 }
5306
5308 getScheduleCopyableDataUsers(const Instruction *User) const {
5309 if (ScheduleCopyableDataMapByUsers.empty())
5310 return {};
5311 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5312 if (It == ScheduleCopyableDataMapByUsers.end())
5313 return {};
5315 for (ScheduleCopyableData *SD : It->getSecond()) {
5316 if (isInSchedulingRegion(*SD))
5317 Res.push_back(SD);
5318 }
5319 return Res;
5320 }
5321
5322 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5323 Instruction *I,
5324 int SchedulingRegionID,
5325 ScheduleBundle &Bundle) {
5326 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5327 ScheduleCopyableData *CD =
5328 ScheduleCopyableDataMap
5329 .try_emplace(std::make_pair(EI, I),
5330 std::make_unique<ScheduleCopyableData>(
5331 SchedulingRegionID, I, EI, Bundle))
5332 .first->getSecond()
5333 .get();
5334 ScheduleCopyableDataMapByInst[I].push_back(CD);
5335 if (EI.UserTE) {
5336 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5337 const auto *It = find(Op, I);
5338 assert(It != Op.end() && "Lane not set");
5340 do {
5341 int Lane = std::distance(Op.begin(), It);
5342 assert(Lane >= 0 && "Lane not set");
5343 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5344 !EI.UserTE->ReorderIndices.empty())
5345 Lane = EI.UserTE->ReorderIndices[Lane];
5346 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5347 "Couldn't find extract lane");
5348 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5349 if (!Visited.insert(In).second) {
5350 It = find(make_range(std::next(It), Op.end()), I);
5351 continue;
5352 }
5353 ScheduleCopyableDataMapByInstUser
5354 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5355 .first->getSecond()
5356 .push_back(CD);
5357 ScheduleCopyableDataMapByUsers.try_emplace(I)
5358 .first->getSecond()
5359 .insert(CD);
5360 // Remove extra deps for users that become non-immediate users of the
5361 // instruction. This may happen if a chain of the same copyable elements
5362 // appears in the tree.
5363 if (In == I) {
5364 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5365 if (ScheduleCopyableData *UserCD =
5366 getScheduleCopyableData(UserEI, In))
5367 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5368 }
5369 It = find(make_range(std::next(It), Op.end()), I);
5370 } while (It != Op.end());
5371 } else {
5372 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5373 CD);
5374 }
5375 return *CD;
5376 }
5377
5378 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5379 auto *I = dyn_cast<Instruction>(V);
5380 if (!I)
5381 return {};
5382 auto It = ScheduledBundles.find(I);
5383 if (It == ScheduledBundles.end())
5384 return {};
5385 return It->getSecond();
5386 }
5387
5388 /// Returns true if the entity is in the scheduling region.
5389 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5390 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5391 return Data->getSchedulingRegionID() == SchedulingRegionID;
5392 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5393 return CD->getSchedulingRegionID() == SchedulingRegionID;
5394 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5395 [&](const ScheduleEntity *BundleMember) {
5396 return isInSchedulingRegion(*BundleMember);
5397 });
5398 }
5399
5400 /// Marks an instruction as scheduled and puts all dependent ready
5401 /// instructions into the ready-list.
5402 template <typename ReadyListType>
5403 void schedule(const BoUpSLP &R, const InstructionsState &S,
5404 const EdgeInfo &EI, ScheduleEntity *Data,
5405 ReadyListType &ReadyList) {
5406 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5408 // Handle the def-use chain dependencies.
5409
5410 // Decrement the unscheduled counter and insert to ready list if ready.
5411 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5412 if ((IsControl || Data->hasValidDependencies()) &&
5413 Data->incrementUnscheduledDeps(-1) == 0) {
5414 // There are no more unscheduled dependencies after
5415 // decrementing, so we can put the dependent instruction
5416 // into the ready list.
5417 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5419 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5420 CopyableBundle.push_back(&CD->getBundle());
5421 Bundles = CopyableBundle;
5422 } else {
5423 Bundles = getScheduleBundles(Data->getInst());
5424 }
5425 if (!Bundles.empty()) {
5426 for (ScheduleBundle *Bundle : Bundles) {
5427 if (Bundle->unscheduledDepsInBundle() == 0) {
5428 assert(!Bundle->isScheduled() &&
5429 "already scheduled bundle gets ready");
5430 ReadyList.insert(Bundle);
5432 << "SLP: gets ready: " << *Bundle << "\n");
5433 }
5434 }
5435 return;
5436 }
5437 assert(!Data->isScheduled() &&
5438 "already scheduled bundle gets ready");
5439 assert(!isa<ScheduleCopyableData>(Data) &&
5440 "Expected non-copyable data");
5441 ReadyList.insert(Data);
5442 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5443 }
5444 };
5445
5446 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5447 Instruction *I) {
5448 if (!ScheduleCopyableDataMap.empty()) {
5450 getScheduleCopyableData(User, OpIdx, I);
5451 for (ScheduleCopyableData *CD : CopyableData)
5452 DecrUnsched(CD, /*IsControl=*/false);
5453 if (!CopyableData.empty())
5454 return;
5455 }
5456 if (ScheduleData *OpSD = getScheduleData(I))
5457 DecrUnsched(OpSD, /*IsControl=*/false);
5458 };
5459
5460 // If BundleMember is a vector bundle, its operands may have been
5461 // reordered during buildTree(). We therefore need to get its operands
5462 // through the TreeEntry.
5463 if (!Bundles.empty()) {
5464 auto *In = BundleMember->getInst();
5465 // Count uses of each instruction operand.
5467 unsigned TotalOpCount = 0;
5468 if (isa<ScheduleCopyableData>(BundleMember)) {
5469 // Copyable data is used only once (uses itself).
5470 TotalOpCount = OperandsUses[In] = 1;
5471 } else {
5472 for (const Use &U : In->operands()) {
5473 if (auto *I = dyn_cast<Instruction>(U.get())) {
5474 auto Res = OperandsUses.try_emplace(I, 0);
5475 ++Res.first->getSecond();
5476 ++TotalOpCount;
5477 }
5478 }
5479 }
5480 // Decrement the unscheduled counter and insert to ready list if
5481 // ready.
5482 auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
5483 unsigned OpIdx) {
5484 if (!ScheduleCopyableDataMap.empty()) {
5485 const EdgeInfo EI = {UserTE, OpIdx};
5486 if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
5487 DecrUnsched(CD, /*IsControl=*/false);
5488 return;
5489 }
5490 }
5491 auto It = OperandsUses.find(I);
5492 assert(It != OperandsUses.end() && "Operand not found");
5493 if (It->second > 0) {
5494 --It->getSecond();
5495 assert(TotalOpCount > 0 && "No more operands to decrement");
5496 --TotalOpCount;
5497 if (ScheduleData *OpSD = getScheduleData(I))
5498 DecrUnsched(OpSD, /*IsControl=*/false);
5499 }
5500 };
5501
5502 for (ScheduleBundle *Bundle : Bundles) {
5503 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5504 break;
5505 // Need to search for the lane since the tree entry can be
5506 // reordered.
5507 int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
5508 find(Bundle->getTreeEntry()->Scalars, In));
5509 assert(Lane >= 0 && "Lane not set");
5510 if (isa<StoreInst>(In) &&
5511 !Bundle->getTreeEntry()->ReorderIndices.empty())
5512 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5513 assert(Lane < static_cast<int>(
5514 Bundle->getTreeEntry()->Scalars.size()) &&
5515 "Couldn't find extract lane");
5516
5517 // Since vectorization tree is being built recursively this
5518 // assertion ensures that the tree entry has all operands set before
5519 // reaching this code. Couple of exceptions known at the moment are
5520 // extracts where their second (immediate) operand is not added.
5521 // Since immediates do not affect scheduler behavior this is
5522 // considered okay.
5523 assert(In &&
5524 (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
5525 In->getNumOperands() ==
5526 Bundle->getTreeEntry()->getNumOperands() ||
5527 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5528 "Missed TreeEntry operands?");
5529
5530 for (unsigned OpIdx :
5531 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5532 if (auto *I = dyn_cast<Instruction>(
5533 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5534 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
5535 << "\n");
5536 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
5537 }
5538 }
5539 } else {
5540 // If BundleMember is a stand-alone instruction, no operand reordering
5541 // has taken place, so we directly access its operands.
5542 for (Use &U : BundleMember->getInst()->operands()) {
5543 if (auto *I = dyn_cast<Instruction>(U.get())) {
5545 << "SLP: check for readiness (def): " << *I << "\n");
5546 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5547 }
5548 }
5549 }
5550 // Handle the memory dependencies.
5551 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5552 if (!SD)
5553 return;
5555 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5556 if (!VisitedMemory.insert(MemoryDep).second)
5557 continue;
5558 // There are no more unscheduled dependencies after decrementing,
5559 // so we can put the dependent instruction into the ready list.
5560 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5561 << *MemoryDep << "\n");
5562 DecrUnsched(MemoryDep);
5563 }
5564 // Handle the control dependencies.
5566 for (ScheduleData *Dep : SD->getControlDependencies()) {
5567 if (!VisitedControl.insert(Dep).second)
5568 continue;
5569 // There are no more unscheduled dependencies after decrementing,
5570 // so we can put the dependent instruction into the ready list.
5572 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5573 DecrUnsched(Dep, /*IsControl=*/true);
5574 }
5575 };
5576 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5577 SD->setScheduled(/*Scheduled=*/true);
5578 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5579 ProcessBundleMember(SD, {});
5580 } else {
5581 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5582 Bundle.setScheduled(/*Scheduled=*/true);
5583 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5584 auto AreAllBundlesScheduled =
5585 [&](const ScheduleEntity *SD,
5586 ArrayRef<ScheduleBundle *> SDBundles) {
5587 if (isa<ScheduleCopyableData>(SD))
5588 return true;
5589 return !SDBundles.empty() &&
5590 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5591 return SDBundle->isScheduled();
5592 });
5593 };
5594 for (ScheduleEntity *SD : Bundle.getBundle()) {
5596 if (!isa<ScheduleCopyableData>(SD))
5597 SDBundles = getScheduleBundles(SD->getInst());
5598 if (AreAllBundlesScheduled(SD, SDBundles)) {
5599 SD->setScheduled(/*Scheduled=*/true);
5600 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5601 : SDBundles);
5602 }
5603 }
5604 }
5605 }
5606
5607 /// Verify basic self consistency properties of the data structure.
5608 void verify() {
5609 if (!ScheduleStart)
5610 return;
5611
5612 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5613 ScheduleStart->comesBefore(ScheduleEnd) &&
5614 "Not a valid scheduling region?");
5615
5616 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5617 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5618 if (!Bundles.empty()) {
5619 for (ScheduleBundle *Bundle : Bundles) {
5620 assert(isInSchedulingRegion(*Bundle) &&
5621 "primary schedule data not in window?");
5622 Bundle->verify();
5623 }
5624 continue;
5625 }
5626 auto *SD = getScheduleData(I);
5627 if (!SD)
5628 continue;
5629 assert(isInSchedulingRegion(*SD) &&
5630 "primary schedule data not in window?");
5631 SD->verify();
5632 }
5633
5634 assert(all_of(ReadyInsts,
5635 [](const ScheduleEntity *Bundle) {
5636 return Bundle->isReady();
5637 }) &&
5638 "item in ready list not ready?");
5639 }
5640
5641 /// Put all instructions into the ReadyList which are ready for scheduling.
5642 template <typename ReadyListType>
5643 void initialFillReadyList(ReadyListType &ReadyList) {
5645 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5646 ScheduleData *SD = getScheduleData(I);
5647 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5648 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5649 !Bundles.empty()) {
5650 for (ScheduleBundle *Bundle : Bundles) {
5651 if (!Visited.insert(Bundle).second)
5652 continue;
5653 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5654 ReadyList.insert(Bundle);
5655 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5656 << *Bundle << "\n");
5657 }
5658 }
5659 continue;
5660 }
5661 ReadyList.insert(SD);
5663 << "SLP: initially in ready list: " << *SD << "\n");
5664 }
5665 }
5666 }
5667
5668 /// Build a bundle from the ScheduleData nodes corresponding to the
5669 /// scalar instruction for each lane.
5670 /// \param VL The list of scalar instructions.
5671 /// \param S The state of the instructions.
5672 /// \param EI The edge in the SLP graph or the user node/operand number.
5673 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5674 const InstructionsState &S, const EdgeInfo &EI);
5675
5676 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5677 /// cyclic dependencies. This is only a dry-run, no instructions are
5678 /// actually moved at this stage.
5679 /// \returns the scheduling bundle. The returned Optional value is not
5680 /// std::nullopt if \p VL is allowed to be scheduled.
5681 std::optional<ScheduleBundle *>
5682 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5683 const InstructionsState &S, const EdgeInfo &EI);
5684
5685 /// Allocates schedule data chunk.
5686 ScheduleData *allocateScheduleDataChunks();
5687
5688 /// Extends the scheduling region so that V is inside the region.
5689 /// \returns true if the region size is within the limit.
5690 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5691
5692 /// Initialize the ScheduleData structures for new instructions in the
5693 /// scheduling region.
5694 void initScheduleData(Instruction *FromI, Instruction *ToI,
5695 ScheduleData *PrevLoadStore,
5696 ScheduleData *NextLoadStore);
5697
5698 /// Updates the dependency information of a bundle and of all instructions/
5699 /// bundles which depend on the original bundle.
5700 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5701 BoUpSLP *SLP,
5702 ArrayRef<ScheduleData *> ControlDeps = {});
5703
5704 /// Sets all instructions in the scheduling region to un-scheduled.
5705 void resetSchedule();
5706
5707 BasicBlock *BB;
5708
5709 /// Simple memory allocation for ScheduleData.
5711
5712 /// The size of a ScheduleData array in ScheduleDataChunks.
5713 int ChunkSize;
5714
5715 /// The allocator position in the current chunk, which is the last entry
5716 /// of ScheduleDataChunks.
5717 int ChunkPos;
5718
5719 /// Attaches ScheduleData to Instruction.
5720 /// Note that the mapping survives during all vectorization iterations, i.e.
5721 /// ScheduleData structures are recycled.
5723
5724 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5725 /// number) and the operand instruction, represented as copyable element.
5727 std::unique_ptr<ScheduleCopyableData>>
5728 ScheduleCopyableDataMap;
5729
5730 /// Represents the mapping between an instruction and all related
5731 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5732 /// element). The SLP tree may contain several representations of the same
5733 /// instruction.
5735 ScheduleCopyableDataMapByInst;
5736
5737 /// Represents the mapping between a user value and operand number, the operand
5738 /// value, and all related ScheduleCopyableData. The relation is 1:n, because
5739 /// the same user may reference the same operand in different tree entries
5740 /// and the operand may be modeled by different copyable data elements.
5743 ScheduleCopyableDataMapByInstUser;
5744
5745 /// Represents the mapping between an instruction and all related
5746 /// ScheduleCopyableData. It represents the mapping between the actual
5747 /// instruction and the last copyable data element in the chain. E.g., if
5748 /// the graph models the following instructions:
5749 /// %0 = non-add instruction ...
5750 /// ...
5751 /// %4 = add %3, 1
5752 /// %5 = add %4, 1
5753 /// %6 = insertelement poison, %0, 0
5754 /// %7 = insertelement %6, %5, 1
5755 /// And the graph is modeled as:
5756 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5757 /// -> [1, 0] -> [%1, 0]
5758 ///
5759 /// this map will map %0 only to the copyable element <1>, which is the last
5760 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5761 /// keep the map to <0>, not the %0.
5762 SmallDenseMap<const Instruction *,
5764 ScheduleCopyableDataMapByUsers;
5765
5766 /// Attaches ScheduleBundle to Instruction.
5768 ScheduledBundles;
5769 /// The list of ScheduleBundles.
5770 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5771
5772 /// The ready-list for scheduling (only used for the dry-run).
5773 SetVector<ScheduleEntity *> ReadyInsts;
5774
5775 /// The first instruction of the scheduling region.
5776 Instruction *ScheduleStart = nullptr;
5777
5778 /// The first instruction _after_ the scheduling region.
5779 Instruction *ScheduleEnd = nullptr;
5780
5781 /// The first memory accessing instruction in the scheduling region
5782 /// (can be null).
5783 ScheduleData *FirstLoadStoreInRegion = nullptr;
5784
5785 /// The last memory accessing instruction in the scheduling region
5786 /// (can be null).
5787 ScheduleData *LastLoadStoreInRegion = nullptr;
5788
5789 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5790 /// region? Used to optimize the dependence calculation for the
5791 /// common case where there isn't.
5792 bool RegionHasStackSave = false;
5793
5794 /// The current size of the scheduling region.
5795 int ScheduleRegionSize = 0;
5796
5797 /// The maximum size allowed for the scheduling region.
5798 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5799
5800 /// The ID of the scheduling region. For a new vectorization iteration this
5801 /// is incremented which "removes" all ScheduleData from the region.
5802 /// Make sure that the initial SchedulingRegionID is greater than the
5803 /// initial SchedulingRegionID in ScheduleData (which is 0).
5804 int SchedulingRegionID = 1;
5805 };
5806
5807 /// Attaches the BlockScheduling structures to basic blocks.
5809
5810 /// Performs the "real" scheduling. Done before vectorization is actually
5811 /// performed in a basic block.
5812 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5813
5814 /// List of users to ignore during scheduling and that don't need extracting.
5815 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5816
5817 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5818 /// sorted SmallVectors of unsigned.
5819 struct OrdersTypeDenseMapInfo {
5820 static OrdersType getEmptyKey() {
5821 OrdersType V;
5822 V.push_back(~1U);
5823 return V;
5824 }
5825
5826 static OrdersType getTombstoneKey() {
5827 OrdersType V;
5828 V.push_back(~2U);
5829 return V;
5830 }
5831
5832 static unsigned getHashValue(const OrdersType &V) {
5833 return static_cast<unsigned>(hash_combine_range(V));
5834 }
5835
5836 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5837 return LHS == RHS;
5838 }
5839 };
5840
5841 // Analysis and block reference.
5842 Function *F;
5843 ScalarEvolution *SE;
5845 TargetLibraryInfo *TLI;
5846 LoopInfo *LI;
5847 DominatorTree *DT;
5848 AssumptionCache *AC;
5849 DemandedBits *DB;
5850 const DataLayout *DL;
5852
5853 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5854 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5855
5856 /// Instruction builder to construct the vectorized tree.
5858
5859 /// A map of scalar integer values to the smallest bit width with which they
5860 /// can legally be represented. The values map to (width, signed) pairs,
5861 /// where "width" indicates the minimum bit width and "signed" is True if the
5862 /// value must be signed-extended, rather than zero-extended, back to its
5863 /// original width.
5865
5866 /// Final size of the reduced vector, if the current graph represents the
5867 /// input for the reduction and it was possible to narrow the size of the
5868 /// reduction.
5869 unsigned ReductionBitWidth = 0;
5870
5871 /// Canonical graph size before the transformations.
5872 unsigned BaseGraphSize = 1;
5873
5874 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5875 /// type sizes, used in the tree.
5876 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5877
5878 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
5879 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
5880 DenseSet<unsigned> ExtraBitWidthNodes;
5881};
5882
5883} // end namespace slpvectorizer
5884
5885template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
5889 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
5890 SecondInfo::getEmptyKey());
5891 }
5892
5894 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
5895 SecondInfo::getTombstoneKey());
5896 }
5897
5898 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
5899 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
5900 SecondInfo::getHashValue(Val.EdgeIdx));
5901 }
5902
5903 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
5904 const BoUpSLP::EdgeInfo &RHS) {
5905 return LHS == RHS;
5906 }
5907};
5908
5909template <> struct GraphTraits<BoUpSLP *> {
5910 using TreeEntry = BoUpSLP::TreeEntry;
5911
5912 /// NodeRef has to be a pointer per the GraphWriter.
5914
5916
5917 /// Add the VectorizableTree to the index iterator to be able to return
5918 /// TreeEntry pointers.
5919 struct ChildIteratorType
5920 : public iterator_adaptor_base<
5921 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5923
5925 ContainerTy &VT)
5926 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
5927
5928 NodeRef operator*() { return I->UserTE; }
5929 };
5930
5932 return R.VectorizableTree[0].get();
5933 }
5934
5935 static ChildIteratorType child_begin(NodeRef N) {
5936 return {&N->UserTreeIndex, N->Container};
5937 }
5938
5939 static ChildIteratorType child_end(NodeRef N) {
5940 return {&N->UserTreeIndex + 1, N->Container};
5941 }
5942
5943 /// For the node iterator we just need to turn the TreeEntry iterator into a
5944 /// TreeEntry* iterator so that it dereferences to NodeRef.
5945 class nodes_iterator {
5947 ItTy It;
5948
5949 public:
5950 nodes_iterator(const ItTy &It2) : It(It2) {}
5951 NodeRef operator*() { return It->get(); }
5952 nodes_iterator operator++() {
5953 ++It;
5954 return *this;
5955 }
5956 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
5957 };
5958
5959 static nodes_iterator nodes_begin(BoUpSLP *R) {
5960 return nodes_iterator(R->VectorizableTree.begin());
5961 }
5962
5963 static nodes_iterator nodes_end(BoUpSLP *R) {
5964 return nodes_iterator(R->VectorizableTree.end());
5965 }
5966
5967 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
5968};
5969
5970template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
5971 using TreeEntry = BoUpSLP::TreeEntry;
5972
5973 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
5974
5975 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
5976 std::string Str;
5978 OS << Entry->Idx << ".\n";
5979 if (isSplat(Entry->Scalars))
5980 OS << "<splat> ";
5981 for (auto *V : Entry->Scalars) {
5982 OS << *V;
5983 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
5984 return EU.Scalar == V;
5985 }))
5986 OS << " <extract>";
5987 OS << "\n";
5988 }
5989 return Str;
5990 }
5991
5992 static std::string getNodeAttributes(const TreeEntry *Entry,
5993 const BoUpSLP *) {
5994 if (Entry->isGather())
5995 return "color=red";
5996 if (Entry->State == TreeEntry::ScatterVectorize ||
5997 Entry->State == TreeEntry::StridedVectorize ||
5998 Entry->State == TreeEntry::CompressVectorize)
5999 return "color=blue";
6000 return "";
6001 }
6002};
6003
6004} // end namespace llvm
6005
6008 for (auto *I : DeletedInstructions) {
6009 if (!I->getParent()) {
6010 // Temporarily insert the instruction back to erase it from its parent and
6011 // from memory later.
6012 if (isa<PHINode>(I))
6013 // Phi nodes must be the very first instructions in the block.
6014 I->insertBefore(F->getEntryBlock(),
6015 F->getEntryBlock().getFirstNonPHIIt());
6016 else
6017 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6018 continue;
6019 }
6020 for (Use &U : I->operands()) {
6021 auto *Op = dyn_cast<Instruction>(U.get());
6022 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6023 wouldInstructionBeTriviallyDead(Op, TLI))
6024 DeadInsts.emplace_back(Op);
6025 }
6026 I->dropAllReferences();
6027 }
6028 for (auto *I : DeletedInstructions) {
6029 assert(I->use_empty() &&
6030 "trying to erase instruction with users.");
6031 I->eraseFromParent();
6032 }
6033
6034 // Cleanup any dead scalar code feeding the vectorized instructions
6035 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6036
6037#ifdef EXPENSIVE_CHECKS
6038 // If we could guarantee that this call is not extremely slow, we could
6039 // remove the ifdef limitation (see PR47712).
6040 assert(!verifyFunction(*F, &dbgs()));
6041#endif
6042}
6043
6044/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6045/// contains the original mask for the scalars reused in the node. The
6046/// procedure transforms this mask in accordance with the given \p Mask.
6047static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6048 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6049 "Expected non-empty mask.");
6050 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6051 Prev.swap(Reuses);
6052 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6053 if (Mask[I] != PoisonMaskElem)
6054 Reuses[Mask[I]] = Prev[I];
6055}
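// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// A standalone analog of reorderReuses above, using plain std::vector<int> and
// -1 in place of PoisonMaskElem, to show how the reuse mask is permuted: the
// old element at position I lands at position Mask[I].
#include <vector>

static void reorderReusesSketch(std::vector<int> &Reuses,
                                const std::vector<int> &Mask) {
  const int Poison = -1; // stands in for PoisonMaskElem
  std::vector<int> Prev = Reuses;
  for (size_t I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != Poison)
      Reuses[Mask[I]] = Prev[I];
}
// Example: Reuses = {3, 2, 1, 0}, Mask = {1, 0, 3, 2} -> Reuses = {2, 3, 0, 1}.
// --- end of sketch ---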
6056
6057/// Reorders the given \p Order according to the given \p Mask. \p Order - is
6058/// the original order of the scalars. Procedure transforms the provided order
6059/// in accordance with the given \p Mask. If the resulting \p Order is just an
6060/// identity order, \p Order is cleared.
6061static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6062 bool BottomOrder = false) {
6063 assert(!Mask.empty() && "Expected non-empty mask.");
6064 unsigned Sz = Mask.size();
6065 if (BottomOrder) {
6066 SmallVector<unsigned> PrevOrder;
6067 if (Order.empty()) {
6068 PrevOrder.resize(Sz);
6069 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6070 } else {
6071 PrevOrder.swap(Order);
6072 }
6073 Order.assign(Sz, Sz);
6074 for (unsigned I = 0; I < Sz; ++I)
6075 if (Mask[I] != PoisonMaskElem)
6076 Order[I] = PrevOrder[Mask[I]];
6077 if (all_of(enumerate(Order), [&](const auto &Data) {
6078 return Data.value() == Sz || Data.index() == Data.value();
6079 })) {
6080 Order.clear();
6081 return;
6082 }
6083 fixupOrderingIndices(Order);
6084 return;
6085 }
6086 SmallVector<int> MaskOrder;
6087 if (Order.empty()) {
6088 MaskOrder.resize(Sz);
6089 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6090 } else {
6091 inversePermutation(Order, MaskOrder);
6092 }
6093 reorderReuses(MaskOrder, Mask);
6094 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6095 Order.clear();
6096 return;
6097 }
6098 Order.assign(Sz, Sz);
6099 for (unsigned I = 0; I < Sz; ++I)
6100 if (MaskOrder[I] != PoisonMaskElem)
6101 Order[MaskOrder[I]] = I;
6102 fixupOrderingIndices(Order);
6103}
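// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The BottomOrder path of reorderOrder above, on plain std::vector types with
// -1 for PoisonMaskElem. Unset positions keep the sentinel value Sz, and an
// identity result clears the order; fixupOrderingIndices is omitted here.
#include <numeric>
#include <vector>

static void reorderOrderBottomUpSketch(std::vector<unsigned> &Order,
                                       const std::vector<int> &Mask) {
  const unsigned Sz = Mask.size();
  std::vector<unsigned> PrevOrder;
  if (Order.empty()) {
    PrevOrder.resize(Sz);
    std::iota(PrevOrder.begin(), PrevOrder.end(), 0); // identity previous order
  } else {
    PrevOrder.swap(Order);
  }
  Order.assign(Sz, Sz); // Sz marks an unset slot
  for (unsigned I = 0; I < Sz; ++I)
    if (Mask[I] != -1)
      Order[I] = PrevOrder[Mask[I]];
  bool IsIdentity = true;
  for (unsigned I = 0; I < Sz; ++I)
    IsIdentity = IsIdentity && (Order[I] == Sz || Order[I] == I);
  if (IsIdentity)
    Order.clear(); // an identity order is represented by an empty vector
}
// --- end of sketch ---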
6104
6105std::optional<BoUpSLP::OrdersType>
6106BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6107 bool TopToBottom, bool IgnoreReorder) {
6108 assert(TE.isGather() && "Expected gather node only.");
6109 // Try to find subvector extract/insert patterns and reorder only such
6110 // patterns.
6111 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6112 Type *ScalarTy = GatheredScalars.front()->getType();
6113 size_t NumScalars = GatheredScalars.size();
6114 if (!isValidElementType(ScalarTy))
6115 return std::nullopt;
6116 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6117 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6118 SmallVector<int> ExtractMask;
6119 SmallVector<int> Mask;
6120 SmallVector<SmallVector<const TreeEntry *>> Entries;
6121 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
6122 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6123 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
6124 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6125 /*ForOrder=*/true);
6126 // No shuffled operands - ignore.
6127 if (GatherShuffles.empty() && ExtractShuffles.empty())
6128 return std::nullopt;
6129 OrdersType CurrentOrder(NumScalars, NumScalars);
6130 if (GatherShuffles.size() == 1 &&
6131 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6132 Entries.front().front()->isSame(TE.Scalars)) {
6133 // If the node is fully matched while rotating the whole tree - no need to
6134 // consider the matching order, just rotate the whole tree.
6135 if (TopToBottom)
6136 return std::nullopt;
6137 // No need to keep the order for the same user node.
6138 if (Entries.front().front()->UserTreeIndex.UserTE ==
6139 TE.UserTreeIndex.UserTE)
6140 return std::nullopt;
6141 // No need to keep the order for the matched root node, if it can be freely
6142 // reordered.
6143 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6144 return std::nullopt;
6145 // If shuffling 2 elements only and the matching node has reverse reuses -
6146 // no need to count order, both work fine.
6147 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6148 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6149 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6150 [](const auto &P) {
6151 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6152 }))
6153 return std::nullopt;
6154
6155 // Perfect match in the graph, will reuse the previously vectorized
6156 // node. Cost is 0.
6157 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6158 return CurrentOrder;
6159 }
6160 auto IsSplatMask = [](ArrayRef<int> Mask) {
6161 int SingleElt = PoisonMaskElem;
6162 return all_of(Mask, [&](int I) {
6163 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6164 SingleElt = I;
6165 return I == PoisonMaskElem || I == SingleElt;
6166 });
6167 };
6168 // Exclusive broadcast mask - ignore.
6169 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6170 (Entries.size() != 1 ||
6171 Entries.front().front()->ReorderIndices.empty())) ||
6172 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6173 return std::nullopt;
6174 SmallBitVector ShuffledSubMasks(NumParts);
6175 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6176 ArrayRef<int> Mask, int PartSz, int NumParts,
6177 function_ref<unsigned(unsigned)> GetVF) {
6178 for (int I : seq<int>(0, NumParts)) {
6179 if (ShuffledSubMasks.test(I))
6180 continue;
6181 const int VF = GetVF(I);
6182 if (VF == 0)
6183 continue;
6184 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6185 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6186 // Shuffle of at least 2 vectors - ignore.
6187 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6188 llvm::fill(Slice, NumScalars);
6189 ShuffledSubMasks.set(I);
6190 continue;
6191 }
6192 // Try to include as many elements from the mask as possible.
6193 int FirstMin = INT_MAX;
6194 int SecondVecFound = false;
6195 for (int K : seq<int>(Limit)) {
6196 int Idx = Mask[I * PartSz + K];
6197 if (Idx == PoisonMaskElem) {
6198 Value *V = GatheredScalars[I * PartSz + K];
6199 if (isConstant(V) && !isa<PoisonValue>(V)) {
6200 SecondVecFound = true;
6201 break;
6202 }
6203 continue;
6204 }
6205 if (Idx < VF) {
6206 if (FirstMin > Idx)
6207 FirstMin = Idx;
6208 } else {
6209 SecondVecFound = true;
6210 break;
6211 }
6212 }
6213 FirstMin = (FirstMin / PartSz) * PartSz;
6214 // Shuffle of at least 2 vectors - ignore.
6215 if (SecondVecFound) {
6216 llvm::fill(Slice, NumScalars);
6217 ShuffledSubMasks.set(I);
6218 continue;
6219 }
6220 for (int K : seq<int>(Limit)) {
6221 int Idx = Mask[I * PartSz + K];
6222 if (Idx == PoisonMaskElem)
6223 continue;
6224 Idx -= FirstMin;
6225 if (Idx >= PartSz) {
6226 SecondVecFound = true;
6227 break;
6228 }
6229 if (CurrentOrder[I * PartSz + Idx] >
6230 static_cast<unsigned>(I * PartSz + K) &&
6231 CurrentOrder[I * PartSz + Idx] !=
6232 static_cast<unsigned>(I * PartSz + Idx))
6233 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6234 }
6235 // Shuffle of at least 2 vectors - ignore.
6236 if (SecondVecFound) {
6237 llvm::fill(Slice, NumScalars);
6238 ShuffledSubMasks.set(I);
6239 continue;
6240 }
6241 }
6242 };
6243 int PartSz = getPartNumElems(NumScalars, NumParts);
6244 if (!ExtractShuffles.empty())
6245 TransformMaskToOrder(
6246 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6247 if (!ExtractShuffles[I])
6248 return 0U;
6249 unsigned VF = 0;
6250 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6251 for (unsigned Idx : seq<unsigned>(Sz)) {
6252 int K = I * PartSz + Idx;
6253 if (ExtractMask[K] == PoisonMaskElem)
6254 continue;
6255 if (!TE.ReuseShuffleIndices.empty())
6256 K = TE.ReuseShuffleIndices[K];
6257 if (K == PoisonMaskElem)
6258 continue;
6259 if (!TE.ReorderIndices.empty())
6260 K = std::distance(TE.ReorderIndices.begin(),
6261 find(TE.ReorderIndices, K));
6262 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6263 if (!EI)
6264 continue;
6265 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6266 ->getElementCount()
6267 .getKnownMinValue());
6268 }
6269 return VF;
6270 });
6271 // Check special corner case - single shuffle of the same entry.
6272 if (GatherShuffles.size() == 1 && NumParts != 1) {
6273 if (ShuffledSubMasks.any())
6274 return std::nullopt;
6275 PartSz = NumScalars;
6276 NumParts = 1;
6277 }
6278 if (!Entries.empty())
6279 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6280 if (!GatherShuffles[I])
6281 return 0U;
6282 return std::max(Entries[I].front()->getVectorFactor(),
6283 Entries[I].back()->getVectorFactor());
6284 });
6285 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6286 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6287 return std::nullopt;
6288 return std::move(CurrentOrder);
6289}
6290
6291static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6292 const TargetLibraryInfo &TLI,
6293 bool CompareOpcodes = true) {
6294 if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
6295 getUnderlyingObject(Ptr2, RecursionMaxDepth))
6296 return false;
6297 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6298 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6299 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6300 (!GEP2 || GEP2->getNumOperands() == 2) &&
6301 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6302 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6303 !CompareOpcodes ||
6304 (GEP1 && GEP2 &&
6305 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6306}
6307
6308/// Calculates minimal alignment as a common alignment.
6309template <typename T>
6310static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6311 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6312 for (Value *V : VL)
6313 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6314 return CommonAlignment;
6315}
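// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The same "common alignment is the minimum alignment" fold as
// computeCommonAlignment above, written over a plain list of byte alignments
// instead of load/store instructions.
#include <algorithm>
#include <cstdint>
#include <vector>

static uint64_t commonAlignmentSketch(const std::vector<uint64_t> &Aligns) {
  uint64_t Common = Aligns.front();
  for (uint64_t A : Aligns)
    Common = std::min(Common, A); // the narrowest alignment is valid for all
  return Common;
}
// Example: commonAlignmentSketch({16, 8, 4, 8}) == 4.
// --- end of sketch ---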
6316
6317/// Check if \p Order represents reverse order.
6318static bool isReverseOrder(ArrayRef<unsigned> Order) {
6319 assert(!Order.empty() &&
6320 "Order is empty. Please check it before using isReverseOrder.");
6321 unsigned Sz = Order.size();
6322 return all_of(enumerate(Order), [&](const auto &Pair) {
6323 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6324 });
6325}
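// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// A standalone check mirroring isReverseOrder above. Sz is the "unset"
// sentinel, so an unset slot is accepted as well, exactly like the
// enumerate-based version.
#include <vector>

static bool isReverseOrderSketch(const std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I)
    if (Order[I] != Sz && Order[I] != Sz - I - 1)
      return false;
  return true;
}
// Example: {3, 2, 1, 0} -> true; {3, 4, 1, 0} -> true (4 is the unset
// sentinel); {0, 1, 2, 3} -> false.
// --- end of sketch ---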
6326
6327/// Checks if the provided list of pointers \p PointerOps represents strided
6328/// pointers for the type \p ElemTy. If they are not, std::nullopt is returned.
6329/// Otherwise, if \p Inst is not specified, a non-empty optional (nullptr) is
6330/// returned to show that the pointers represent strided pointers. If \p Inst
6331/// is specified, the runtime stride is materialized before the given \p Inst.
6332/// \returns std::nullopt if the pointers are not strided with a runtime
6333/// stride; nullptr or the actual stride value otherwise.
6334static std::optional<Value *>
6335calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6336 const DataLayout &DL, ScalarEvolution &SE,
6337 SmallVectorImpl<unsigned> &SortedIndices,
6338 Instruction *Inst = nullptr) {
6339 SmallVector<const SCEV *> SCEVs;
6340 const SCEV *PtrSCEVLowest = nullptr;
6341 const SCEV *PtrSCEVHighest = nullptr;
6342 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6343 // addresses).
6344 for (Value *Ptr : PointerOps) {
6345 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6346 if (!PtrSCEV)
6347 return std::nullopt;
6348 SCEVs.push_back(PtrSCEV);
6349 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6350 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6351 continue;
6352 }
6353 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6354 if (isa<SCEVCouldNotCompute>(Diff))
6355 return std::nullopt;
6356 if (Diff->isNonConstantNegative()) {
6357 PtrSCEVLowest = PtrSCEV;
6358 continue;
6359 }
6360 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6361 if (isa<SCEVCouldNotCompute>(Diff1))
6362 return std::nullopt;
6363 if (Diff1->isNonConstantNegative()) {
6364 PtrSCEVHighest = PtrSCEV;
6365 continue;
6366 }
6367 }
6368 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6369 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6370 if (isa<SCEVCouldNotCompute>(Dist))
6371 return std::nullopt;
6372 int Size = DL.getTypeStoreSize(ElemTy);
6373 auto TryGetStride = [&](const SCEV *Dist,
6374 const SCEV *Multiplier) -> const SCEV * {
6375 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6376 if (M->getOperand(0) == Multiplier)
6377 return M->getOperand(1);
6378 if (M->getOperand(1) == Multiplier)
6379 return M->getOperand(0);
6380 return nullptr;
6381 }
6382 if (Multiplier == Dist)
6383 return SE.getConstant(Dist->getType(), 1);
6384 return SE.getUDivExactExpr(Dist, Multiplier);
6385 };
6386 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6387 const SCEV *Stride = nullptr;
6388 if (Size != 1 || SCEVs.size() > 2) {
6389 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6390 Stride = TryGetStride(Dist, Sz);
6391 if (!Stride)
6392 return std::nullopt;
6393 }
6394 if (!Stride || isa<SCEVConstant>(Stride))
6395 return std::nullopt;
6396 // Iterate through all pointers and check if all distances are
6397 // unique multiple of Stride.
6398 using DistOrdPair = std::pair<int64_t, int>;
6399 auto Compare = llvm::less_first();
6400 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6401 int Cnt = 0;
6402 bool IsConsecutive = true;
6403 for (const SCEV *PtrSCEV : SCEVs) {
6404 unsigned Dist = 0;
6405 if (PtrSCEV != PtrSCEVLowest) {
6406 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6407 const SCEV *Coeff = TryGetStride(Diff, Stride);
6408 if (!Coeff)
6409 return std::nullopt;
6410 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6411 if (!SC || isa<SCEVCouldNotCompute>(SC))
6412 return std::nullopt;
6413 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6414 SE.getMulExpr(Stride, SC)))
6415 ->isZero())
6416 return std::nullopt;
6417 Dist = SC->getAPInt().getZExtValue();
6418 }
6419 // If the strides are not the same or repeated, we can't vectorize.
6420 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6421 return std::nullopt;
6422 auto Res = Offsets.emplace(Dist, Cnt);
6423 if (!Res.second)
6424 return std::nullopt;
6425 // Consecutive order if the inserted element is the last one.
6426 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6427 ++Cnt;
6428 }
6429 if (Offsets.size() != SCEVs.size())
6430 return std::nullopt;
6431 SortedIndices.clear();
6432 if (!IsConsecutive) {
6433 // Fill SortedIndices array only if it is non-consecutive.
6434 SortedIndices.resize(PointerOps.size());
6435 Cnt = 0;
6436 for (const std::pair<int64_t, int> &Pair : Offsets) {
6437 SortedIndices[Cnt] = Pair.second;
6438 ++Cnt;
6439 }
6440 }
6441 if (!Inst)
6442 return nullptr;
6443 SCEVExpander Expander(SE, DL, "strided-load-vec");
6444 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
6445}
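// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The ordering/uniqueness bookkeeping from calculateRtStride above, reduced to
// plain integer offsets. The real routine works on SCEVs and only succeeds for
// a runtime (non-constant) stride; here a non-zero stride is passed in
// directly just to show how the sorted indices and the consecutiveness flag
// are derived.
#include <cstdint>
#include <optional>
#include <set>
#include <utility>
#include <vector>

static std::optional<std::vector<int>>
sortByStrideSketch(const std::vector<int64_t> &OffsetsFromLowest,
                   int64_t Stride) {
  if (Stride == 0)
    return std::nullopt;
  std::set<std::pair<int64_t, int>> Offsets; // (stride multiple, orig index)
  bool IsConsecutive = true;
  for (int Cnt = 0, E = OffsetsFromLowest.size(); Cnt < E; ++Cnt) {
    int64_t Off = OffsetsFromLowest[Cnt];
    if (Off % Stride != 0)
      return std::nullopt; // not a multiple of the stride
    auto Res = Offsets.emplace(Off / Stride, Cnt);
    if (!Res.second)
      return std::nullopt; // repeated offset
    // Still consecutive only if every element was inserted at the end.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
  }
  std::vector<int> SortedIndices;
  if (!IsConsecutive) // only filled when a reorder is actually needed
    for (const auto &P : Offsets)
      SortedIndices.push_back(P.second);
  return SortedIndices;
}
// --- end of sketch ---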
6446
6447static std::pair<InstructionCost, InstructionCost>
6448getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6449 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6450 Type *ScalarTy, VectorType *VecTy);
6451
6452/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6453/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
6454/// subvector pattern.
6455static InstructionCost
6456getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6457 VectorType *Tp, ArrayRef<int> Mask = {},
6458 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6459 int Index = 0, VectorType *SubTp = nullptr,
6460 ArrayRef<const Value *> Args = {}) {
6461 VectorType *DstTy = Tp;
6462 if (!Mask.empty())
6463 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6464
6465 if (Kind != TTI::SK_PermuteTwoSrc)
6466 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6467 Args);
6468 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6469 int NumSubElts;
6470 if (ShuffleVectorInst::isInsertSubvectorMask(
6471 Mask, NumSrcElts, NumSubElts, Index)) {
6472 if (Index + NumSubElts > NumSrcElts &&
6473 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6474 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6475 CostKind, Index, SubTp);
6476 }
6477 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6478 Args);
6479}
6480
6481/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6482/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6483/// instead of a scalar.
6484static InstructionCost
6485getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6486 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6487 bool Extract, TTI::TargetCostKind CostKind,
6488 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6489 assert(!isa<ScalableVectorType>(Ty) &&
6490 "ScalableVectorType is not supported.");
6491 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6492 getNumElements(Ty) &&
6493 "Incorrect usage.");
6494 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6495 assert(SLPReVec && "Only supported by REVEC.");
6496 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6497 // of CreateInsertElement.
6498 unsigned ScalarTyNumElements = VecTy->getNumElements();
6499 InstructionCost Cost = 0;
6500 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6501 if (!DemandedElts[I])
6502 continue;
6503 if (Insert)
6504 Cost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
6505 I * ScalarTyNumElements, VecTy);
6506 if (Extract)
6507 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
6508 I * ScalarTyNumElements, VecTy);
6509 }
6510 return Cost;
6511 }
6512 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6513 CostKind, ForPoisonSrc, VL);
6514}
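// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The REVEC branch of getScalarizationOverhead above, with the TTI shuffle
// cost queries replaced by a caller-supplied callback (SubVecCost is a
// hypothetical stand-in, not an LLVM API). Each demanded "scalar" that is
// really a subvector contributes an insert and/or an extract of a whole
// subvector at offset I * SubVecNumElts.
#include <cstdint>
#include <functional>
#include <vector>

static int64_t scalarizationOverheadSketch(
    const std::vector<bool> &DemandedElts, unsigned SubVecNumElts, bool Insert,
    bool Extract, const std::function<int64_t(unsigned)> &SubVecCost) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = DemandedElts.size(); I < E; ++I) {
    if (!DemandedElts[I])
      continue;
    if (Insert)
      Cost += SubVecCost(I * SubVecNumElts); // insert-subvector at this offset
    if (Extract)
      Cost += SubVecCost(I * SubVecNumElts); // extract-subvector at this offset
  }
  return Cost;
}
// --- end of sketch ---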
6515
6516/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6517/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6518static InstructionCost getVectorInstrCost(
6519 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6520 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6521 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6522 if (Opcode == Instruction::ExtractElement) {
6523 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6524 assert(SLPReVec && "Only supported by REVEC.");
6525 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6526 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
6527 cast<VectorType>(Val), {}, CostKind,
6528 Index * VecTy->getNumElements(), VecTy);
6529 }
6530 }
6531 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6532 ScalarUserAndIdx);
6533}
6534
6535/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6536/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6537static InstructionCost getExtractWithExtendCost(
6538 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6539 VectorType *VecTy, unsigned Index,
6540 TTI::TargetCostKind CostKind) {
6541 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6542 assert(SLPReVec && "Only supported by REVEC.");
6543 auto *SubTp =
6544 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6545 return ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
6546 Index * ScalarTy->getNumElements(), SubTp) +
6547 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6548 CostKind);
6549 }
6550 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6551}
6552
6553/// Creates subvector insert. Generates shuffle using \p Generator or
6554/// using default shuffle.
6555static Value *createInsertVector(
6556 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6557 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6558 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6559 return Vec;
6560 const unsigned SubVecVF = getNumElements(V->getType());
6561 // Create shuffle, insertvector requires that index is multiple of
6562 // the subvector length.
6563 const unsigned VecVF = getNumElements(Vec->getType());
6564 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6565 if (isa<PoisonValue>(Vec)) {
6566 auto *Begin = std::next(Mask.begin(), Index);
6567 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6568 Vec = Builder.CreateShuffleVector(V, Mask);
6569 return Vec;
6570 }
6571 std::iota(Mask.begin(), Mask.end(), 0);
6572 std::iota(std::next(Mask.begin(), Index),
6573 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6574 if (Generator)
6575 return Generator(Vec, V, Mask);
6576 // 1. Resize V to the size of Vec.
6577 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6578 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6579 V = Builder.CreateShuffleVector(V, ResizeMask);
6580 // 2. Insert V into Vec.
6581 return Builder.CreateShuffleVector(Vec, V, Mask);
6582}
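// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The two shuffle masks computed by createInsertVector above for the generic
// (non-poison, no custom generator) path, shown as plain index vectors with
// -1 in place of PoisonMaskElem.
#include <numeric>
#include <vector>

struct InsertSubvectorMasks {
  std::vector<int> ResizeMask; // widens V to the width of Vec
  std::vector<int> InsertMask; // takes Vec, except lanes [Index, Index+SubVecVF)
};

static InsertSubvectorMasks makeInsertMasksSketch(unsigned VecVF,
                                                  unsigned SubVecVF,
                                                  unsigned Index) {
  InsertSubvectorMasks M;
  M.ResizeMask.assign(VecVF, -1);
  std::iota(M.ResizeMask.begin(), M.ResizeMask.begin() + SubVecVF, 0);
  M.InsertMask.resize(VecVF);
  std::iota(M.InsertMask.begin(), M.InsertMask.end(), 0); // keep Vec lanes
  // Lanes of the resized V appear as second-source indices VecVF, VecVF+1, ...
  std::iota(M.InsertMask.begin() + Index,
            M.InsertMask.begin() + Index + SubVecVF, VecVF);
  return M;
}
// Example: makeInsertMasksSketch(8, 2, 4) gives
//   ResizeMask = {0, 1, -1, -1, -1, -1, -1, -1}
//   InsertMask = {0, 1, 2, 3, 8, 9, 6, 7}
// --- end of sketch ---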
6583
6584/// Generates subvector extract using \p Generator or using default shuffle.
6585static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6586 unsigned SubVecVF, unsigned Index) {
6587 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6588 std::iota(Mask.begin(), Mask.end(), Index);
6589 return Builder.CreateShuffleVector(Vec, Mask);
6590}
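// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The extract mask built by createExtractVector above is simply SubVecVF
// consecutive lane indices starting at Index.
#include <numeric>
#include <vector>

static std::vector<int> makeExtractMaskSketch(unsigned SubVecVF,
                                              unsigned Index) {
  std::vector<int> Mask(SubVecVF);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Mask; // e.g. makeExtractMaskSketch(4, 8) == {8, 9, 10, 11}
}
// --- end of sketch ---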
6591
6592/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6593/// with \p Order.
6594/// \return true if the mask represents strided access, false otherwise.
6595static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6596 ArrayRef<unsigned> Order, Type *ScalarTy,
6597 const DataLayout &DL, ScalarEvolution &SE,
6598 SmallVectorImpl<int> &CompressMask) {
6599 const unsigned Sz = PointerOps.size();
6600 CompressMask.assign(Sz, PoisonMaskElem);
6601 // The first element is always set.
6602 CompressMask[0] = 0;
6603 // Check if the mask represents strided access.
6604 std::optional<unsigned> Stride = 0;
6605 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6606 for (unsigned I : seq<unsigned>(1, Sz)) {
6607 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6608 std::optional<int64_t> OptPos =
6609 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6610 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6611 return false;
6612 unsigned Pos = static_cast<unsigned>(*OptPos);
6613 CompressMask[I] = Pos;
6614 if (!Stride)
6615 continue;
6616 if (*Stride == 0) {
6617 *Stride = Pos;
6618 continue;
6619 }
6620 if (Pos != *Stride * I)
6621 Stride.reset();
6622 }
6623 return Stride.has_value();
6624}
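// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// buildCompressMask above, with the getPointersDiff queries replaced by
// precomputed element distances from the first (ordered) pointer; at least two
// positions are assumed. It fills the compress mask and reports whether the
// gaps form a constant stride.
#include <optional>
#include <vector>

static bool buildCompressMaskSketch(const std::vector<unsigned> &Positions,
                                    std::vector<int> &CompressMask) {
  const unsigned Sz = Positions.size();
  CompressMask.assign(Sz, -1);
  CompressMask[0] = 0;                // the first element is always lane 0
  std::optional<unsigned> Stride = 0; // 0 means "stride not fixed yet"
  for (unsigned I = 1; I < Sz; ++I) {
    unsigned Pos = Positions[I];
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (*Stride == 0) {
      *Stride = Pos;                  // the first non-zero gap fixes the stride
      continue;
    }
    if (Pos != *Stride * I)
      Stride.reset();                 // irregular gaps - not a strided access
  }
  return Stride.has_value();
}
// Example: Positions {0, 2, 4, 6} -> mask {0, 2, 4, 6}, strided (stride 2);
//          Positions {0, 1, 3, 7} -> mask {0, 1, 3, 7}, not strided.
// --- end of sketch ---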
6625
6626/// Checks if the \p VL can be transformed to a (masked)load + compress or
6627/// (masked) interleaved load.
6628static bool isMaskedLoadCompress(
6629 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6630 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6631 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6632 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6633 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6634 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6635 VectorType *&LoadVecTy) {
6636 InterleaveFactor = 0;
6637 Type *ScalarTy = VL.front()->getType();
6638 const size_t Sz = VL.size();
6639 auto *VecTy = getWidenedType(ScalarTy, Sz);
6640 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6641 SmallVector<int> Mask;
6642 if (!Order.empty())
6643 inversePermutation(Order, Mask);
6644 // Check external uses.
6645 for (const auto [I, V] : enumerate(VL)) {
6646 if (AreAllUsersVectorized(V))
6647 continue;
6648 InstructionCost ExtractCost =
6649 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6650 Mask.empty() ? I : Mask[I]);
6651 InstructionCost ScalarCost =
6652 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6653 if (ExtractCost <= ScalarCost)
6654 return false;
6655 }
6656 Value *Ptr0;
6657 Value *PtrN;
6658 if (Order.empty()) {
6659 Ptr0 = PointerOps.front();
6660 PtrN = PointerOps.back();
6661 } else {
6662 Ptr0 = PointerOps[Order.front()];
6663 PtrN = PointerOps[Order.back()];
6664 }
6665 std::optional<int64_t> Diff =
6666 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6667 if (!Diff)
6668 return false;
6669 const size_t MaxRegSize =
6670 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
6671 .getFixedValue();
6672 // Check for very large distances between elements.
6673 if (*Diff / Sz >= MaxRegSize / 8)
6674 return false;
6675 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6676 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6677 Align CommonAlignment = LI->getAlign();
6678 IsMasked = !isSafeToLoadUnconditionally(
6679 Ptr0, LoadVecTy, CommonAlignment, DL,
6680 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6681 &TLI);
6682 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6683 LI->getPointerAddressSpace()))
6684 return false;
6685 // TODO: perform the analysis of each scalar load for better
6686 // safe-load-unconditionally analysis.
6687 bool IsStrided =
6688 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6689 assert(CompressMask.size() >= 2 && "At least two elements are required");
6690 SmallVector<Value *> OrderedPointerOps(PointerOps);
6691 if (!Order.empty())
6692 reorderScalars(OrderedPointerOps, Mask);
6693 auto [ScalarGEPCost, VectorGEPCost] =
6694 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6695 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6696 // The cost of scalar loads.
6697 InstructionCost ScalarLoadsCost =
6698 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6699 [&](InstructionCost C, Value *V) {
6700 return C + TTI.getInstructionCost(cast<Instruction>(V),
6701 CostKind);
6702 }) +
6703 ScalarGEPCost;
6704 APInt DemandedElts = APInt::getAllOnes(Sz);
6705 InstructionCost GatherCost =
6706 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6707 /*Insert=*/true,
6708 /*Extract=*/false, CostKind) +
6709 ScalarLoadsCost;
6710 InstructionCost LoadCost = 0;
6711 if (IsMasked) {
6712 LoadCost =
6713 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6714 LI->getPointerAddressSpace(), CostKind);
6715 } else {
6716 LoadCost =
6717 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6718 LI->getPointerAddressSpace(), CostKind);
6719 }
6720 if (IsStrided && !IsMasked && Order.empty()) {
6721 // Check for potential segmented(interleaved) loads.
6722 VectorType *AlignedLoadVecTy = getWidenedType(
6723 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6724 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6725 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6726 &TLI))
6727 AlignedLoadVecTy = LoadVecTy;
6728 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6729 CommonAlignment,
6730 LI->getPointerAddressSpace())) {
6731 InstructionCost InterleavedCost =
6732 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6733 Instruction::Load, AlignedLoadVecTy,
6734 CompressMask[1], {}, CommonAlignment,
6735 LI->getPointerAddressSpace(), CostKind, IsMasked);
6736 if (InterleavedCost < GatherCost) {
6737 InterleaveFactor = CompressMask[1];
6738 LoadVecTy = AlignedLoadVecTy;
6739 return true;
6740 }
6741 }
6742 }
6743 InstructionCost CompressCost = ::getShuffleCost(
6744 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6745 if (!Order.empty()) {
6746 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6747 for (unsigned I : seq<unsigned>(Sz)) {
6748 NewMask[I] = CompressMask[Mask[I]];
6749 }
6750 CompressMask.swap(NewMask);
6751 }
6752 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6753 return TotalVecCost < GatherCost;
6754}
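// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The final comparison made by isMaskedLoadCompress above, with all of the TTI
// cost queries folded into plain integer inputs (the field names below are
// hypothetical stand-ins, not LLVM APIs). Vectorization wins when one wide
// (possibly masked) load plus a single compress shuffle is cheaper than the
// scalar loads plus the build-vector that a gather would need.
#include <cstdint>

struct CompressLoadCostsSketch {
  int64_t VectorGEPCost;       // cost of forming the vector/base pointer
  int64_t WideLoadCost;        // masked or plain wide load
  int64_t CompressShuffleCost; // single-source compress shuffle
  int64_t ScalarLoadsCost;     // sum of the scalar loads (+ scalar GEPs)
  int64_t BuildVectorCost;     // inserting the loaded scalars into a vector
};

static bool preferCompressLoadSketch(const CompressLoadCostsSketch &C) {
  int64_t TotalVecCost =
      C.VectorGEPCost + C.WideLoadCost + C.CompressShuffleCost;
  int64_t GatherCost = C.ScalarLoadsCost + C.BuildVectorCost;
  return TotalVecCost < GatherCost;
}
// --- end of sketch ---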
6755
6756/// Checks if the \p VL can be transformed to a (masked)load + compress or
6757/// (masked) interleaved load.
6758static bool
6759isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6760 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6761 const DataLayout &DL, ScalarEvolution &SE,
6762 AssumptionCache &AC, const DominatorTree &DT,
6763 const TargetLibraryInfo &TLI,
6764 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6765 bool IsMasked;
6766 unsigned InterleaveFactor;
6767 SmallVector<int> CompressMask;
6768 VectorType *LoadVecTy;
6769 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6770 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6771 CompressMask, LoadVecTy);
6772}
6773
6774/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6775/// PointerOps:
6776/// 1. Target with strided load support is detected.
6777/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6778/// potential stride <= MaxProfitableLoadStride and the potential stride is
6779/// power-of-2 (to avoid perf regressions for the very small number of loads)
6780/// and max distance > number of loads, or potential stride is -1.
6781/// 3. The loads are ordered, or number of unordered loads <=
6782/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6783/// to avoid extra costs for very expensive shuffles).
6784/// 4. Any pointer operand is an instruction with the users outside of the
6785/// current graph (for masked gathers extra extractelement instructions
6786/// might be required).
6787static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6788 ArrayRef<unsigned> Order,
6789 const TargetTransformInfo &TTI, const DataLayout &DL,
6790 ScalarEvolution &SE,
6791 const bool IsAnyPointerUsedOutGraph,
6792 const int64_t Diff) {
6793 const size_t Sz = VL.size();
6794 const uint64_t AbsoluteDiff = std::abs(Diff);
6795 Type *ScalarTy = VL.front()->getType();
6796 auto *VecTy = getWidenedType(ScalarTy, Sz);
6797 if (IsAnyPointerUsedOutGraph ||
6798 (AbsoluteDiff > Sz &&
6799 (Sz > MinProfitableStridedLoads ||
6800 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6801 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6802 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6803 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6804 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6805 return false;
6806 Align Alignment =
6807 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
6808 ->getAlign();
6809 if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
6810 return false;
6811 Value *Ptr0;
6812 Value *PtrN;
6813 if (Order.empty()) {
6814 Ptr0 = PointerOps.front();
6815 PtrN = PointerOps.back();
6816 } else {
6817 Ptr0 = PointerOps[Order.front()];
6818 PtrN = PointerOps[Order.back()];
6819 }
6820 // Iterate through all pointers and check if all distances are
6821 // unique multiple of Dist.
6822 SmallSet<int64_t, 4> Dists;
6823 for (Value *Ptr : PointerOps) {
6824 int64_t Dist = 0;
6825 if (Ptr == PtrN)
6826 Dist = Diff;
6827 else if (Ptr != Ptr0)
6828 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6829 // If the strides are not the same or repeated, we can't
6830 // vectorize.
6831 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6832 break;
6833 }
6834 if (Dists.size() == Sz)
6835 return true;
6836 }
6837 return false;
6838}
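// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The stride sanity checks from isStridedLoad above, reduced to the distance
// arithmetic. Diff is the (element) distance between the first and the last
// pointer, Dists holds the distance of every pointer from the first one.
// Target legality and alignment checks are omitted.
#include <cstdint>
#include <set>
#include <vector>

static bool stridedDistancesSketch(int64_t Diff,
                                   const std::vector<int64_t> &Dists) {
  const int64_t Sz = static_cast<int64_t>(Dists.size());
  if (Sz < 2 || Diff == 0)
    return false;
  const int64_t Stride = Diff / (Sz - 1);
  if (Diff != Stride * (Sz - 1))
    return false; // the total distance is not evenly divisible
  std::set<int64_t> Seen;
  for (int64_t Dist : Dists)
    // Every pointer must sit at a unique multiple of the stride.
    if ((Dist / Stride) * Stride != Dist || !Seen.insert(Dist).second)
      return false;
  return Seen.size() == static_cast<size_t>(Sz);
}
// Example: Diff = 9, Dists = {0, 3, 6, 9} -> true (stride 3);
//          Diff = 9, Dists = {0, 3, 5, 9} -> false.
// --- end of sketch ---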
6839
6843 SmallVectorImpl<Value *> &PointerOps,
6844 unsigned *BestVF, bool TryRecursiveCheck) const {
6845 // Check that a vectorized load would load the same memory as a scalar
6846 // load. For example, we don't want to vectorize loads that are smaller
6847 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6848 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6849 // from such a struct, we read/write packed bits disagreeing with the
6850 // unvectorized version.
6851 if (BestVF)
6852 *BestVF = 0;
6854 return LoadsState::Gather;
6855 Type *ScalarTy = VL0->getType();
6856
6857 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
6858 return LoadsState::Gather;
6859
6860 // Make sure all loads in the bundle are simple - we can't vectorize
6861 // atomic or volatile loads.
6862 PointerOps.clear();
6863 const size_t Sz = VL.size();
6864 PointerOps.resize(Sz);
6865 auto *POIter = PointerOps.begin();
6866 for (Value *V : VL) {
6867 auto *L = dyn_cast<LoadInst>(V);
6868 if (!L || !L->isSimple())
6869 return LoadsState::Gather;
6870 *POIter = L->getPointerOperand();
6871 ++POIter;
6872 }
6873
6874 Order.clear();
6875 // Check the order of pointer operands or that all pointers are the same.
6876 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
6877
6878 auto *VecTy = getWidenedType(ScalarTy, Sz);
6879 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6880 if (!IsSorted) {
6881 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
6882 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
6883 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
6884 return LoadsState::StridedVectorize;
6885 }
6886
6887 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6888 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6889 return LoadsState::Gather;
6890
6891 if (!all_of(PointerOps, [&](Value *P) {
6892 return arePointersCompatible(P, PointerOps.front(), *TLI);
6893 }))
6894 return LoadsState::Gather;
6895
6896 } else {
6897 Value *Ptr0;
6898 Value *PtrN;
6899 if (Order.empty()) {
6900 Ptr0 = PointerOps.front();
6901 PtrN = PointerOps.back();
6902 } else {
6903 Ptr0 = PointerOps[Order.front()];
6904 PtrN = PointerOps[Order.back()];
6905 }
6906 std::optional<int64_t> Diff =
6907 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6908 // Check that the sorted loads are consecutive.
6909 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6910 return LoadsState::Vectorize;
6911 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
6912 *TLI, [&](Value *V) {
6913 return areAllUsersVectorized(
6914 cast<Instruction>(V), UserIgnoreList);
6915 }))
6916 return LoadsState::CompressVectorize;
6917 // Simple check if not a strided access - clear order.
6918 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
6919 // Try to generate strided load node.
6920 auto IsAnyPointerUsedOutGraph =
6921 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
6922 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6923 return !isVectorized(U) && !MustGather.contains(U);
6924 });
6925 });
6926 if (IsPossibleStrided &&
6927 isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
6928 IsAnyPointerUsedOutGraph, *Diff))
6929 return LoadsState::StridedVectorize;
6930 }
6931 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
6932 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
6933 return LoadsState::Gather;
6934 // Correctly compare the cost of loads + shuffles against strided/masked
6935 // gather loads. Returns true if the vectorized + shuffles representation
6936 // is better than just gather.
6937 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6938 unsigned *BestVF,
6939 bool ProfitableGatherPointers) {
6940 if (BestVF)
6941 *BestVF = 0;
6942 // Compare masked gather cost and loads + insert subvector costs.
6944 auto [ScalarGEPCost, VectorGEPCost] =
6945 getGEPCosts(TTI, PointerOps, PointerOps.front(),
6946 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6947 // Estimate the cost of masked gather GEP. If not a splat, roughly
6948 // estimate as a buildvector, otherwise estimate as splat.
6949 APInt DemandedElts = APInt::getAllOnes(Sz);
6950 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6951 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
6952 if (static_cast<unsigned>(count_if(
6953 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6954 any_of(PointerOps, [&](Value *V) {
6955 return getUnderlyingObject(V) !=
6956 getUnderlyingObject(PointerOps.front());
6957 }))
6958 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
6959 DemandedElts, /*Insert=*/true,
6960 /*Extract=*/false, CostKind);
6961 else
6962 VectorGEPCost +=
6963 getScalarizationOverhead(
6964 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
6965 /*Insert=*/true, /*Extract=*/false, CostKind) +
6966 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
6967 // The cost of scalar loads.
6968 InstructionCost ScalarLoadsCost =
6969 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6970 [&](InstructionCost C, Value *V) {
6971 return C + TTI.getInstructionCost(
6972 cast<Instruction>(V), CostKind);
6973 }) +
6974 ScalarGEPCost;
6975 // The cost of masked gather.
6976 InstructionCost MaskedGatherCost =
6977 TTI.getGatherScatterOpCost(
6978 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
6979 /*VariableMask=*/false, CommonAlignment, CostKind) +
6980 (ProfitableGatherPointers ? 0 : VectorGEPCost);
6981 InstructionCost GatherCost =
6982 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6983 /*Insert=*/true,
6984 /*Extract=*/false, CostKind) +
6985 ScalarLoadsCost;
6986 // The list of loads is small, or a partial check was already performed -
6987 // directly compare the masked gather cost and the gather cost.
6988 constexpr unsigned ListLimit = 4;
6989 if (!TryRecursiveCheck || VL.size() < ListLimit)
6990 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
6991
6992 // FIXME: The following code has not been updated for non-power-of-2
6993 // vectors (and not whole registers). The splitting logic here does not
6994 // cover the original vector if the vector factor is not a power of two.
6995 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
6996 return false;
6997
6998 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
6999 unsigned MinVF = getMinVF(2 * Sz);
7000 DemandedElts.clearAllBits();
7001 // Iterate through possible vectorization factors and check if vectorized +
7002 // shuffles is better than just gather.
7003 for (unsigned VF =
7004 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7005 VF >= MinVF;
7006 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7007 SmallVector<LoadsState> States;
7008 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7009 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7010 OrdersType Order;
7011 SmallVector<Value *> PointerOps;
7012 LoadsState LS =
7013 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
7014 /*TryRecursiveCheck=*/false);
7015 // Check that the sorted loads are consecutive.
7016 if (LS == LoadsState::Gather) {
7017 if (BestVF) {
7018 DemandedElts.setAllBits();
7019 break;
7020 }
7021 DemandedElts.setBits(Cnt, Cnt + VF);
7022 continue;
7023 }
7024 // If a reorder is needed - consider it a high-cost masked gather for now.
7025 if ((LS == LoadsState::Vectorize ||
7026 LS == LoadsState::StridedVectorize ||
7027 LS == LoadsState::CompressVectorize) &&
7028 !Order.empty() && !isReverseOrder(Order))
7029 LS = LoadsState::ScatterVectorize;
7030 States.push_back(LS);
7031 }
7032 if (DemandedElts.isAllOnes())
7033 // All loads gathered - try smaller VF.
7034 continue;
7035 // Can be vectorized later as a series of loads/insertelements.
7036 InstructionCost VecLdCost = 0;
7037 if (!DemandedElts.isZero()) {
7038 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7039 /*Insert=*/true,
7040 /*Extract=*/false, CostKind) +
7041 ScalarGEPCost;
7042 for (unsigned Idx : seq<unsigned>(VL.size()))
7043 if (DemandedElts[Idx])
7044 VecLdCost +=
7045 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7046 }
7047 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7048 for (auto [I, LS] : enumerate(States)) {
7049 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7050 InstructionCost VectorGEPCost =
7051 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7052 ? 0
7053 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7054 LI0->getPointerOperand(),
7055 Instruction::GetElementPtr, CostKind, ScalarTy,
7056 SubVecTy)
7057 .second;
7058 if (LS == LoadsState::ScatterVectorize) {
7059 if (static_cast<unsigned>(
7060 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7061 PointerOps.size() - 1 ||
7062 any_of(PointerOps, [&](Value *V) {
7063 return getUnderlyingObject(V) !=
7064 getUnderlyingObject(PointerOps.front());
7065 }))
7066 VectorGEPCost += getScalarizationOverhead(
7067 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7068 /*Insert=*/true, /*Extract=*/false, CostKind);
7069 else
7070 VectorGEPCost +=
7072 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7073 /*Insert=*/true, /*Extract=*/false, CostKind) +
7074 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7075 CostKind);
7076 }
7077 switch (LS) {
7078 case LoadsState::Vectorize:
7079 VecLdCost +=
7080 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7081 LI0->getPointerAddressSpace(), CostKind,
7082 TTI::OperandValueInfo()) +
7083 VectorGEPCost;
7084 break;
7085 case LoadsState::StridedVectorize:
7086 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7087 LI0->getPointerOperand(),
7088 /*VariableMask=*/false,
7089 CommonAlignment, CostKind) +
7090 VectorGEPCost;
7091 break;
7092 case LoadsState::CompressVectorize:
7093 VecLdCost += TTI.getMaskedMemoryOpCost(
7094 Instruction::Load, SubVecTy, CommonAlignment,
7095 LI0->getPointerAddressSpace(), CostKind) +
7096 VectorGEPCost +
7097 ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
7098 {}, CostKind);
7099 break;
7100 case LoadsState::ScatterVectorize:
7101 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7102 LI0->getPointerOperand(),
7103 /*VariableMask=*/false,
7104 CommonAlignment, CostKind) +
7105 VectorGEPCost;
7106 break;
7107 case LoadsState::Gather:
7108 // Gathers are already calculated - ignore.
7109 continue;
7110 }
7111 SmallVector<int> ShuffleMask(VL.size());
7112 for (int Idx : seq<int>(0, VL.size()))
7113 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7114 if (I > 0)
7115 VecLdCost +=
7116 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7117 CostKind, I * VF, SubVecTy);
7118 }
7119 // If masked gather cost is higher - better to vectorize, so
7120 // consider it as a gather node. It will be better estimated
7121 // later.
7122 if (MaskedGatherCost >= VecLdCost &&
7123 VecLdCost - GatherCost < -SLPCostThreshold) {
7124 if (BestVF)
7125 *BestVF = VF;
7126 return true;
7127 }
7128 }
7129 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7130 };
7131 // TODO: need to improve analysis of the pointers, if not all of them are
7132 // GEPs or have > 2 operands, we end up with a gather node, which just
7133 // increases the cost.
7134 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7135 bool ProfitableGatherPointers =
7136 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7137 return L->isLoopInvariant(V);
7138 })) <= Sz / 2;
7139 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7140 auto *GEP = dyn_cast<GetElementPtrInst>(P);
7141 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7142 (GEP && GEP->getNumOperands() == 2 &&
7143 isa<Constant, Instruction>(GEP->getOperand(1)));
7144 })) {
7145 // Check if potential masked gather can be represented as series
7146 // of loads + insertsubvectors.
7147 // If masked gather cost is higher - better to vectorize, so
7148 // consider it as a gather node. It will be better estimated
7149 // later.
7150 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7151 ProfitableGatherPointers))
7152 return LoadsState::ScatterVectorize;
7153 }
7154
7155 return LoadsState::Gather;
7156}
7157
7158static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7159 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7160 const DataLayout &DL, ScalarEvolution &SE,
7161 SmallVectorImpl<unsigned> &SortedIndices) {
7162 assert(
7163 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7164 "Expected list of pointer operands.");
7165 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
7166 // Ptr into, sort and return the sorted indices with values next to one
7167 // another.
7169 std::pair<BasicBlock *, Value *>,
7171 Bases;
7172 Bases
7173 .try_emplace(std::make_pair(
7175 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7176
7177 SortedIndices.clear();
7178 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7179 auto Key = std::make_pair(BBs[Cnt + 1],
7181 bool Found = any_of(Bases.try_emplace(Key).first->second,
7182 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7183 std::optional<int64_t> Diff =
7184 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7185 ElemTy, Ptr, DL, SE,
7186 /*StrictCheck=*/true);
7187 if (!Diff)
7188 return false;
7189
7190 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7191 return true;
7192 });
7193
7194 if (!Found) {
7195 // If we haven't found enough to usefully cluster, return early.
7196 if (Bases.size() > VL.size() / 2 - 1)
7197 return false;
7198
7199 // Not found already - add a new Base
7200 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7201 }
7202 }
7203
7204 if (Bases.size() == VL.size())
7205 return false;
7206
7207 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7208 Bases.front().second.size() == VL.size()))
7209 return false;
7210
7211 // For each of the bases sort the pointers by Offset and check if any of
7212 // the bases become consecutively allocated.
7213 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7214 SmallPtrSet<Value *, 13> FirstPointers;
7215 SmallPtrSet<Value *, 13> SecondPointers;
7216 Value *P1 = Ptr1;
7217 Value *P2 = Ptr2;
7218 unsigned Depth = 0;
7219 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7220 if (P1 == P2 || Depth > RecursionMaxDepth)
7221 return false;
7222 FirstPointers.insert(P1);
7223 SecondPointers.insert(P2);
7224 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7225 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7226 ++Depth;
7227 }
7228 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7229 "Unable to find matching root.");
7230 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7231 };
7232 for (auto &Base : Bases) {
7233 for (auto &Vec : Base.second) {
7234 if (Vec.size() > 1) {
7236 int64_t InitialOffset = std::get<1>(Vec[0]);
7237 bool AnyConsecutive =
7238 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7239 return std::get<1>(P.value()) ==
7240 int64_t(P.index()) + InitialOffset;
7241 });
7242 // Fill SortedIndices array only if it looks worth-while to sort the
7243 // ptrs.
7244 if (!AnyConsecutive)
7245 return false;
7246 }
7247 }
7248 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7249 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7250 });
7251 }
7252
7253 for (auto &T : Bases)
7254 for (const auto &Vec : T.second)
7255 for (const auto &P : Vec)
7256 SortedIndices.push_back(std::get<2>(P));
7257
7258 assert(SortedIndices.size() == VL.size() &&
7259 "Expected SortedIndices to be the size of VL");
7260 return true;
7261}
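// --- Illustrative sketch (editor addition, not part of SLPVectorizer.cpp) ---
// The final step of clusterSortPtrAccesses above, with each base reduced to a
// list of (Offset, OrigIdx) pairs. A base is usable only if its offsets form a
// consecutive run once sorted by offset, and the sorted indices are the
// concatenation of the per-base original indices. The base-vs-base ordering
// done via ComparePointers on the underlying objects is omitted here.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

static bool emitClusteredIndicesSketch(
    std::vector<std::vector<std::pair<int64_t, int>>> Bases, // (Offset, OrigIdx)
    std::vector<int> &SortedIndices) {
  SortedIndices.clear();
  for (auto &Vec : Bases) {
    std::sort(Vec.begin(), Vec.end()); // order each base's entries by offset
    const int64_t InitialOffset = Vec.front().first;
    for (size_t I = 0, E = Vec.size(); I < E; ++I)
      if (Vec[I].first != InitialOffset + static_cast<int64_t>(I))
        return false; // offsets within a base must be consecutive
    for (const auto &P : Vec)
      SortedIndices.push_back(P.second);
  }
  return true;
}
// --- end of sketch ---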
7262
7263std::optional<BoUpSLP::OrdersType>
7264BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7265 assert(TE.isGather() && "Expected gather node only.");
7266 Type *ScalarTy = TE.Scalars[0]->getType();
7267
7268 SmallVector<Value *> Ptrs;
7269 Ptrs.reserve(TE.Scalars.size());
7270 SmallVector<BasicBlock *> BBs;
7271 BBs.reserve(TE.Scalars.size());
7272 for (Value *V : TE.Scalars) {
7273 auto *L = dyn_cast<LoadInst>(V);
7274 if (!L || !L->isSimple())
7275 return std::nullopt;
7276 Ptrs.push_back(L->getPointerOperand());
7277 BBs.push_back(L->getParent());
7278 }
7279
7280 BoUpSLP::OrdersType Order;
7281 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7282 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7283 return std::move(Order);
7284 return std::nullopt;
7285}
7286
7287/// Check if two insertelement instructions are from the same buildvector.
7288static bool areTwoInsertFromSameBuildVector(
7289 InsertElementInst *VU, InsertElementInst *V,
7290 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7291 // Instructions must be from the same basic blocks.
7292 if (VU->getParent() != V->getParent())
7293 return false;
7294 // Checks if 2 insertelements are from the same buildvector.
7295 if (VU->getType() != V->getType())
7296 return false;
7297 // Multiple used inserts are separate nodes.
7298 if (!VU->hasOneUse() && !V->hasOneUse())
7299 return false;
7300 auto *IE1 = VU;
7301 auto *IE2 = V;
7302 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7303 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7304 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7305 return false;
7306 // Go through the vector operand of insertelement instructions trying to find
7307 // either VU as the original vector for IE2 or V as the original vector for
7308 // IE1.
7309 SmallBitVector ReusedIdx(
7310 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7311 bool IsReusedIdx = false;
7312 do {
7313 if (IE2 == VU && !IE1)
7314 return VU->hasOneUse();
7315 if (IE1 == V && !IE2)
7316 return V->hasOneUse();
7317 if (IE1 && IE1 != V) {
7318 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7319 IsReusedIdx |= ReusedIdx.test(Idx1);
7320 ReusedIdx.set(Idx1);
7321 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7322 IE1 = nullptr;
7323 else
7324 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7325 }
7326 if (IE2 && IE2 != VU) {
7327 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7328 IsReusedIdx |= ReusedIdx.test(Idx2);
7329 ReusedIdx.set(Idx2);
7330 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7331 IE2 = nullptr;
7332 else
7333 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7334 }
7335 } while (!IsReusedIdx && (IE1 || IE2));
7336 return false;
7337}
7338
7339/// Checks if the specified instruction \p I is an alternate operation for
7340/// the given \p MainOp and \p AltOp instructions.
7341static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7342 Instruction *AltOp,
7343 const TargetLibraryInfo &TLI);
7344
7345std::optional<BoUpSLP::OrdersType>
7346BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7347 bool IgnoreReorder) {
7348 // No need to reorder if we need to shuffle reuses; the node still needs to
7349 // be shuffled.
7350 if (!TE.ReuseShuffleIndices.empty()) {
7351 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7352 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7353 "Reshuffling scalars not yet supported for nodes with padding");
7354
7355 if (isSplat(TE.Scalars))
7356 return std::nullopt;
7357 // Check if reuse shuffle indices can be improved by reordering.
7358 // For this, check that the reuse mask is "clustered", i.e. each scalar
7359 // value is used once in each submask of size <number_of_scalars>.
7360 // Example: 4 scalar values.
7361 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7362 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7363 // element 3 is used twice in the second submask.
7364 unsigned Sz = TE.Scalars.size();
7365 if (TE.isGather()) {
7366 if (std::optional<OrdersType> CurrentOrder =
7367 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7368 SmallVector<int> Mask;
7369 fixupOrderingIndices(*CurrentOrder);
7370 inversePermutation(*CurrentOrder, Mask);
7371 ::addMask(Mask, TE.ReuseShuffleIndices);
7372 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7373 unsigned Sz = TE.Scalars.size();
7374 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7375 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7376 if (Idx != PoisonMaskElem)
7377 Res[Idx + K * Sz] = I + K * Sz;
7378 }
7379 return std::move(Res);
7380 }
7381 }
7382 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7383 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7384 2 * TE.getVectorFactor())) == 1)
7385 return std::nullopt;
7386 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7387 return std::nullopt;
7388 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7389 Sz)) {
7390 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7391 if (TE.ReorderIndices.empty())
7392 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7393 else
7394 inversePermutation(TE.ReorderIndices, ReorderMask);
7395 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7396 unsigned VF = ReorderMask.size();
7397 OrdersType ResOrder(VF, VF);
7398 unsigned NumParts = divideCeil(VF, Sz);
7399 SmallBitVector UsedVals(NumParts);
7400 for (unsigned I = 0; I < VF; I += Sz) {
7401 int Val = PoisonMaskElem;
7402 unsigned UndefCnt = 0;
7403 unsigned Limit = std::min(Sz, VF - I);
7404 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7405 [&](int Idx) {
7406 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7407 Val = Idx;
7408 if (Idx == PoisonMaskElem)
7409 ++UndefCnt;
7410 return Idx != PoisonMaskElem && Idx != Val;
7411 }) ||
7412 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7413 UndefCnt > Sz / 2)
7414 return std::nullopt;
7415 UsedVals.set(Val);
7416 for (unsigned K = 0; K < NumParts; ++K) {
7417 unsigned Idx = Val + Sz * K;
7418 if (Idx < VF && I + K < VF)
7419 ResOrder[Idx] = I + K;
7420 }
7421 }
7422 return std::move(ResOrder);
7423 }
7424 unsigned VF = TE.getVectorFactor();
7425 // Try to build the correct order for extractelement instructions.
7426 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7427 TE.ReuseShuffleIndices.end());
7428 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7429 all_of(TE.Scalars, [Sz](Value *V) {
7430 if (isa<PoisonValue>(V))
7431 return true;
7432 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7433 return Idx && *Idx < Sz;
7434 })) {
7435 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7436 "by BinaryOperator and CastInst.");
7437 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7438 if (TE.ReorderIndices.empty())
7439 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7440 else
7441 inversePermutation(TE.ReorderIndices, ReorderMask);
7442 for (unsigned I = 0; I < VF; ++I) {
7443 int &Idx = ReusedMask[I];
7444 if (Idx == PoisonMaskElem)
7445 continue;
7446 Value *V = TE.Scalars[ReorderMask[Idx]];
7447 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7448 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7449 }
7450 }
7451 // Build the order of VF size; reuse shuffles need to be reordered, as they
7452 // are always of VF size.
7453 OrdersType ResOrder(VF);
7454 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7455 auto *It = ResOrder.begin();
7456 for (unsigned K = 0; K < VF; K += Sz) {
7457 OrdersType CurrentOrder(TE.ReorderIndices);
7458 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7459 if (SubMask.front() == PoisonMaskElem)
7460 std::iota(SubMask.begin(), SubMask.end(), 0);
7461 reorderOrder(CurrentOrder, SubMask);
7462 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7463 std::advance(It, Sz);
7464 }
7465 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7466 return Data.index() == Data.value();
7467 }))
7468 return std::nullopt; // No need to reorder.
7469 return std::move(ResOrder);
7470 }
7471 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7472 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7473 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7474 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7475 return std::nullopt;
7476 if (TE.State == TreeEntry::SplitVectorize ||
7477 ((TE.State == TreeEntry::Vectorize ||
7478 TE.State == TreeEntry::StridedVectorize ||
7479 TE.State == TreeEntry::CompressVectorize) &&
7480 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7481 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7482 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7483 "Alternate instructions are only supported by "
7484 "BinaryOperator and CastInst.");
7485 return TE.ReorderIndices;
7486 }
7487 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7488 TE.isAltShuffle()) {
7489 assert(TE.ReuseShuffleIndices.empty() &&
7490 "ReuseShuffleIndices should be "
7491 "empty for alternate instructions.");
7492 SmallVector<int> Mask;
7493 TE.buildAltOpShuffleMask(
7494 [&](Instruction *I) {
7495 assert(TE.getMatchingMainOpOrAltOp(I) &&
7496 "Unexpected main/alternate opcode");
7497 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7498 },
7499 Mask);
7500 const int VF = TE.getVectorFactor();
7501 OrdersType ResOrder(VF, VF);
7502 for (unsigned I : seq<unsigned>(VF)) {
7503 if (Mask[I] == PoisonMaskElem)
7504 continue;
7505 ResOrder[Mask[I] % VF] = I;
7506 }
7507 return std::move(ResOrder);
7508 }
7509 if (!TE.ReorderIndices.empty())
7510 return TE.ReorderIndices;
7511 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7512 if (!TE.ReorderIndices.empty())
7513 return TE.ReorderIndices;
7514
7515 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7516 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7517 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7518 continue;
7519 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7520 if (!II)
7521 continue;
7522 Instruction *BVHead = nullptr;
7523 BasicBlock *BB = II->getParent();
7524 while (II && II->hasOneUse() && II->getParent() == BB) {
7525 BVHead = II;
7526 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7527 }
7528 I = BVHead;
7529 }
7530
7531 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7532 assert(BB1 != BB2 && "Expected different basic blocks.");
7533 if (!DT->isReachableFromEntry(BB1))
7534 return false;
7535 if (!DT->isReachableFromEntry(BB2))
7536 return true;
7537 auto *NodeA = DT->getNode(BB1);
7538 auto *NodeB = DT->getNode(BB2);
7539 assert(NodeA && "Should only process reachable instructions");
7540 assert(NodeB && "Should only process reachable instructions");
7541 assert((NodeA == NodeB) ==
7542 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7543 "Different nodes should have different DFS numbers");
7544 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7545 };
7546 auto PHICompare = [&](unsigned I1, unsigned I2) {
7547 Value *V1 = TE.Scalars[I1];
7548 Value *V2 = TE.Scalars[I2];
7549 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7550 return false;
7551 if (isa<PoisonValue>(V1))
7552 return true;
7553 if (isa<PoisonValue>(V2))
7554 return false;
7555 if (V1->getNumUses() < V2->getNumUses())
7556 return true;
7557 if (V1->getNumUses() > V2->getNumUses())
7558 return false;
7559 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7560 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7561 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7562 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7563 FirstUserOfPhi2->getParent());
7564 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7565 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7566 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7567 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7568 if (IE1 && !IE2)
7569 return true;
7570 if (!IE1 && IE2)
7571 return false;
7572 if (IE1 && IE2) {
7573 if (UserBVHead[I1] && !UserBVHead[I2])
7574 return true;
7575 if (!UserBVHead[I1])
7576 return false;
7577 if (UserBVHead[I1] == UserBVHead[I2])
7578 return getElementIndex(IE1) < getElementIndex(IE2);
7579 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7580 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7581 UserBVHead[I2]->getParent());
7582 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7583 }
7584 if (EE1 && !EE2)
7585 return true;
7586 if (!EE1 && EE2)
7587 return false;
7588 if (EE1 && EE2) {
7589 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7590 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7591 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7592 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7593 if (!Inst2 && !P2)
7594 return Inst1 || P1;
7595 if (EE1->getOperand(0) == EE2->getOperand(0))
7596 return getElementIndex(EE1) < getElementIndex(EE2);
7597 if (!Inst1 && Inst2)
7598 return false;
7599 if (Inst1 && Inst2) {
7600 if (Inst1->getParent() != Inst2->getParent())
7601 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7602 return Inst1->comesBefore(Inst2);
7603 }
7604 if (!P1 && P2)
7605 return false;
7606 assert(P1 && P2 &&
7607 "Expected either instructions or arguments vector operands.");
7608 return P1->getArgNo() < P2->getArgNo();
7609 }
7610 return false;
7611 };
7612 OrdersType Phis(TE.Scalars.size());
7613 std::iota(Phis.begin(), Phis.end(), 0);
7614 stable_sort(Phis, PHICompare);
7615 if (isIdentityOrder(Phis))
7616 return std::nullopt; // No need to reorder.
7617 return std::move(Phis);
7618 }
7619 if (TE.isGather() &&
7620 (!TE.hasState() || !TE.isAltShuffle() ||
7621 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7622 allSameType(TE.Scalars)) {
7623 // TODO: add analysis of other gather nodes with extractelement
7624 // instructions and other values/instructions, not only undefs.
7625 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7626 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7627 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7628 all_of(TE.Scalars, [](Value *V) {
7629 auto *EE = dyn_cast<ExtractElementInst>(V);
7630 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7631 })) {
7632 // Check that gather of extractelements can be represented as
7633 // just a shuffle of a single vector.
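// Illustrative example (not from the original source): a gather of
// {x[1], x[0], x[3], x[2]} taken from a single source vector x is just
// shufflevector x, poison, <1, 0, 3, 2>, so CurrentOrder would be {1, 0, 3, 2}.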
7634 OrdersType CurrentOrder;
7635 bool Reuse =
7636 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7637 if (Reuse || !CurrentOrder.empty())
7638 return std::move(CurrentOrder);
7639 }
7640 // If the gather node is <undef, v, .., poison> and
7641 // insertelement poison, v, 0 [+ permute]
7642 // is cheaper than
7643 // insertelement poison, v, n - try to reorder.
7644 // If rotating the whole graph, exclude the permute cost, the whole graph
7645 // might be transformed.
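// Illustrative example (assumed values, not from the original source): for a
// gather <poison, poison, v, poison> (Idx == 2), compare the cost of
// "insertelement at lane 0 + permute" against "insertelement directly at
// lane 2"; if the former is cheaper, return the order that moves v to lane 0.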
7646 int Sz = TE.Scalars.size();
7647 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7648 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7649 const auto *It = find_if_not(TE.Scalars, isConstant);
7650 if (It == TE.Scalars.begin())
7651 return OrdersType();
7652 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7653 if (It != TE.Scalars.end()) {
7654 OrdersType Order(Sz, Sz);
7655 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7656 Order[Idx] = 0;
7657 fixupOrderingIndices(Order);
7658 SmallVector<int> Mask;
7659 inversePermutation(Order, Mask);
7660 InstructionCost PermuteCost =
7661 TopToBottom
7662 ? 0
7663 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7664 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7665 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7666 PoisonValue::get(Ty), *It);
7667 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7668 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7669 PoisonValue::get(Ty), *It);
7670 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7671 OrdersType Order(Sz, Sz);
7672 Order[Idx] = 0;
7673 return std::move(Order);
7674 }
7675 }
7676 }
7677 if (isSplat(TE.Scalars))
7678 return std::nullopt;
7679 if (TE.Scalars.size() >= 3)
7680 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7681 return Order;
7682 // Check if we can include the order of vectorized loads. For masked gathers
7683 // do extra analysis later, so include such nodes into a special list.
7684 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7685 SmallVector<Value *> PointerOps;
7686 OrdersType CurrentOrder;
7687 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7688 CurrentOrder, PointerOps);
7689 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
7690 Res == LoadsState::CompressVectorize)
7691 return std::move(CurrentOrder);
7692 }
7693 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7694 // has been audited for correctness with non-power-of-two vectors.
7695 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7696 if (std::optional<OrdersType> CurrentOrder =
7697 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7698 return CurrentOrder;
7699 }
7700 return std::nullopt;
7701}
7702
7703/// Checks if the given mask is a "clustered" mask with the same clusters of
7704/// size \p Sz, which are not identity submasks.
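/// Illustrative example (not from the original source): with \p Sz == 4 the
/// mask <1,0,3,2, 1,0,3,2> is such a clustered mask (the non-identity submask
/// <1,0,3,2> repeats), while <0,1,2,3, 0,1,2,3> (identity submask) and
/// <1,0,3,2, 3,2,1,0> (different clusters) are not.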
7705 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7706 unsigned Sz) {
7707 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7708 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7709 return false;
7710 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7711 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7712 if (Cluster != FirstCluster)
7713 return false;
7714 }
7715 return true;
7716}
7717
7718void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7719 // Reorder reuses mask.
7720 reorderReuses(TE.ReuseShuffleIndices, Mask);
7721 const unsigned Sz = TE.Scalars.size();
7722 // For vectorized and non-clustered reused scalars no need to do anything else.
7723 if (!TE.isGather() ||
7724 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7725 Sz) ||
7726 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7727 return;
7728 SmallVector<int> NewMask;
7729 inversePermutation(TE.ReorderIndices, NewMask);
7730 addMask(NewMask, TE.ReuseShuffleIndices);
7731 // Clear reorder since it is going to be applied to the new mask.
7732 TE.ReorderIndices.clear();
7733 // Try to improve gathered nodes with clustered reuses, if possible.
7734 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7735 SmallVector<unsigned> NewOrder(Slice);
7736 inversePermutation(NewOrder, NewMask);
7737 reorderScalars(TE.Scalars, NewMask);
7738 // Fill the reuses mask with the identity submasks.
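// E.g. (illustrative), with Sz == 4 and 8 reused elements the mask becomes
// {0, 1, 2, 3, 0, 1, 2, 3} after this loop.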
7739 for (auto *It = TE.ReuseShuffleIndices.begin(),
7740 *End = TE.ReuseShuffleIndices.end();
7741 It != End; std::advance(It, Sz))
7742 std::iota(It, std::next(It, Sz), 0);
7743}
7744
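// Combines \p Order with \p SecondaryOrder by filling unused (== size) slots.
// Illustrative example (assumed values, not from the original source): with
// size 4, Order = {2, 4, 4, 1} and SecondaryOrder = {2, 0, 3, 1} the result is
// {2, 0, 3, 1}; without a secondary order only slots whose own index is still
// unused are set to the identity.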
7745 static void combineOrders(MutableArrayRef<unsigned> Order,
7746 ArrayRef<unsigned> SecondaryOrder) {
7747 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7748 "Expected same size of orders");
7749 size_t Sz = Order.size();
7750 SmallBitVector UsedIndices(Sz);
7751 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7752 if (Order[Idx] != Sz)
7753 UsedIndices.set(Order[Idx]);
7754 }
7755 if (SecondaryOrder.empty()) {
7756 for (unsigned Idx : seq<unsigned>(0, Sz))
7757 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7758 Order[Idx] = Idx;
7759 } else {
7760 for (unsigned Idx : seq<unsigned>(0, Sz))
7761 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7762 !UsedIndices.test(SecondaryOrder[Idx]))
7763 Order[Idx] = SecondaryOrder[Idx];
7764 }
7765}
7766
7767 bool BoUpSLP::isProfitableToReorder() const {
7768 constexpr unsigned TinyVF = 2;
7769 constexpr unsigned TinyTree = 10;
7770 constexpr unsigned PhiOpsLimit = 12;
7771 constexpr unsigned GatherLoadsLimit = 2;
7772 if (VectorizableTree.size() <= TinyTree)
7773 return true;
7774 if (VectorizableTree.front()->hasState() &&
7775 !VectorizableTree.front()->isGather() &&
7776 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7777 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7778 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7779 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7780 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7781 VectorizableTree.front()->ReorderIndices.empty()) {
7782 // Check if the tree has only a single store and a single (unordered) load
7783 // node, other nodes are phis or geps/binops combined with phis, and/or a
7784 // single gather load node.
7785 if (VectorizableTree.front()->hasState() &&
7786 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7787 VectorizableTree.front()->Scalars.size() == TinyVF &&
7788 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7789 return false;
7790 // Single node which requires reordering - skip.
7791 if (VectorizableTree.front()->hasState() &&
7792 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7793 VectorizableTree.front()->ReorderIndices.empty()) {
7794 const unsigned ReorderedSplitsCnt =
7795 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7796 return TE->State == TreeEntry::SplitVectorize &&
7797 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7798 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7799 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7800 });
7801 if (ReorderedSplitsCnt <= 1 &&
7802 static_cast<unsigned>(count_if(
7803 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7804 return ((!TE->isGather() &&
7805 (TE->ReorderIndices.empty() ||
7806 (TE->UserTreeIndex.UserTE &&
7807 TE->UserTreeIndex.UserTE->State ==
7808 TreeEntry::Vectorize &&
7809 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7810 .empty()))) ||
7811 (TE->isGather() && TE->ReorderIndices.empty() &&
7812 (!TE->hasState() || TE->isAltShuffle() ||
7813 TE->getOpcode() == Instruction::Load ||
7814 TE->getOpcode() == Instruction::ZExt ||
7815 TE->getOpcode() == Instruction::SExt))) &&
7816 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7817 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7818 return !isConstant(V) && isVectorized(V);
7819 }));
7820 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7821 return false;
7822 }
7823 bool HasPhis = false;
7824 bool HasLoad = true;
7825 unsigned GatherLoads = 0;
7826 for (const std::unique_ptr<TreeEntry> &TE :
7827 ArrayRef(VectorizableTree).drop_front()) {
7828 if (TE->State == TreeEntry::SplitVectorize)
7829 continue;
7830 if (!TE->hasState()) {
7831 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7832 all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
7833 continue;
7834 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7835 any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
7836 continue;
7837 return true;
7838 }
7839 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7840 if (!TE->isGather()) {
7841 HasLoad = false;
7842 continue;
7843 }
7844 if (HasLoad)
7845 return true;
7846 ++GatherLoads;
7847 if (GatherLoads >= GatherLoadsLimit)
7848 return true;
7849 }
7850 if (TE->getOpcode() == Instruction::GetElementPtr ||
7851 Instruction::isBinaryOp(TE->getOpcode()))
7852 continue;
7853 if (TE->getOpcode() != Instruction::PHI &&
7854 (!TE->hasCopyableElements() ||
7855 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
7856 TE->Scalars.size() / 2))
7857 return true;
7858 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7859 TE->getNumOperands() > PhiOpsLimit)
7860 return false;
7861 HasPhis = true;
7862 }
7863 return !HasPhis;
7864 }
7865 return true;
7866}
7867
7868void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7869 ArrayRef<int> MaskOrder) {
7870 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7871 SmallVector<int> NewMask(getVectorFactor());
7872 SmallVector<int> NewMaskOrder(getVectorFactor());
7873 std::iota(NewMask.begin(), NewMask.end(), 0);
7874 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
7875 if (Idx == 0) {
7876 copy(Mask, NewMask.begin());
7877 copy(MaskOrder, NewMaskOrder.begin());
7878 } else {
7879 assert(Idx == 1 && "Expected either 0 or 1 index.");
7880 unsigned Offset = CombinedEntriesWithIndices.back().second;
7881 for (unsigned I : seq<unsigned>(Mask.size())) {
7882 NewMask[I + Offset] = Mask[I] + Offset;
7883 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7884 }
7885 }
7886 reorderScalars(Scalars, NewMask);
7887 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
7888 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
7889 ReorderIndices.clear();
7890}
7891
7892 void BoUpSLP::reorderTopToBottom() {
7893 // Maps VF to the graph nodes.
7894 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
7895 // ExtractElement gather nodes which can be vectorized and need to handle
7896 // their ordering.
7897 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7898
7899 // Phi nodes can have preferred ordering based on their result users.
7900 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7901
7902 // AltShuffles can also have a preferred ordering that leads to fewer
7903 // instructions, e.g., the addsub instruction in x86.
7904 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7905
7906 // Maps a TreeEntry to the reorder indices of external users.
7907 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
7908 ExternalUserReorderMap;
7909 // Find all reorderable nodes with the given VF.
7910 // Currently these are vectorized stores, loads, extracts + some gathering of
7911 // extracts.
7912 for_each(VectorizableTree, [&, &TTIRef = *TTI](
7913 const std::unique_ptr<TreeEntry> &TE) {
7914 // Look for external users that will probably be vectorized.
7915 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7916 findExternalStoreUsersReorderIndices(TE.get());
7917 if (!ExternalUserReorderIndices.empty()) {
7918 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7919 ExternalUserReorderMap.try_emplace(TE.get(),
7920 std::move(ExternalUserReorderIndices));
7921 }
7922
7923 // Patterns like [fadd,fsub] can be combined into a single instruction in
7924 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7925 // to take into account their order when looking for the most used order.
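// E.g. (illustrative), x86's addsub family alternates subtract/add across
// lanes, so keeping the original lane order lets the backend select a single
// addsub instead of separate fadd/fsub plus an extra shuffle.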
7926 if (TE->hasState() && TE->isAltShuffle() &&
7927 TE->State != TreeEntry::SplitVectorize) {
7928 Type *ScalarTy = TE->Scalars[0]->getType();
7929 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
7930 unsigned Opcode0 = TE->getOpcode();
7931 unsigned Opcode1 = TE->getAltOpcode();
7932 SmallBitVector OpcodeMask(
7933 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
7934 // If this pattern is supported by the target then we consider the order.
7935 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7936 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7937 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
7938 }
7939 // TODO: Check the reverse order too.
7940 }
7941
7942 bool IgnoreReorder =
7943 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7944 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7945 VectorizableTree.front()->getOpcode() == Instruction::Store);
7946 if (std::optional<OrdersType> CurrentOrder =
7947 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
7948 // Do not include ordering for nodes used in the alt opcode vectorization,
7949 // better to reorder them during the bottom-to-top stage. If we follow the
7950 // order here, it causes reordering of the whole graph, though actually it is
7951 // profitable just to reorder the subgraph that starts from the alternate
7952 // opcode vectorization node. Such nodes already end up with the shuffle
7953 // instruction, and it is enough to change this shuffle rather than
7954 // rotate the scalars for the whole graph.
7955 unsigned Cnt = 0;
7956 const TreeEntry *UserTE = TE.get();
7957 while (UserTE && Cnt < RecursionMaxDepth) {
7958 if (!UserTE->UserTreeIndex)
7959 break;
7960 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7961 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
7962 UserTE->UserTreeIndex.UserTE->Idx != 0)
7963 return;
7964 UserTE = UserTE->UserTreeIndex.UserTE;
7965 ++Cnt;
7966 }
7967 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
7968 if (!(TE->State == TreeEntry::Vectorize ||
7969 TE->State == TreeEntry::StridedVectorize ||
7970 TE->State == TreeEntry::SplitVectorize ||
7971 TE->State == TreeEntry::CompressVectorize) ||
7972 !TE->ReuseShuffleIndices.empty())
7973 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
7974 if (TE->State == TreeEntry::Vectorize &&
7975 TE->getOpcode() == Instruction::PHI)
7976 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
7977 }
7978 });
7979
7980 // Reorder the graph nodes according to their vectorization factor.
7981 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
7982 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
7983 auto It = VFToOrderedEntries.find(VF);
7984 if (It == VFToOrderedEntries.end())
7985 continue;
7986 // Try to find the most profitable order. We are just looking for the most
7987 // used order and reorder scalar elements in the nodes according to this
7988 // most used order.
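// Illustrative example (not from the original source): if most reorderable
// nodes with VF 4 prefer the order {1, 0, 3, 2}, that order is propagated to
// all VF-4 nodes below; nodes that cannot be physically reordered instead get
// their reuse/reorder masks adjusted to compensate.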
7989 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
7990 // Delete VF entry upon exit.
7991 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
7992
7993 // All operands are reordered and used only in this node - propagate the
7994 // most used order to the user node.
7995 MapVector<OrdersType, unsigned,
7996 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
7997 OrdersUses;
7998 for (const TreeEntry *OpTE : OrderedEntries) {
7999 // No need to reorder these nodes; still need to extend and to use a shuffle,
8000 // just need to merge the reordering shuffle and the reuse shuffle.
8001 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8002 OpTE->State != TreeEntry::SplitVectorize)
8003 continue;
8004 // Count number of orders uses.
8005 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8006 &PhisToOrders]() -> const OrdersType & {
8007 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8008 auto It = GathersToOrders.find(OpTE);
8009 if (It != GathersToOrders.end())
8010 return It->second;
8011 }
8012 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8013 auto It = AltShufflesToOrders.find(OpTE);
8014 if (It != AltShufflesToOrders.end())
8015 return It->second;
8016 }
8017 if (OpTE->State == TreeEntry::Vectorize &&
8018 OpTE->getOpcode() == Instruction::PHI) {
8019 auto It = PhisToOrders.find(OpTE);
8020 if (It != PhisToOrders.end())
8021 return It->second;
8022 }
8023 return OpTE->ReorderIndices;
8024 }();
8025 // First consider the order of the external scalar users.
8026 auto It = ExternalUserReorderMap.find(OpTE);
8027 if (It != ExternalUserReorderMap.end()) {
8028 const auto &ExternalUserReorderIndices = It->second;
8029 // If the OpTE vector factor != number of scalars - use natural order,
8030 // it is an attempt to reorder node with reused scalars but with
8031 // external uses.
8032 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8033 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8034 ExternalUserReorderIndices.size();
8035 } else {
8036 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8037 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8038 }
8039 // No other useful reorder data in this entry.
8040 if (Order.empty())
8041 continue;
8042 }
8043 // Stores actually store the mask, not the order, need to invert.
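// E.g. (illustrative), ReorderIndices {1, 2, 0} on a store node is a mask;
// inverting it yields the order {2, 0, 1}, which is what gets counted here.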
8044 if (OpTE->State == TreeEntry::Vectorize &&
8045 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8046 assert(!OpTE->isAltShuffle() &&
8047 "Alternate instructions are only supported by BinaryOperator "
8048 "and CastInst.");
8049 SmallVector<int> Mask;
8050 inversePermutation(Order, Mask);
8051 unsigned E = Order.size();
8052 OrdersType CurrentOrder(E, E);
8053 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8054 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8055 });
8056 fixupOrderingIndices(CurrentOrder);
8057 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8058 } else {
8059 ++OrdersUses.try_emplace(Order, 0).first->second;
8060 }
8061 }
8062 if (OrdersUses.empty())
8063 continue;
8064 // Choose the most used order.
8065 unsigned IdentityCnt = 0;
8066 unsigned FilledIdentityCnt = 0;
8067 OrdersType IdentityOrder(VF, VF);
8068 for (auto &Pair : OrdersUses) {
8069 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8070 if (!Pair.first.empty())
8071 FilledIdentityCnt += Pair.second;
8072 IdentityCnt += Pair.second;
8073 combineOrders(IdentityOrder, Pair.first);
8074 }
8075 }
8076 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8077 unsigned Cnt = IdentityCnt;
8078 for (auto &Pair : OrdersUses) {
8079 // Prefer the identity order. But if a filled identity (non-empty order) was
8080 // found with the same number of uses as the new candidate order, we can
8081 // choose this candidate order.
8082 if (Cnt < Pair.second ||
8083 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8084 Cnt == Pair.second && !BestOrder.empty() &&
8085 isIdentityOrder(BestOrder))) {
8086 combineOrders(Pair.first, BestOrder);
8087 BestOrder = Pair.first;
8088 Cnt = Pair.second;
8089 } else {
8090 combineOrders(BestOrder, Pair.first);
8091 }
8092 }
8093 // Set order of the user node.
8094 if (isIdentityOrder(BestOrder))
8095 continue;
8096 fixupOrderingIndices(BestOrder);
8097 SmallVector<int> Mask;
8098 inversePermutation(BestOrder, Mask);
8099 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8100 unsigned E = BestOrder.size();
8101 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8102 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8103 });
8104 // Do an actual reordering, if profitable.
8105 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8106 // Just do the reordering for the nodes with the given VF.
8107 if (TE->Scalars.size() != VF) {
8108 if (TE->ReuseShuffleIndices.size() == VF) {
8109 assert(TE->State != TreeEntry::SplitVectorize &&
8110 "Split vectorized not expected.");
8111 // Need to reorder the reuses masks of the operands with smaller VF to
8112 // be able to find the match between the graph nodes and scalar
8113 // operands of the given node during vectorization/cost estimation.
8114 assert(
8115 (!TE->UserTreeIndex ||
8116 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8117 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8118 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8119 "All users must be of VF size.");
8120 if (SLPReVec) {
8121 assert(SLPReVec && "Only supported by REVEC.");
8122 // ShuffleVectorInst does not do reorderOperands (and it should not
8123 // because ShuffleVectorInst supports only a limited set of
8124 // patterns). Only do reorderNodeWithReuses if the user is not
8125 // ShuffleVectorInst.
8126 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8127 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8128 continue;
8129 }
8130 // Update ordering of the operands with the smaller VF than the given
8131 // one.
8132 reorderNodeWithReuses(*TE, Mask);
8133 // Update orders in user split vectorize nodes.
8134 if (TE->UserTreeIndex &&
8135 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8136 TE->UserTreeIndex.UserTE->reorderSplitNode(
8137 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8138 }
8139 continue;
8140 }
8141 if ((TE->State == TreeEntry::SplitVectorize &&
8142 TE->ReuseShuffleIndices.empty()) ||
8143 ((TE->State == TreeEntry::Vectorize ||
8144 TE->State == TreeEntry::StridedVectorize ||
8145 TE->State == TreeEntry::CompressVectorize) &&
8146 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8147 InsertElementInst>(TE->getMainOp()) ||
8148 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8149 assert(
8150 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8151 TE->ReuseShuffleIndices.empty())) &&
8152 "Alternate instructions are only supported by BinaryOperator "
8153 "and CastInst.");
8154 // Build correct orders for extract{element,value}, loads,
8155 // stores and alternate (split) nodes.
8156 reorderOrder(TE->ReorderIndices, Mask);
8157 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8158 TE->reorderOperands(Mask);
8159 } else {
8160 // Reorder the node and its operands.
8161 TE->reorderOperands(Mask);
8162 assert(TE->ReorderIndices.empty() &&
8163 "Expected empty reorder sequence.");
8164 reorderScalars(TE->Scalars, Mask);
8165 }
8166 if (!TE->ReuseShuffleIndices.empty()) {
8167 // Apply reversed order to keep the original ordering of the reused
8168 // elements to avoid extra reorder indices shuffling.
8169 OrdersType CurrentOrder;
8170 reorderOrder(CurrentOrder, MaskOrder);
8171 SmallVector<int> NewReuses;
8172 inversePermutation(CurrentOrder, NewReuses);
8173 addMask(NewReuses, TE->ReuseShuffleIndices);
8174 TE->ReuseShuffleIndices.swap(NewReuses);
8175 } else if (TE->UserTreeIndex &&
8176 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8177 // Update orders in user split vectorize nodes.
8178 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8179 Mask, MaskOrder);
8180 }
8181 }
8182}
8183
8184void BoUpSLP::buildReorderableOperands(
8185 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8186 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8187 SmallVectorImpl<TreeEntry *> &GatherOps) {
8188 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8189 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8190 return OpData.first == I &&
8191 (OpData.second->State == TreeEntry::Vectorize ||
8192 OpData.second->State == TreeEntry::StridedVectorize ||
8193 OpData.second->State == TreeEntry::CompressVectorize ||
8194 OpData.second->State == TreeEntry::SplitVectorize);
8195 }))
8196 continue;
8197 // Do not request operands, if they do not exist.
8198 if (UserTE->hasState()) {
8199 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8200 UserTE->getOpcode() == Instruction::ExtractValue)
8201 continue;
8202 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8203 continue;
8204 if (UserTE->getOpcode() == Instruction::Store &&
8205 UserTE->State == TreeEntry::Vectorize && I == 1)
8206 continue;
8207 if (UserTE->getOpcode() == Instruction::Load &&
8208 (UserTE->State == TreeEntry::Vectorize ||
8209 UserTE->State == TreeEntry::StridedVectorize ||
8210 UserTE->State == TreeEntry::CompressVectorize))
8211 continue;
8212 }
8213 TreeEntry *TE = getOperandEntry(UserTE, I);
8214 assert(TE && "Expected operand entry.");
8215 if (!TE->isGather()) {
8216 // Add the node to the list of the ordered nodes with the identity
8217 // order.
8218 Edges.emplace_back(I, TE);
8219 // Add ScatterVectorize nodes to the list of operands, where just
8220 // reordering of the scalars is required. Similar to the gathers, so
8221 // simply add to the list of gathered ops.
8222 // If there are reused scalars, process this node as a regular vectorize
8223 // node, just reorder reuses mask.
8224 if (TE->State == TreeEntry::ScatterVectorize &&
8225 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8226 GatherOps.push_back(TE);
8227 continue;
8228 }
8229 if (ReorderableGathers.contains(TE))
8230 GatherOps.push_back(TE);
8231 }
8232}
8233
8234void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8235 struct TreeEntryCompare {
8236 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8237 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8238 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8239 return LHS->Idx < RHS->Idx;
8240 }
8241 };
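// Note (descriptive, not from the original source): the priority queue below
// pops the entry whose user node has the highest index first, so operands
// that share the same user node are dequeued together by the loop below.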
8242 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8243 DenseSet<const TreeEntry *> GathersToOrders;
8244 // Find all reorderable leaf nodes with the given VF.
8245 // Currently these are vectorized loads, extracts without alternate operands +
8246 // some gathering of extracts.
8247 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8248 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8249 if (TE->State != TreeEntry::Vectorize &&
8250 TE->State != TreeEntry::StridedVectorize &&
8251 TE->State != TreeEntry::CompressVectorize &&
8252 TE->State != TreeEntry::SplitVectorize)
8253 NonVectorized.insert(TE.get());
8254 if (std::optional<OrdersType> CurrentOrder =
8255 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8256 Queue.push(TE.get());
8257 if (!(TE->State == TreeEntry::Vectorize ||
8258 TE->State == TreeEntry::StridedVectorize ||
8259 TE->State == TreeEntry::CompressVectorize ||
8260 TE->State == TreeEntry::SplitVectorize) ||
8261 !TE->ReuseShuffleIndices.empty())
8262 GathersToOrders.insert(TE.get());
8263 }
8264 }
8265
8266 // 1. Propagate order to the graph nodes, which use only reordered nodes.
8267 // I.e., if the node has operands that are reordered, try to make at least
8268 // one operand order natural and reorder the others + reorder the
8269 // user node itself.
8270 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8271 while (!Queue.empty()) {
8272 // 1. Filter out only reordered nodes.
8273 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8274 TreeEntry *TE = Queue.top();
8275 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8276 Queue.pop();
8277 SmallVector<TreeEntry *> OrderedOps(1, TE);
8278 while (!Queue.empty()) {
8279 TE = Queue.top();
8280 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8281 break;
8282 Queue.pop();
8283 OrderedOps.push_back(TE);
8284 }
8285 for (TreeEntry *TE : OrderedOps) {
8286 if (!(TE->State == TreeEntry::Vectorize ||
8287 TE->State == TreeEntry::StridedVectorize ||
8288 TE->State == TreeEntry::CompressVectorize ||
8289 TE->State == TreeEntry::SplitVectorize ||
8290 (TE->isGather() && GathersToOrders.contains(TE))) ||
8291 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8292 !Visited.insert(TE).second)
8293 continue;
8294 // Build a map between user nodes and their operands order to speed up the
8295 // search. The graph currently does not provide this dependency directly.
8296 Users.first = TE->UserTreeIndex.UserTE;
8297 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8298 }
8299 if (Users.first) {
8300 auto &Data = Users;
8301 if (Data.first->State == TreeEntry::SplitVectorize) {
8302 assert(
8303 Data.second.size() <= 2 &&
8304 "Expected not greater than 2 operands for split vectorize node.");
8305 if (any_of(Data.second,
8306 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8307 continue;
8308 // Update orders in user split vectorize nodes.
8309 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8310 "Expected exactly 2 entries.");
8311 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8312 TreeEntry &OpTE = *VectorizableTree[P.first];
8313 OrdersType Order = OpTE.ReorderIndices;
8314 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8315 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8316 continue;
8317 const auto BestOrder =
8318 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8319 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8320 continue;
8321 Order = *BestOrder;
8322 }
8323 fixupOrderingIndices(Order);
8324 SmallVector<int> Mask;
8325 inversePermutation(Order, Mask);
8326 const unsigned E = Order.size();
8327 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8328 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8329 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8330 });
8331 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8332 // Clear ordering of the operand.
8333 if (!OpTE.ReorderIndices.empty()) {
8334 OpTE.ReorderIndices.clear();
8335 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8336 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8337 } else {
8338 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8339 reorderScalars(OpTE.Scalars, Mask);
8340 }
8341 }
8342 if (Data.first->ReuseShuffleIndices.empty() &&
8343 !Data.first->ReorderIndices.empty()) {
8344 // Insert user node to the list to try to sink reordering deeper in
8345 // the graph.
8346 Queue.push(Data.first);
8347 }
8348 continue;
8349 }
8350 // Check that operands are used only in the User node.
8351 SmallVector<TreeEntry *> GatherOps;
8352 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8353 GatherOps);
8354 // All operands are reordered and used only in this node - propagate the
8355 // most used order to the user node.
8356 MapVector<OrdersType, unsigned,
8357 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8358 OrdersUses;
8359 // Do the analysis for each tree entry only once, otherwise the order of
8360 // the same node may be considered several times, though it might not be
8361 // profitable.
8362 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8363 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8364 for (const auto &Op : Data.second) {
8365 TreeEntry *OpTE = Op.second;
8366 if (!VisitedOps.insert(OpTE).second)
8367 continue;
8368 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8369 continue;
8370 const auto Order = [&]() -> const OrdersType {
8371 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8372 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8373 IgnoreReorder)
8374 .value_or(OrdersType(1));
8375 return OpTE->ReorderIndices;
8376 }();
8377 // The order is partially ordered, skip it in favor of fully non-ordered
8378 // orders.
8379 if (Order.size() == 1)
8380 continue;
8381
8382 // Check that the reordering does not increase the number of shuffles, i.e.
8383 // same-values nodes have the same parents or their parents have the same parents.
8384 if (!Order.empty() && !isIdentityOrder(Order)) {
8385 Value *Root = OpTE->hasState()
8386 ? OpTE->getMainOp()
8387 : *find_if_not(OpTE->Scalars, isConstant);
8388 auto GetSameNodesUsers = [&](Value *Root) {
8389 SmallSetVector<TreeEntry *, 4> Res;
8390 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8391 if (TE != OpTE && TE->UserTreeIndex &&
8392 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8393 TE->Scalars.size() == OpTE->Scalars.size() &&
8394 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8395 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8396 Res.insert(TE->UserTreeIndex.UserTE);
8397 }
8398 for (const TreeEntry *TE : getTreeEntries(Root)) {
8399 if (TE != OpTE && TE->UserTreeIndex &&
8400 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8401 TE->Scalars.size() == OpTE->Scalars.size() &&
8402 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8403 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8404 Res.insert(TE->UserTreeIndex.UserTE);
8405 }
8406 return Res.takeVector();
8407 };
8408 auto GetNumOperands = [](const TreeEntry *TE) {
8409 if (TE->State == TreeEntry::SplitVectorize)
8410 return TE->getNumOperands();
8411 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8412 return CI->arg_size();
8413 return TE->getNumOperands();
8414 };
8415 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8416 const TreeEntry *TE) {
8417 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8418 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8419 ID = getVectorIntrinsicIDForCall(CI, TLI);
8420 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8421 if (ID != Intrinsic::not_intrinsic &&
8422 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8423 continue;
8424 const TreeEntry *Op = getOperandEntry(TE, Idx);
8425 if (Op->isGather() && Op->hasState()) {
8426 const TreeEntry *VecOp =
8427 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8428 if (VecOp)
8429 Op = VecOp;
8430 }
8431 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8432 return false;
8433 }
8434 return true;
8435 };
8436 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8437 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8438 if (!RevisitedOps.insert(UTE).second)
8439 return false;
8440 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8441 !UTE->ReuseShuffleIndices.empty() ||
8442 (UTE->UserTreeIndex &&
8443 UTE->UserTreeIndex.UserTE == Data.first) ||
8444 (Data.first->UserTreeIndex &&
8445 Data.first->UserTreeIndex.UserTE == UTE) ||
8446 (IgnoreReorder && UTE->UserTreeIndex &&
8447 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8448 NodeShouldBeReorderedWithOperands(UTE);
8449 }))
8450 continue;
8451 for (TreeEntry *UTE : Users) {
8452 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8453 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8454 ID = getVectorIntrinsicIDForCall(CI, TLI);
8455 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8456 if (ID != Intrinsic::not_intrinsic &&
8457 isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI))
8458 continue;
8459 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8460 Visited.erase(Op);
8461 Queue.push(const_cast<TreeEntry *>(Op));
8462 }
8463 }
8464 }
8465 unsigned NumOps = count_if(
8466 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8467 return P.second == OpTE;
8468 });
8469 // Stores actually store the mask, not the order, need to invert.
8470 if (OpTE->State == TreeEntry::Vectorize &&
8471 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8472 assert(!OpTE->isAltShuffle() &&
8473 "Alternate instructions are only supported by BinaryOperator "
8474 "and CastInst.");
8475 SmallVector<int> Mask;
8476 inversePermutation(Order, Mask);
8477 unsigned E = Order.size();
8478 OrdersType CurrentOrder(E, E);
8479 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8480 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8481 });
8482 fixupOrderingIndices(CurrentOrder);
8483 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8484 } else {
8485 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8486 }
8487 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8488 const auto AllowsReordering = [&](const TreeEntry *TE) {
8489 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8490 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8491 (IgnoreReorder && TE->Idx == 0))
8492 return true;
8493 if (TE->isGather()) {
8494 if (GathersToOrders.contains(TE))
8495 return !getReorderingData(*TE, /*TopToBottom=*/false,
8496 IgnoreReorder)
8497 .value_or(OrdersType(1))
8498 .empty();
8499 return true;
8500 }
8501 return false;
8502 };
8503 if (OpTE->UserTreeIndex) {
8504 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8505 if (!VisitedUsers.insert(UserTE).second)
8506 continue;
8507 // May reorder user node if it requires reordering, has reused
8508 // scalars, is an alternate op vectorize node or its op nodes require
8509 // reordering.
8510 if (AllowsReordering(UserTE))
8511 continue;
8512 // Check if users allow reordering.
8513 // Currently look up just 1 level of operands to avoid increase of
8514 // the compile time.
8515 // Profitable to reorder if definitely more operands allow
8516 // reordering rather than those with natural order.
8517 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
8518 if (static_cast<unsigned>(count_if(
8519 Ops, [UserTE, &AllowsReordering](
8520 const std::pair<unsigned, TreeEntry *> &Op) {
8521 return AllowsReordering(Op.second) &&
8522 Op.second->UserTreeIndex.UserTE == UserTE;
8523 })) <= Ops.size() / 2)
8524 ++Res.first->second;
8525 }
8526 }
8527 if (OrdersUses.empty()) {
8528 Visited.insert_range(llvm::make_second_range(Data.second));
8529 continue;
8530 }
8531 // Choose the most used order.
8532 unsigned IdentityCnt = 0;
8533 unsigned VF = Data.second.front().second->getVectorFactor();
8534 OrdersType IdentityOrder(VF, VF);
8535 for (auto &Pair : OrdersUses) {
8536 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8537 IdentityCnt += Pair.second;
8538 combineOrders(IdentityOrder, Pair.first);
8539 }
8540 }
8541 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8542 unsigned Cnt = IdentityCnt;
8543 for (auto &Pair : OrdersUses) {
8544 // Prefer the identity order. But if a filled identity (non-empty
8545 // order) was found with the same number of uses as the new candidate order,
8546 // we can choose this candidate order.
8547 if (Cnt < Pair.second) {
8548 combineOrders(Pair.first, BestOrder);
8549 BestOrder = Pair.first;
8550 Cnt = Pair.second;
8551 } else {
8552 combineOrders(BestOrder, Pair.first);
8553 }
8554 }
8555 // Set order of the user node.
8556 if (isIdentityOrder(BestOrder)) {
8557 Visited.insert_range(llvm::make_second_range(Data.second));
8558 continue;
8559 }
8560 fixupOrderingIndices(BestOrder);
8561 // Erase operands from OrderedEntries list and adjust their orders.
8562 VisitedOps.clear();
8563 SmallVector<int> Mask;
8564 inversePermutation(BestOrder, Mask);
8565 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8566 unsigned E = BestOrder.size();
8567 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8568 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8569 });
8570 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8571 TreeEntry *TE = Op.second;
8572 if (!VisitedOps.insert(TE).second)
8573 continue;
8574 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8575 reorderNodeWithReuses(*TE, Mask);
8576 continue;
8577 }
8578 // Gathers are processed separately.
8579 if (TE->State != TreeEntry::Vectorize &&
8580 TE->State != TreeEntry::StridedVectorize &&
8581 TE->State != TreeEntry::CompressVectorize &&
8582 TE->State != TreeEntry::SplitVectorize &&
8583 (TE->State != TreeEntry::ScatterVectorize ||
8584 TE->ReorderIndices.empty()))
8585 continue;
8586 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8587 TE->ReorderIndices.empty()) &&
8588 "Non-matching sizes of user/operand entries.");
8589 reorderOrder(TE->ReorderIndices, Mask);
8590 if (IgnoreReorder && TE == VectorizableTree.front().get())
8591 IgnoreReorder = false;
8592 }
8593 // For gathers just need to reorder its scalars.
8594 for (TreeEntry *Gather : GatherOps) {
8595 assert(Gather->ReorderIndices.empty() &&
8596 "Unexpected reordering of gathers.");
8597 if (!Gather->ReuseShuffleIndices.empty()) {
8598 // Just reorder reuses indices.
8599 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8600 continue;
8601 }
8602 reorderScalars(Gather->Scalars, Mask);
8603 Visited.insert(Gather);
8604 }
8605 // Reorder operands of the user node and set the ordering for the user
8606 // node itself.
8607 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8608 return TE.isAltShuffle() &&
8609 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8610 TE.ReorderIndices.empty());
8611 };
8612 if (Data.first->State != TreeEntry::Vectorize ||
8613 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8614 Data.first->getMainOp()) ||
8615 IsNotProfitableAltCodeNode(*Data.first))
8616 Data.first->reorderOperands(Mask);
8617 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8618 IsNotProfitableAltCodeNode(*Data.first) ||
8619 Data.first->State == TreeEntry::StridedVectorize ||
8620 Data.first->State == TreeEntry::CompressVectorize) {
8621 reorderScalars(Data.first->Scalars, Mask);
8622 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8623 /*BottomOrder=*/true);
8624 if (Data.first->ReuseShuffleIndices.empty() &&
8625 !Data.first->ReorderIndices.empty() &&
8626 !IsNotProfitableAltCodeNode(*Data.first)) {
8627 // Insert user node to the list to try to sink reordering deeper in
8628 // the graph.
8629 Queue.push(Data.first);
8630 }
8631 } else {
8632 reorderOrder(Data.first->ReorderIndices, Mask);
8633 }
8634 }
8635 }
8636 // If the reordering is unnecessary, just remove the reorder.
8637 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8638 VectorizableTree.front()->ReuseShuffleIndices.empty())
8639 VectorizableTree.front()->ReorderIndices.clear();
8640}
8641
8642Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8643 if (Entry.hasState() &&
8644 (Entry.getOpcode() == Instruction::Store ||
8645 Entry.getOpcode() == Instruction::Load) &&
8646 Entry.State == TreeEntry::StridedVectorize &&
8647 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8648 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8649 return dyn_cast<Instruction>(Entry.Scalars.front());
8650}
8651
8652 void BoUpSLP::buildExternalUses(
8653 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8654 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8655 DenseMap<Value *, unsigned> ScalarToExtUses;
8656 SmallPtrSet<Value *, 4> ExternalUsers;
8657 // Collect the values that we need to extract from the tree.
8658 for (auto &TEPtr : VectorizableTree) {
8659 TreeEntry *Entry = TEPtr.get();
8660
8661 // No need to handle users of gathered values.
8662 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8663 continue;
8664
8665 // For each lane:
8666 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8667 Value *Scalar = Entry->Scalars[Lane];
8668 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8669 continue;
8670
8671 // All uses must be replaced already? No need to do it again.
8672 auto It = ScalarToExtUses.find(Scalar);
8673 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8674 continue;
8675
8676 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8677 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8678 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8679 << " from " << *Scalar << "for many users.\n");
8680 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8681 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8682 ExternalUsesWithNonUsers.insert(Scalar);
8683 continue;
8684 }
8685
8686 // Check if the scalar is externally used as an extra arg.
8687 const auto ExtI = ExternallyUsedValues.find(Scalar);
8688 if (ExtI != ExternallyUsedValues.end()) {
8689 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8690 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8691 << FoundLane << " from " << *Scalar << ".\n");
8692 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8693 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8694 continue;
8695 }
8696 for (User *U : Scalar->users()) {
8697 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8698
8699 Instruction *UserInst = dyn_cast<Instruction>(U);
8700 if (!UserInst || isDeleted(UserInst))
8701 continue;
8702
8703 // Ignore users in the user ignore list.
8704 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8705 continue;
8706
8707 // Skip in-tree scalars that become vectors
8708 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8709 !UseEntries.empty()) {
8710 // Some in-tree scalars will remain as scalar in vectorized
8711 // instructions. If that is the case, the one in FoundLane will
8712 // be used.
8713 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8714 isa<LoadInst, StoreInst>(UserInst)) ||
8715 isa<CallInst>(UserInst)) ||
8716 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8717 return UseEntry->State == TreeEntry::ScatterVectorize ||
8718 !doesInTreeUserNeedToExtract(
8719 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8720 TTI);
8721 })) {
8722 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8723 << ".\n");
8724 assert(none_of(UseEntries,
8725 [](TreeEntry *UseEntry) {
8726 return UseEntry->isGather();
8727 }) &&
8728 "Bad state");
8729 continue;
8730 }
8731 U = nullptr;
8732 if (It != ScalarToExtUses.end()) {
8733 ExternalUses[It->second].User = nullptr;
8734 break;
8735 }
8736 }
8737
8738 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8739 U = nullptr;
8740 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8741 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8742 << " from lane " << FoundLane << " from " << *Scalar
8743 << ".\n");
8744 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8745 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8746 ExternalUsesWithNonUsers.insert(Scalar);
8747 if (!U)
8748 break;
8749 }
8750 }
8751 }
8752}
8753
8754 SmallVector<SmallVector<StoreInst *>>
8755 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8756 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
8757 SmallVector<StoreInst *>, 8>
8758 PtrToStoresMap;
8759 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8760 Value *V = TE->Scalars[Lane];
8761 // Don't iterate over the users of constant data.
8762 if (!isa<Instruction>(V))
8763 continue;
8764 // To save compilation time we don't visit if we have too many users.
8765 if (V->hasNUsesOrMore(UsesLimit))
8766 break;
8767
8768 // Collect stores per pointer object.
8769 for (User *U : V->users()) {
8770 auto *SI = dyn_cast<StoreInst>(U);
8771 // Test whether we can handle the store. V might be a global, which could
8772 // be used in a different function.
8773 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8774 !isValidElementType(SI->getValueOperand()->getType()))
8775 continue;
8776 // Skip entry if already vectorized.
8777 if (isVectorized(U))
8778 continue;
8779
8780 Value *Ptr =
8781 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8782 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8783 SI->getValueOperand()->getType(), Ptr}];
8784 // For now just keep one store per pointer object per lane.
8785 // TODO: Extend this to support multiple stores per pointer per lane
8786 if (StoresVec.size() > Lane)
8787 continue;
8788 if (!StoresVec.empty()) {
8789 std::optional<int64_t> Diff = getPointersDiff(
8790 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8791 SI->getValueOperand()->getType(),
8792 StoresVec.front()->getPointerOperand(), *DL, *SE,
8793 /*StrictCheck=*/true);
8794 // We failed to compare the pointers so just abandon this store.
8795 if (!Diff)
8796 continue;
8797 }
8798 StoresVec.push_back(SI);
8799 }
8800 }
8801 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8802 unsigned I = 0;
8803 for (auto &P : PtrToStoresMap) {
8804 Res[I].swap(P.second);
8805 ++I;
8806 }
8807 return Res;
8808}
8809
8810bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8811 OrdersType &ReorderIndices) const {
8812 // We check whether the stores in StoresVec can form a vector by sorting them
8813 // and checking whether they are consecutive.
8814
8815 // To avoid calling getPointersDiff() while sorting we create a vector of
8816 // pairs {store, offset from first} and sort this instead.
8817 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8818 StoreInst *S0 = StoresVec[0];
8819 StoreOffsetVec.emplace_back(0, 0);
8820 Type *S0Ty = S0->getValueOperand()->getType();
8821 Value *S0Ptr = S0->getPointerOperand();
8822 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8823 StoreInst *SI = StoresVec[Idx];
8824 std::optional<int64_t> Diff =
8825 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8826 SI->getPointerOperand(), *DL, *SE,
8827 /*StrictCheck=*/true);
8828 StoreOffsetVec.emplace_back(*Diff, Idx);
8829 }
8830
8831 // Check if the stores are consecutive by checking if their difference is 1.
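// E.g. (illustrative), sorted offsets {0, 1, 2, 3} pass this check, while
// {0, 2, 3, 4} fail because of the gap between 0 and 2.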
8832 if (StoreOffsetVec.size() != StoresVec.size())
8833 return false;
8834 sort(StoreOffsetVec, llvm::less_first());
8835 unsigned Idx = 0;
8836 int64_t PrevDist = 0;
8837 for (const auto &P : StoreOffsetVec) {
8838 if (Idx > 0 && P.first != PrevDist + 1)
8839 return false;
8840 PrevDist = P.first;
8841 ++Idx;
8842 }
8843
8844 // Calculate the shuffle indices according to their offset against the sorted
8845 // StoreOffsetVec.
8846 ReorderIndices.assign(StoresVec.size(), 0);
8847 bool IsIdentity = true;
8848 for (auto [I, P] : enumerate(StoreOffsetVec)) {
8849 ReorderIndices[P.second] = I;
8850 IsIdentity &= P.second == I;
8851 }
8852 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8853 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8854 // same convention here.
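// Illustrative example (not from the original source): stores whose offsets in
// StoresVec order are {1, 0, 3, 2} produce ReorderIndices {1, 0, 3, 2}, while
// offsets already in program order produce an empty (identity) ReorderIndices.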
8855 if (IsIdentity)
8856 ReorderIndices.clear();
8857
8858 return true;
8859}
8860
8861#ifndef NDEBUG
8862 static void dumpOrder(const BoUpSLP::OrdersType &Order) {
8863 for (unsigned Idx : Order)
8864 dbgs() << Idx << ", ";
8865 dbgs() << "\n";
8866}
8867#endif
8868
8869 SmallVector<BoUpSLP::OrdersType, 1>
8870 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8871 unsigned NumLanes = TE->Scalars.size();
8872
8873 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8874
8875 // Holds the reorder indices for each candidate store vector that is a user of
8876 // the current TreeEntry.
8877 SmallVector<OrdersType, 1> ExternalReorderIndices;
8878
8879 // Now inspect the stores collected per pointer and look for vectorization
8880 // candidates. For each candidate calculate the reorder index vector and push
8881 // it into `ExternalReorderIndices`
8882 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8883 // If we have fewer than NumLanes stores, then we can't form a vector.
8884 if (StoresVec.size() != NumLanes)
8885 continue;
8886
8887 // If the stores are not consecutive then abandon this StoresVec.
8888 OrdersType ReorderIndices;
8889 if (!canFormVector(StoresVec, ReorderIndices))
8890 continue;
8891
8892 // We now know that the scalars in StoresVec can form a vector instruction,
8893 // so set the reorder indices.
8894 ExternalReorderIndices.push_back(ReorderIndices);
8895 }
8896 return ExternalReorderIndices;
8897}
8898
8899 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8900 const SmallDenseSet<Value *> &UserIgnoreLst) {
8901 deleteTree();
8902 UserIgnoreList = &UserIgnoreLst;
8903 if (!allSameType(Roots))
8904 return;
8905 buildTreeRec(Roots, 0, EdgeInfo());
8906}
8907
8908 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8909 deleteTree();
8910 if (!allSameType(Roots))
8911 return;
8912 buildTreeRec(Roots, 0, EdgeInfo());
8913}
8914
8915 /// Tries to find a subvector of loads and builds a new vector of only loads if
8916 /// it can be profitable.
8917 static void gatherPossiblyVectorizableLoads(
8918 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8919 ScalarEvolution &SE, const TargetTransformInfo &TTI,
8920 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8921 bool AddNew = true) {
8922 if (VL.empty())
8923 return;
8924 Type *ScalarTy = getValueType(VL.front());
8925 if (!isValidElementType(ScalarTy))
8926 return;
8927 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
8928 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8929 for (Value *V : VL) {
8930 auto *LI = dyn_cast<LoadInst>(V);
8931 if (!LI)
8932 continue;
8933 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
8934 continue;
8935 bool IsFound = false;
8936 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
8937 assert(LI->getParent() == Data.front().first->getParent() &&
8938 LI->getType() == Data.front().first->getType() &&
8939 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8940 getUnderlyingObject(Data.front().first->getPointerOperand(),
8942 "Expected loads with the same type, same parent and same "
8943 "underlying pointer.");
8944 std::optional<int64_t> Dist = getPointersDiff(
8945 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
8946 Data.front().first->getPointerOperand(), DL, SE,
8947 /*StrictCheck=*/true);
8948 if (!Dist)
8949 continue;
8950 auto It = Map.find(*Dist);
8951 if (It != Map.end() && It->second != LI)
8952 continue;
8953 if (It == Map.end()) {
8954 Data.emplace_back(LI, *Dist);
8955 Map.try_emplace(*Dist, LI);
8956 }
8957 IsFound = true;
8958 break;
8959 }
8960 if (!IsFound) {
8961 ClusteredLoads.emplace_back().emplace_back(LI, 0);
8962 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
8963 }
8964 }
8965 auto FindMatchingLoads =
8966 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
8967 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
8968 &GatheredLoads,
8969 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
8970 int64_t &Offset, unsigned &Start) {
8971 if (Loads.empty())
8972 return GatheredLoads.end();
8973 LoadInst *LI = Loads.front().first;
8974 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
8975 if (Idx < Start)
8976 continue;
8977 ToAdd.clear();
8978 if (LI->getParent() != Data.front().first->getParent() ||
8979 LI->getType() != Data.front().first->getType())
8980 continue;
8981 std::optional<int64_t> Dist =
8982 getPointersDiff(LI->getType(), LI->getPointerOperand(),
8983 Data.front().first->getType(),
8984 Data.front().first->getPointerOperand(), DL, SE,
8985 /*StrictCheck=*/true);
8986 if (!Dist)
8987 continue;
8988 SmallSet<int64_t, 4> DataDists;
8989 SmallPtrSet<LoadInst *, 4> DataLoads;
8990 for (std::pair<LoadInst *, int64_t> P : Data) {
8991 DataDists.insert(P.second);
8992 DataLoads.insert(P.first);
8993 }
8994 // Found matching gathered loads - check if all loads are unique or
8995 // can be effectively vectorized.
8996 unsigned NumUniques = 0;
8997 for (auto [Cnt, Pair] : enumerate(Loads)) {
8998 bool Used = DataLoads.contains(Pair.first);
8999 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9000 ++NumUniques;
9001 ToAdd.insert(Cnt);
9002 } else if (Used) {
9003 Repeated.insert(Cnt);
9004 }
9005 }
9006 if (NumUniques > 0 &&
9007 (Loads.size() == NumUniques ||
9008 (Loads.size() - NumUniques >= 2 &&
9009 Loads.size() - NumUniques >= Loads.size() / 2 &&
9010 (has_single_bit(Data.size() + NumUniques) ||
9011 bit_ceil(Data.size()) <
9012 bit_ceil(Data.size() + NumUniques))))) {
9013 Offset = *Dist;
9014 Start = Idx + 1;
9015 return std::next(GatheredLoads.begin(), Idx);
9016 }
9017 }
9018 ToAdd.clear();
9019 return GatheredLoads.end();
9020 };
9021 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9022 unsigned Start = 0;
9023 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9024 int64_t Offset = 0;
9025 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9026 Offset, Start);
9027 while (It != GatheredLoads.end()) {
9028 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9029 for (unsigned Idx : LocalToAdd)
9030 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9031 ToAdd.insert_range(LocalToAdd);
9032 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9033 Start);
9034 }
9035 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9036 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9037 })) {
9038 auto AddNewLoads =
9039 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9040 for (unsigned Idx : seq<unsigned>(Data.size())) {
9041 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9042 continue;
9043 Loads.push_back(Data[Idx]);
9044 }
9045 };
9046 if (!AddNew) {
9047 LoadInst *LI = Data.front().first;
9048 It = find_if(
9049 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9050 return PD.front().first->getParent() == LI->getParent() &&
9051 PD.front().first->getType() == LI->getType();
9052 });
9053 while (It != GatheredLoads.end()) {
9054 AddNewLoads(*It);
9055 It = std::find_if(
9056 std::next(It), GatheredLoads.end(),
9057 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9058 return PD.front().first->getParent() == LI->getParent() &&
9059 PD.front().first->getType() == LI->getType();
9060 });
9061 }
9062 }
9063 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9064 AddNewLoads(GatheredLoads.emplace_back());
9065 }
9066 }
9067}
9068
9069void BoUpSLP::tryToVectorizeGatheredLoads(
9070 const SmallMapVector<
9071 std::tuple<BasicBlock *, Value *, Type *>,
9072 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9073 &GatheredLoads) {
9074 GatheredLoadsEntriesFirst = VectorizableTree.size();
9075
9076 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9077 LoadEntriesToVectorize.size());
9078 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9079 Set.insert_range(VectorizableTree[Idx]->Scalars);
9080
9081 // Sort loads by distance.
9082 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9083 const std::pair<LoadInst *, int64_t> &L2) {
9084 return L1.second > L2.second;
9085 };
9086
9087 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9088 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9089 Loads.size());
9090 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9091 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9092 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9093 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9094 };
9095
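// Attempts to vectorize slices of the given loads for decreasing candidate
// VFs, recording the vectorized ranges (including masked-gather candidates)
// and collecting the loads that remain scalar in NonVectorized.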
9096 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9097 BoUpSLP::ValueSet &VectorizedLoads,
9098 SmallVectorImpl<LoadInst *> &NonVectorized,
9099 bool Final, unsigned MaxVF) {
9100 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9101 unsigned StartIdx = 0;
9102 SmallVector<int> CandidateVFs;
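// Collect candidate VFs starting from the widest full-register factor and
// going down; optionally include non-power-of-2 sizes.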
9103 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9104 CandidateVFs.push_back(MaxVF);
9105 for (int NumElts = getFloorFullVectorNumberOfElements(
9106 *TTI, Loads.front()->getType(), MaxVF);
9107 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9108 *TTI, Loads.front()->getType(), NumElts - 1)) {
9109 CandidateVFs.push_back(NumElts);
9110 if (VectorizeNonPowerOf2 && NumElts > 2)
9111 CandidateVFs.push_back(NumElts - 1);
9112 }
9113
9114 if (Final && CandidateVFs.empty())
9115 return Results;
9116
9117 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9118 for (unsigned NumElts : CandidateVFs) {
9119 if (Final && NumElts > BestVF)
9120 continue;
9121 SmallVector<unsigned> MaskedGatherVectorized;
9122 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9123 ++Cnt) {
9124 ArrayRef<LoadInst *> Slice =
9125 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9126 if (VectorizedLoads.count(Slice.front()) ||
9127 VectorizedLoads.count(Slice.back()) ||
9128 areKnownNonVectorizableLoads(Slice))
9129 continue;
9130 // Check if it is profitable to try vectorizing gathered loads. It is
9131 // profitable if we have more than 3 consecutive loads or if we have
9132 // fewer but all users are vectorized or deleted.
9133 bool AllowToVectorize = false;
9134 // Check if it is profitable to vectorize 2-elements loads.
9135 if (NumElts == 2) {
9136 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9137 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9138 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9139 for (LoadInst *LI : Slice) {
9140 // If single use/user - allow to vectorize.
9141 if (LI->hasOneUse())
9142 continue;
9143 // 1. Check if number of uses equals number of users.
9144 // 2. All users are deleted.
9145 // 3. The load broadcasts are not allowed or the load is not
9146 // broadcasted.
9147 if (static_cast<unsigned int>(std::distance(
9148 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9149 return false;
9150 if (!IsLegalBroadcastLoad)
9151 continue;
9152 if (LI->hasNUsesOrMore(UsesLimit))
9153 return false;
9154 for (User *U : LI->users()) {
9155 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9156 continue;
9157 for (const TreeEntry *UTE : getTreeEntries(U)) {
9158 for (int I : seq<int>(UTE->getNumOperands())) {
9159 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9160 return V == LI || isa<PoisonValue>(V);
9161 }))
9162 // Found legal broadcast - do not vectorize.
9163 return false;
9164 }
9165 }
9166 }
9167 }
9168 return true;
9169 };
9170 AllowToVectorize = CheckIfAllowed(Slice);
9171 } else {
9172 AllowToVectorize =
9173 (NumElts >= 3 ||
9174 any_of(ValueToGatherNodes.at(Slice.front()),
9175 [=](const TreeEntry *TE) {
9176 return TE->Scalars.size() == 2 &&
9177 ((TE->Scalars.front() == Slice.front() &&
9178 TE->Scalars.back() == Slice.back()) ||
9179 (TE->Scalars.front() == Slice.back() &&
9180 TE->Scalars.back() == Slice.front()));
9181 })) &&
9182 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9183 Slice.size());
9184 }
9185 if (AllowToVectorize) {
9186 SmallVector<Value *> PointerOps;
9187 OrdersType CurrentOrder;
9188 // Try to build vector load.
9189 ArrayRef<Value *> Values(
9190 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9191 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9192 PointerOps, &BestVF);
9193 if (LS != LoadsState::Gather ||
9194 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9195 if (LS == LoadsState::ScatterVectorize) {
9196 if (MaskedGatherVectorized.empty() ||
9197 Cnt >= MaskedGatherVectorized.back() + NumElts)
9198 MaskedGatherVectorized.push_back(Cnt);
9199 continue;
9200 }
9201 if (LS != LoadsState::Gather) {
9202 Results.emplace_back(Values, LS);
9203 VectorizedLoads.insert_range(Slice);
9204 // If we vectorized initial block, no need to try to vectorize it
9205 // again.
9206 if (Cnt == StartIdx)
9207 StartIdx += NumElts;
9208 }
9209 // Check if the whole array was vectorized already - exit.
9210 if (StartIdx >= Loads.size())
9211 break;
9212 // Erase last masked gather candidate, if another candidate within
9213 // the range is found to be better.
9214 if (!MaskedGatherVectorized.empty() &&
9215 Cnt < MaskedGatherVectorized.back() + NumElts)
9216 MaskedGatherVectorized.pop_back();
9217 Cnt += NumElts - 1;
9218 continue;
9219 }
9220 }
9221 if (!AllowToVectorize || BestVF == 0)
9222 registerNonVectorizableLoads(Slice);
9223 }
9224 // Mark masked gathers candidates as vectorized, if any.
9225 for (unsigned Cnt : MaskedGatherVectorized) {
9226 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9227 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9228 ArrayRef<Value *> Values(
9229 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9230 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9231 VectorizedLoads.insert_range(Slice);
9232 // If we vectorized initial block, no need to try to vectorize it again.
9233 if (Cnt == StartIdx)
9234 StartIdx += NumElts;
9235 }
9236 }
9237 for (LoadInst *LI : Loads) {
9238 if (!VectorizedLoads.contains(LI))
9239 NonVectorized.push_back(LI);
9240 }
9241 return Results;
9242 };
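// Processes each group of related loads: sorts them by distance, keeps the
// longest consecutive runs, tries to vectorize them (also checking the
// original unsorted order) and detects interleaved patterns.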
9243 auto ProcessGatheredLoads =
9244 [&, &TTI = *TTI](
9245 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9246 bool Final = false) {
9247 SmallVector<LoadInst *> NonVectorized;
9248 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9249 GatheredLoads) {
9250 if (LoadsDists.size() <= 1) {
9251 NonVectorized.push_back(LoadsDists.back().first);
9252 continue;
9253 }
9254 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9255 LoadsDists);
9256 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9257 stable_sort(LocalLoadsDists, LoadSorter);
9258 SmallVector<LoadInst *> Loads;
9259 unsigned MaxConsecutiveDistance = 0;
9260 unsigned CurrentConsecutiveDist = 1;
9261 int64_t LastDist = LocalLoadsDists.front().second;
9262 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9263 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9264 if (isVectorized(L.first))
9265 continue;
9266 assert(LastDist >= L.second &&
9267 "Expected first distance always not less than second");
9268 if (static_cast<uint64_t>(LastDist - L.second) ==
9269 CurrentConsecutiveDist) {
9270 ++CurrentConsecutiveDist;
9271 MaxConsecutiveDistance =
9272 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9273 Loads.push_back(L.first);
9274 continue;
9275 }
9276 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9277 !Loads.empty())
9278 Loads.pop_back();
9279 CurrentConsecutiveDist = 1;
9280 LastDist = L.second;
9281 Loads.push_back(L.first);
9282 }
9283 if (Loads.size() <= 1)
9284 continue;
9285 if (AllowMaskedGather)
9286 MaxConsecutiveDistance = Loads.size();
9287 else if (MaxConsecutiveDistance < 2)
9288 continue;
9289 BoUpSLP::ValueSet VectorizedLoads;
9290 SmallVector<LoadInst *> SortedNonVectorized;
9291 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9292 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9293 Final, MaxConsecutiveDistance);
9294 if (!Results.empty() && !SortedNonVectorized.empty() &&
9295 OriginalLoads.size() == Loads.size() &&
9296 MaxConsecutiveDistance == Loads.size() &&
9297 any_of(Results,
9298 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9299 return P.second == LoadsState::ScatterVectorize;
9300 })) {
9301 VectorizedLoads.clear();
9302 SmallVector<LoadInst *> UnsortedNonVectorized;
9303 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9304 UnsortedResults =
9305 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9306 UnsortedNonVectorized, Final,
9307 OriginalLoads.size());
9308 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9309 SortedNonVectorized.swap(UnsortedNonVectorized);
9310 Results.swap(UnsortedResults);
9311 }
9312 }
9313 for (auto [Slice, _] : Results) {
9314 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9315 << Slice.size() << ")\n");
9316 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9317 for (Value *L : Slice)
9318 if (!isVectorized(L))
9319 SortedNonVectorized.push_back(cast<LoadInst>(L));
9320 continue;
9321 }
9322
9323 // Select maximum VF as a maximum of user gathered nodes and
9324 // distance between scalar loads in these nodes.
9325 unsigned MaxVF = Slice.size();
9326 unsigned UserMaxVF = 0;
9327 unsigned InterleaveFactor = 0;
9328 if (MaxVF == 2) {
9329 UserMaxVF = MaxVF;
9330 } else {
9331 // Find the distance between segments of the interleaved loads.
9332 std::optional<unsigned> InterleavedLoadsDistance = 0;
9333 unsigned Order = 0;
9334 std::optional<unsigned> CommonVF = 0;
9335 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9336 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9337 for (auto [Idx, V] : enumerate(Slice)) {
9338 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9339 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9340 unsigned Pos =
9341 EntryToPosition.try_emplace(E, Idx).first->second;
9342 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9343 if (CommonVF) {
9344 if (*CommonVF == 0) {
9345 CommonVF = E->Scalars.size();
9346 continue;
9347 }
9348 if (*CommonVF != E->Scalars.size())
9349 CommonVF.reset();
9350 }
9351 // Check if the load is part of an interleaved load.
9352 if (Pos != Idx && InterleavedLoadsDistance) {
9353 if (!DeinterleavedNodes.contains(E) &&
9354 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9355 if (isa<Constant>(V))
9356 return false;
9357 if (isVectorized(V))
9358 return true;
9359 const auto &Nodes = ValueToGatherNodes.at(V);
9360 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9361 !is_contained(Slice, V);
9362 })) {
9363 InterleavedLoadsDistance.reset();
9364 continue;
9365 }
9366 DeinterleavedNodes.insert(E);
9367 if (*InterleavedLoadsDistance == 0) {
9368 InterleavedLoadsDistance = Idx - Pos;
9369 continue;
9370 }
9371 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9372 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9373 InterleavedLoadsDistance.reset();
9374 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9375 }
9376 }
9377 }
9378 DeinterleavedNodes.clear();
9379 // Check if the large load represents an interleaved load operation.
9380 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9381 CommonVF.value_or(0) != 0) {
9382 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9383 unsigned VF = *CommonVF;
9384 OrdersType Order;
9385 SmallVector<Value *> PointerOps;
9386 // Segmented load detected - vectorize at maximum vector factor.
9387 if (InterleaveFactor <= Slice.size() &&
9388 TTI.isLegalInterleavedAccessType(
9389 getWidenedType(Slice.front()->getType(), VF),
9390 InterleaveFactor,
9391 cast<LoadInst>(Slice.front())->getAlign(),
9392 cast<LoadInst>(Slice.front())
9393 ->getPointerAddressSpace()) &&
9394 canVectorizeLoads(Slice, Slice.front(), Order,
9395 PointerOps) == LoadsState::Vectorize) {
9396 UserMaxVF = InterleaveFactor * VF;
9397 } else {
9398 InterleaveFactor = 0;
9399 }
9400 }
9401 // Cannot represent the loads as consecutive vectorizable nodes -
9402 // just exit.
9403 unsigned ConsecutiveNodesSize = 0;
9404 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9405 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9406 [&, Slice = Slice](const auto &P) {
9407 const auto *It = find_if(Slice, [&](Value *V) {
9408 return std::get<1>(P).contains(V);
9409 });
9410 if (It == Slice.end())
9411 return false;
9412 const TreeEntry &TE =
9413 *VectorizableTree[std::get<0>(P)];
9414 ArrayRef<Value *> VL = TE.Scalars;
9415 OrdersType Order;
9416 SmallVector<Value *> PointerOps;
9417 LoadsState State = canVectorizeLoads(
9418 VL, VL.front(), Order, PointerOps);
9419 if (State == LoadsState::ScatterVectorize ||
9420 State == LoadsState::StridedVectorize)
9421 return false;
9422 ConsecutiveNodesSize += VL.size();
9423 size_t Start = std::distance(Slice.begin(), It);
9424 size_t Sz = Slice.size() - Start;
9425 return Sz < VL.size() ||
9426 Slice.slice(Start, VL.size()) != VL;
9427 }))
9428 continue;
9429 // Try to build long masked gather loads.
9430 UserMaxVF = bit_ceil(UserMaxVF);
9431 if (InterleaveFactor == 0 &&
9432 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9433 [&, Slice = Slice](unsigned Idx) {
9434 OrdersType Order;
9435 SmallVector<Value *> PointerOps;
9436 return canVectorizeLoads(
9437 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9438 Slice[Idx * UserMaxVF], Order,
9439 PointerOps) ==
9440 LoadsState::ScatterVectorize;
9441 }))
9442 UserMaxVF = MaxVF;
9443 if (Slice.size() != ConsecutiveNodesSize)
9444 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9445 }
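// Greedily build gathered-load tree entries for the slice, halving the VF
// whenever a subslice cannot be vectorized.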
9446 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9447 bool IsVectorized = true;
9448 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9449 ArrayRef<Value *> SubSlice =
9450 Slice.slice(I, std::min(VF, E - I));
9451 if (isVectorized(SubSlice.front()))
9452 continue;
9453 // Check if the subslice belongs to a to-be-vectorized entry, which is
9454 // not equal to this entry.
9455 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9456 [&](const auto &P) {
9457 return !SubSlice.equals(
9458 VectorizableTree[std::get<0>(P)]
9459 ->Scalars) &&
9460 set_is_subset(SubSlice, std::get<1>(P));
9461 }))
9462 continue;
9463 unsigned Sz = VectorizableTree.size();
9464 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9465 if (Sz == VectorizableTree.size()) {
9466 IsVectorized = false;
9467 // Try non-interleaved vectorization with smaller vector
9468 // factor.
9469 if (InterleaveFactor > 0) {
9470 VF = 2 * (MaxVF / InterleaveFactor);
9471 InterleaveFactor = 0;
9472 }
9473 continue;
9474 }
9475 }
9476 if (IsVectorized)
9477 break;
9478 }
9479 }
9480 NonVectorized.append(SortedNonVectorized);
9481 }
9482 return NonVectorized;
9483 };
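// Run the vectorization attempt for every (block, base pointer, type) group;
// if enough loads remain scalar and masked gathers are legal, regroup the
// leftovers and try once more as a final attempt.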
9484 for (const auto &GLs : GatheredLoads) {
9485 const auto &Ref = GLs.second;
9486 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9487 if (!Ref.empty() && !NonVectorized.empty() &&
9488 std::accumulate(
9489 Ref.begin(), Ref.end(), 0u,
9490 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9491 -> unsigned { return S + LoadsDists.size(); }) !=
9492 NonVectorized.size() &&
9493 IsMaskedGatherSupported(NonVectorized)) {
9494 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
9495 FinalGatheredLoads;
9496 for (LoadInst *LI : NonVectorized) {
9497 // Reinsert non-vectorized loads to other list of loads with the same
9498 // base pointers.
9499 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9500 FinalGatheredLoads,
9501 /*AddNew=*/false);
9502 }
9503 // Final attempt to vectorize non-vectorized loads.
9504 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9505 }
9506 }
9507 // Try to vectorize postponed load entries, previously marked as gathered.
9508 for (unsigned Idx : LoadEntriesToVectorize) {
9509 const TreeEntry &E = *VectorizableTree[Idx];
9510 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9511 // Avoid reordering, if possible.
9512 if (!E.ReorderIndices.empty()) {
9513 // Build a mask out of the reorder indices and reorder scalars per this
9514 // mask.
9515 SmallVector<int> ReorderMask;
9516 inversePermutation(E.ReorderIndices, ReorderMask);
9517 reorderScalars(GatheredScalars, ReorderMask);
9518 }
9519 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9520 }
9521 // If no new entries were created, there are no gathered load entries that
9522 // must be handled.
9523 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9524 VectorizableTree.size())
9525 GatheredLoadsEntriesFirst.reset();
9526}
9527
9528/// Generates key/subkey pair for the given value to provide effective sorting
9529/// of the values and better detection of the vectorizable values sequences. The
9530/// keys/subkeys can be used for better sorting of the values themselves (keys)
9531/// and in values subgroups (subkeys).
9532static std::pair<size_t, size_t> generateKeySubkey(
9533 Value *V, const TargetLibraryInfo *TLI,
9534 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9535 bool AllowAlternate) {
9536 hash_code Key = hash_value(V->getValueID() + 2);
9537 hash_code SubKey = hash_value(0);
9538 // Sort the loads by the distance between the pointers.
9539 if (auto *LI = dyn_cast<LoadInst>(V)) {
9540 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9541 if (LI->isSimple())
9542 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9543 else
9544 Key = SubKey = hash_value(LI);
9545 } else if (isVectorLikeInstWithConstOps(V)) {
9546 // Sort extracts by the vector operands.
9547 if (isa<ExtractElementInst, UndefValue>(V))
9548 Key = hash_value(Value::UndefValueVal + 1);
9549 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9550 if (!isUndefVector(EI->getVectorOperand()).all() &&
9551 !isa<UndefValue>(EI->getIndexOperand()))
9552 SubKey = hash_value(EI->getVectorOperand());
9553 }
9554 } else if (auto *I = dyn_cast<Instruction>(V)) {
9555 // Sort other instructions just by the opcodes except for CMPInst.
9556 // For CMP also sort by the predicate kind.
9557 if ((isa<BinaryOperator, CastInst>(I)) &&
9558 isValidForAlternation(I->getOpcode())) {
9559 if (AllowAlternate)
9560 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9561 else
9562 Key = hash_combine(hash_value(I->getOpcode()), Key);
9563 SubKey = hash_combine(
9564 hash_value(I->getOpcode()), hash_value(I->getType()),
9565 hash_value(isa<BinaryOperator>(I)
9566 ? I->getType()
9567 : cast<CastInst>(I)->getOperand(0)->getType()));
9568 // For casts, look through the only operand to improve compile time.
9569 if (isa<CastInst>(I)) {
9570 std::pair<size_t, size_t> OpVals =
9571 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9572 /*AllowAlternate=*/true);
9573 Key = hash_combine(OpVals.first, Key);
9574 SubKey = hash_combine(OpVals.first, SubKey);
9575 }
9576 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9577 CmpInst::Predicate Pred = CI->getPredicate();
9578 if (CI->isCommutative())
9579 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9580 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
9581 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9582 hash_value(SwapPred),
9583 hash_value(CI->getOperand(0)->getType()));
9584 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9585 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
9586 if (isTriviallyVectorizable(ID)) {
9587 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9588 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9589 SubKey = hash_combine(hash_value(I->getOpcode()),
9590 hash_value(Call->getCalledFunction()));
9591 } else {
9592 Key = hash_combine(hash_value(Call), Key);
9593 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9594 }
9595 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9596 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9597 hash_value(Op.Tag), SubKey);
9598 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9599 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9600 SubKey = hash_value(Gep->getPointerOperand());
9601 else
9602 SubKey = hash_value(Gep);
9603 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9604 !isa<ConstantInt>(I->getOperand(1))) {
9605 // Do not try to vectorize instructions with potentially high cost.
9606 SubKey = hash_value(I);
9607 } else {
9608 SubKey = hash_value(I->getOpcode());
9609 }
9610 Key = hash_combine(hash_value(I->getParent()), Key);
9611 }
9612 return std::make_pair(Key, SubKey);
9613}
9614
9615/// Checks if the specified instruction \p I is a main operation for the given
9616/// \p MainOp and \p AltOp instructions.
9617static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9618 Instruction *AltOp, const TargetLibraryInfo &TLI);
9619
9620bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9621 ArrayRef<Value *> VL) const {
9622 Type *ScalarTy = S.getMainOp()->getType();
9623 unsigned Opcode0 = S.getOpcode();
9624 unsigned Opcode1 = S.getAltOpcode();
9625 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9626 // If this pattern is supported by the target then consider it profitable.
9627 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9628 Opcode1, OpcodeMask))
9629 return true;
9630 SmallVector<SmallVector<Value *>> Operands;
9631 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9632 Operands.emplace_back();
9633 // Prepare the operand vector.
9634 for (Value *V : VL) {
9635 if (isa<PoisonValue>(V)) {
9636 Operands.back().push_back(
9637 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9638 continue;
9639 }
9640 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9641 }
9642 }
9643 if (Operands.size() == 2) {
9644 // Try to find the best operand candidates.
9645 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9646 SmallVector<std::pair<Value *, Value *>> Candidates(3);
9647 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9648 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9649 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9650 std::optional<int> Res = findBestRootPair(Candidates);
9651 switch (Res.value_or(0)) {
9652 case 0:
9653 break;
9654 case 1:
9655 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9656 break;
9657 case 2:
9658 std::swap(Operands[0][I], Operands[1][I]);
9659 break;
9660 default:
9661 llvm_unreachable("Unexpected index.");
9662 }
9663 }
9664 }
9665 DenseSet<unsigned> UniqueOpcodes;
9666 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9667 unsigned NonInstCnt = 0;
9668 // Estimate number of instructions, required for the vectorized node and for
9669 // the buildvector node.
9670 unsigned UndefCnt = 0;
9671 // Count the number of extra shuffles, required for vector nodes.
9672 unsigned ExtraShuffleInsts = 0;
9673 // Check that operands do not contain same values and create either perfect
9674 // diamond match or shuffled match.
9675 if (Operands.size() == 2) {
9676 // Do not count same operands twice.
9677 if (Operands.front() == Operands.back()) {
9678 Operands.erase(Operands.begin());
9679 } else if (!allConstant(Operands.front()) &&
9680 all_of(Operands.front(), [&](Value *V) {
9681 return is_contained(Operands.back(), V);
9682 })) {
9683 Operands.erase(Operands.begin());
9684 ++ExtraShuffleInsts;
9685 }
9686 }
9687 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9688 // Vectorize the node, if:
9689 // 1. At least a single operand is constant or splat.
9690 // 2. Operands have many loop invariants (the instructions are not loop
9691 // invariants).
9692 // 3. At least a single unique operand is supposed to be vectorized.
9693 return none_of(Operands,
9694 [&](ArrayRef<Value *> Op) {
9695 if (allConstant(Op) ||
9696 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9697 getSameOpcode(Op, *TLI)))
9698 return false;
9699 SmallDenseMap<Value *, unsigned, 16> Uniques;
9700 for (Value *V : Op) {
9701 if (isa<Constant, ExtractElementInst>(V) ||
9702 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9703 if (isa<UndefValue>(V))
9704 ++UndefCnt;
9705 continue;
9706 }
9707 auto Res = Uniques.try_emplace(V, 0);
9708 // Found first duplicate - need to add shuffle.
9709 if (!Res.second && Res.first->second == 1)
9710 ++ExtraShuffleInsts;
9711 ++Res.first->getSecond();
9712 if (auto *I = dyn_cast<Instruction>(V))
9713 UniqueOpcodes.insert(I->getOpcode());
9714 else if (Res.second)
9715 ++NonInstCnt;
9716 }
9717 return none_of(Uniques, [&](const auto &P) {
9718 return P.first->hasNUsesOrMore(P.second + 1) &&
9719 none_of(P.first->users(), [&](User *U) {
9720 return isVectorized(U) || Uniques.contains(U);
9721 });
9722 });
9723 }) ||
9724 // Do not vectorize node, if estimated number of vector instructions is
9725 // more than estimated number of buildvector instructions. Number of
9726 // vector operands is number of vector instructions + number of vector
9727 // instructions for operands (buildvectors). Number of buildvector
9728 // instructions is just number_of_operands * number_of_scalars.
9729 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9730 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9731 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9732}
9733
9734/// Builds the argument types vector for the given call instruction with the
9735/// given \p ID for the specified vector factor.
9736 static SmallVector<Type *>
9737 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
9738 const unsigned VF, unsigned MinBW,
9739 const TargetTransformInfo *TTI) {
9740 SmallVector<Type *> ArgTys;
9741 for (auto [Idx, Arg] : enumerate(CI->args())) {
9742 if (ID != Intrinsic::not_intrinsic) {
9743 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
9744 ArgTys.push_back(Arg->getType());
9745 continue;
9746 }
9747 if (MinBW > 0) {
9748 ArgTys.push_back(
9749 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9750 continue;
9751 }
9752 }
9753 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9754 }
9755 return ArgTys;
9756}
9757
9758/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
9759/// function (if possible) calls. Returns invalid cost for the corresponding
9760/// calls, if they cannot be vectorized/will be scalarized.
9761static std::pair<InstructionCost, InstructionCost>
9762 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9763 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9764 ArrayRef<Type *> ArgTys) {
9765 auto Shape = VFShape::get(CI->getFunctionType(),
9766 ElementCount::getFixed(VecTy->getNumElements()),
9767 false /*HasGlobalPred*/);
9768 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9769 auto LibCost = InstructionCost::getInvalid();
9770 if (!CI->isNoBuiltin() && VecFunc) {
9771 // Calculate the cost of the vector library call.
9772 // If the corresponding vector call is cheaper, return its cost.
9773 LibCost =
9774 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9775 }
9776 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9777
9778 // Calculate the cost of the vector intrinsic call.
9779 FastMathFlags FMF;
9780 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9781 FMF = FPCI->getFastMathFlags();
9782 const InstructionCost ScalarLimit = 10000;
9783 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9784 LibCost.isValid() ? LibCost : ScalarLimit);
9785 auto IntrinsicCost =
9786 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9787 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9788 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9789 IntrinsicCost = InstructionCost::getInvalid();
9790
9791 return {IntrinsicCost, LibCost};
9792}
9793
9794BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9795 const InstructionsState &S, ArrayRef<Value *> VL,
9796 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9797 SmallVectorImpl<Value *> &PointerOps) {
9798 assert(S.getMainOp() &&
9799 "Expected instructions with same/alternate opcodes only.");
9800
9801 unsigned ShuffleOrOp =
9802 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9803 Instruction *VL0 = S.getMainOp();
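// Dispatch on the opcode of the bundle (ShuffleVector for alternate
// sequences) and decide which kind of tree entry the scalars can form.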
9804 switch (ShuffleOrOp) {
9805 case Instruction::PHI: {
9806 // Too many operands - gather, most probably won't be vectorized.
9807 if (VL0->getNumOperands() > MaxPHINumOperands)
9808 return TreeEntry::NeedToGather;
9809 // Check for terminator values (e.g. invoke).
9810 for (Value *V : VL) {
9811 auto *PHI = dyn_cast<PHINode>(V);
9812 if (!PHI)
9813 continue;
9814 for (Value *Incoming : PHI->incoming_values()) {
9815 Instruction *Term = dyn_cast<Instruction>(Incoming);
9816 if (Term && Term->isTerminator()) {
9818 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9819 return TreeEntry::NeedToGather;
9820 }
9821 }
9822 }
9823
9824 return TreeEntry::Vectorize;
9825 }
9826 case Instruction::ExtractElement:
9827 if (any_of(VL, [&](Value *V) {
9828 auto *EI = dyn_cast<ExtractElementInst>(V);
9829 if (!EI)
9830 return true;
9831 return isVectorized(EI->getOperand(0));
9832 }))
9833 return TreeEntry::NeedToGather;
9834 [[fallthrough]];
9835 case Instruction::ExtractValue: {
9836 bool Reuse = canReuseExtract(VL, CurrentOrder);
9837 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9838 // non-full registers).
9839 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
9840 return TreeEntry::NeedToGather;
9841 if (Reuse || !CurrentOrder.empty())
9842 return TreeEntry::Vectorize;
9843 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9844 return TreeEntry::NeedToGather;
9845 }
9846 case Instruction::InsertElement: {
9847 // Check that we have a buildvector and not a shuffle of 2 or more
9848 // different vectors.
9849 ValueSet SourceVectors;
9850 for (Value *V : VL) {
9851 if (isa<PoisonValue>(V)) {
9852 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
9853 return TreeEntry::NeedToGather;
9854 }
9855 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
9856 assert(getElementIndex(V) != std::nullopt &&
9857 "Non-constant or undef index?");
9858 }
9859
9860 if (count_if(VL, [&SourceVectors](Value *V) {
9861 return !SourceVectors.contains(V);
9862 }) >= 2) {
9863 // Found 2nd source vector - cancel.
9864 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9865 "different source vectors.\n");
9866 return TreeEntry::NeedToGather;
9867 }
9868
9869 if (any_of(VL, [&SourceVectors](Value *V) {
9870 // The last InsertElement can have multiple uses.
9871 return SourceVectors.contains(V) && !V->hasOneUse();
9872 })) {
9873 assert(SLPReVec && "Only supported by REVEC.");
9874 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9875 "multiple uses.\n");
9876 return TreeEntry::NeedToGather;
9877 }
9878
9879 return TreeEntry::Vectorize;
9880 }
9881 case Instruction::Load: {
9882 // Check that a vectorized load would load the same memory as a scalar
9883 // load. For example, we don't want to vectorize loads that are smaller
9884 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9885 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9886 // from such a struct, we read/write packed bits disagreeing with the
9887 // unvectorized version.
9888 auto IsGatheredNode = [&]() {
9889 if (!GatheredLoadsEntriesFirst)
9890 return false;
9891 return all_of(VL, [&](Value *V) {
9892 if (isa<PoisonValue>(V))
9893 return true;
9894 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
9895 return TE->Idx >= *GatheredLoadsEntriesFirst;
9896 });
9897 });
9898 };
9899 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
9900 case LoadsState::Vectorize:
9901 return TreeEntry::Vectorize;
9902 case LoadsState::CompressVectorize:
9903 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9904 // Delay slow vectorized nodes for better vectorization attempts.
9905 LoadEntriesToVectorize.insert(VectorizableTree.size());
9906 return TreeEntry::NeedToGather;
9907 }
9908 return IsGatheredNode() ? TreeEntry::NeedToGather
9909 : TreeEntry::CompressVectorize;
9910 case LoadsState::ScatterVectorize:
9911 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9912 // Delay slow vectorized nodes for better vectorization attempts.
9913 LoadEntriesToVectorize.insert(VectorizableTree.size());
9914 return TreeEntry::NeedToGather;
9915 }
9916 return IsGatheredNode() ? TreeEntry::NeedToGather
9917 : TreeEntry::ScatterVectorize;
9918 case LoadsState::StridedVectorize:
9919 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9920 // Delay slow vectorized nodes for better vectorization attempts.
9921 LoadEntriesToVectorize.insert(VectorizableTree.size());
9922 return TreeEntry::NeedToGather;
9923 }
9924 return IsGatheredNode() ? TreeEntry::NeedToGather
9925 : TreeEntry::StridedVectorize;
9926 case LoadsState::Gather:
9927#ifndef NDEBUG
9928 Type *ScalarTy = VL0->getType();
9929 if (DL->getTypeSizeInBits(ScalarTy) !=
9930 DL->getTypeAllocSizeInBits(ScalarTy))
9931 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9932 else if (any_of(VL, [](Value *V) {
9933 auto *LI = dyn_cast<LoadInst>(V);
9934 return !LI || !LI->isSimple();
9935 }))
9936 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9937 else
9938 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9939#endif // NDEBUG
9941 return TreeEntry::NeedToGather;
9942 }
9943 llvm_unreachable("Unexpected state of loads");
9944 }
9945 case Instruction::ZExt:
9946 case Instruction::SExt:
9947 case Instruction::FPToUI:
9948 case Instruction::FPToSI:
9949 case Instruction::FPExt:
9950 case Instruction::PtrToInt:
9951 case Instruction::IntToPtr:
9952 case Instruction::SIToFP:
9953 case Instruction::UIToFP:
9954 case Instruction::Trunc:
9955 case Instruction::FPTrunc:
9956 case Instruction::BitCast: {
9957 Type *SrcTy = VL0->getOperand(0)->getType();
9958 for (Value *V : VL) {
9959 if (isa<PoisonValue>(V))
9960 continue;
9961 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
9962 if (Ty != SrcTy || !isValidElementType(Ty)) {
9963 LLVM_DEBUG(
9964 dbgs() << "SLP: Gathering casts with different src types.\n");
9965 return TreeEntry::NeedToGather;
9966 }
9967 }
9968 return TreeEntry::Vectorize;
9969 }
9970 case Instruction::ICmp:
9971 case Instruction::FCmp: {
9972 // Check that all of the compares have the same predicate.
9973 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
9974 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
9975 Type *ComparedTy = VL0->getOperand(0)->getType();
9976 for (Value *V : VL) {
9977 if (isa<PoisonValue>(V))
9978 continue;
9979 auto *Cmp = cast<CmpInst>(V);
9980 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
9981 Cmp->getOperand(0)->getType() != ComparedTy) {
9982 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
9983 return TreeEntry::NeedToGather;
9984 }
9985 }
9986 return TreeEntry::Vectorize;
9987 }
9988 case Instruction::Select:
9989 case Instruction::FNeg:
9990 case Instruction::Add:
9991 case Instruction::FAdd:
9992 case Instruction::Sub:
9993 case Instruction::FSub:
9994 case Instruction::Mul:
9995 case Instruction::FMul:
9996 case Instruction::UDiv:
9997 case Instruction::SDiv:
9998 case Instruction::FDiv:
9999 case Instruction::URem:
10000 case Instruction::SRem:
10001 case Instruction::FRem:
10002 case Instruction::Shl:
10003 case Instruction::LShr:
10004 case Instruction::AShr:
10005 case Instruction::And:
10006 case Instruction::Or:
10007 case Instruction::Xor:
10008 case Instruction::Freeze:
10009 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10010 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10011 auto *I = dyn_cast<Instruction>(V);
10012 return I && I->isBinaryOp() && !I->isFast();
10013 }))
10014 return TreeEntry::NeedToGather;
10015 return TreeEntry::Vectorize;
10016 case Instruction::GetElementPtr: {
10017 // We don't combine GEPs with complicated (nested) indexing.
10018 for (Value *V : VL) {
10019 auto *I = dyn_cast<GetElementPtrInst>(V);
10020 if (!I)
10021 continue;
10022 if (I->getNumOperands() != 2) {
10023 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10024 return TreeEntry::NeedToGather;
10025 }
10026 }
10027
10028 // We can't combine several GEPs into one vector if they operate on
10029 // different types.
10030 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10031 for (Value *V : VL) {
10032 auto *GEP = dyn_cast<GEPOperator>(V);
10033 if (!GEP)
10034 continue;
10035 Type *CurTy = GEP->getSourceElementType();
10036 if (Ty0 != CurTy) {
10037 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10038 return TreeEntry::NeedToGather;
10039 }
10040 }
10041
10042 // We don't combine GEPs with non-constant indexes.
10043 Type *Ty1 = VL0->getOperand(1)->getType();
10044 for (Value *V : VL) {
10045 auto *I = dyn_cast<GetElementPtrInst>(V);
10046 if (!I)
10047 continue;
10048 auto *Op = I->getOperand(1);
10049 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10050 (Op->getType() != Ty1 &&
10051 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10052 Op->getType()->getScalarSizeInBits() >
10053 DL->getIndexSizeInBits(
10054 V->getType()->getPointerAddressSpace())))) {
10055 LLVM_DEBUG(
10056 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10057 return TreeEntry::NeedToGather;
10058 }
10059 }
10060
10061 return TreeEntry::Vectorize;
10062 }
10063 case Instruction::Store: {
10064 // Check if the stores are consecutive or if we need to swizzle them.
10065 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10066 // Avoid types that are padded when being allocated as scalars, while
10067 // being packed together in a vector (such as i1).
10068 if (DL->getTypeSizeInBits(ScalarTy) !=
10069 DL->getTypeAllocSizeInBits(ScalarTy)) {
10070 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10071 return TreeEntry::NeedToGather;
10072 }
10073 // Make sure all stores in the bundle are simple - we can't vectorize
10074 // atomic or volatile stores.
10075 for (Value *V : VL) {
10076 auto *SI = cast<StoreInst>(V);
10077 if (!SI->isSimple()) {
10078 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10079 return TreeEntry::NeedToGather;
10080 }
10081 PointerOps.push_back(SI->getPointerOperand());
10082 }
10083
10084 // Check the order of pointer operands.
10085 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10086 Value *Ptr0;
10087 Value *PtrN;
10088 if (CurrentOrder.empty()) {
10089 Ptr0 = PointerOps.front();
10090 PtrN = PointerOps.back();
10091 } else {
10092 Ptr0 = PointerOps[CurrentOrder.front()];
10093 PtrN = PointerOps[CurrentOrder.back()];
10094 }
10095 std::optional<int64_t> Dist =
10096 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10097 // Check that the sorted pointer operands are consecutive.
10098 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10099 return TreeEntry::Vectorize;
10100 }
10101
10102 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10103 return TreeEntry::NeedToGather;
10104 }
10105 case Instruction::Call: {
10106 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10107 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10108 auto *I = dyn_cast<Instruction>(V);
10109 return I && !I->isFast();
10110 }))
10111 return TreeEntry::NeedToGather;
10112 // Check if the calls are all to the same vectorizable intrinsic or
10113 // library function.
10114 CallInst *CI = cast<CallInst>(VL0);
10115 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10116
10117 VFShape Shape = VFShape::get(
10118 CI->getFunctionType(),
10119 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10120 false /*HasGlobalPred*/);
10121 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10122
10123 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10124 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10125 return TreeEntry::NeedToGather;
10126 }
10127 Function *F = CI->getCalledFunction();
10128 unsigned NumArgs = CI->arg_size();
10129 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10130 for (unsigned J = 0; J != NumArgs; ++J)
10131 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
10132 ScalarArgs[J] = CI->getArgOperand(J);
10133 for (Value *V : VL) {
10134 CallInst *CI2 = dyn_cast<CallInst>(V);
10135 if (!CI2 || CI2->getCalledFunction() != F ||
10136 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10137 (VecFunc &&
10138 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10140 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10141 << "\n");
10142 return TreeEntry::NeedToGather;
10143 }
10144 // Some intrinsics have scalar arguments and should be same in order for
10145 // them to be vectorized.
10146 for (unsigned J = 0; J != NumArgs; ++J) {
10147 if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI)) {
10148 Value *A1J = CI2->getArgOperand(J);
10149 if (ScalarArgs[J] != A1J) {
10150 LLVM_DEBUG(dbgs()
10151 << "SLP: mismatched arguments in call:" << *CI
10152 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10153 return TreeEntry::NeedToGather;
10154 }
10155 }
10156 }
10157 // Verify that the bundle operands are identical between the two calls.
10158 if (CI->hasOperandBundles() &&
10159 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10160 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10161 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10162 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10163 << "!=" << *V << '\n');
10164 return TreeEntry::NeedToGather;
10165 }
10166 }
10167 SmallVector<Type *> ArgTys =
10168 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10169 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10170 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10171 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10172 return TreeEntry::NeedToGather;
10173
10174 return TreeEntry::Vectorize;
10175 }
10176 case Instruction::ShuffleVector: {
10177 if (!S.isAltShuffle()) {
10178 // REVEC can support non-alternate shuffles.
10179 if (SLPReVec && getShufflevectorNumGroups(VL))
10180 return TreeEntry::Vectorize;
10181 // If this is not an alternate sequence of opcode like add-sub
10182 // then do not vectorize this instruction.
10183 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10184 return TreeEntry::NeedToGather;
10185 }
10186 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10187 LLVM_DEBUG(
10188 dbgs()
10189 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10190 "the whole alt sequence is not profitable.\n");
10191 return TreeEntry::NeedToGather;
10192 }
10193
10194 return TreeEntry::Vectorize;
10195 }
10196 default:
10197 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10198 return TreeEntry::NeedToGather;
10199 }
10200}
10201
10202namespace {
10203/// Allows to correctly handle operands of the phi nodes based on the \p Main
10204/// PHINode order of incoming basic blocks/values.
10205class PHIHandler {
10206 DominatorTree &DT;
10207 PHINode *Main = nullptr;
10208 SmallVector<Value *> Phis;
10209 SmallVector<SmallVector<Value *>> Operands;
10210
10211public:
10212 PHIHandler() = delete;
10213 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10214 : DT(DT), Main(Main), Phis(Phis),
10215 Operands(Main->getNumIncomingValues(),
10216 SmallVector<Value *>(Phis.size(), nullptr)) {}
10217 void buildOperands() {
10218 constexpr unsigned FastLimit = 4;
10219 if (Main->getNumIncomingValues() <= FastLimit) {
10220 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10221 BasicBlock *InBB = Main->getIncomingBlock(I);
10222 if (!DT.isReachableFromEntry(InBB)) {
10223 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10224 continue;
10225 }
10226 // Prepare the operand vector.
10227 for (auto [Idx, V] : enumerate(Phis)) {
10228 auto *P = dyn_cast<PHINode>(V);
10229 if (!P) {
10230 assert(isa<PoisonValue>(V) &&
10231 "Expected isa instruction or poison value.");
10232 Operands[I][Idx] = V;
10233 continue;
10234 }
10235 if (P->getIncomingBlock(I) == InBB)
10236 Operands[I][Idx] = P->getIncomingValue(I);
10237 else
10238 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10239 }
10240 }
10241 return;
10242 }
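// Slow path for phis with many incoming values: group the incoming blocks
// first so that repeated blocks are processed once and their operand lists
// can simply be copied.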
10243 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10244 Blocks;
10245 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10246 BasicBlock *InBB = Main->getIncomingBlock(I);
10247 if (!DT.isReachableFromEntry(InBB)) {
10248 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10249 continue;
10250 }
10251 Blocks.try_emplace(InBB).first->second.push_back(I);
10252 }
10253 for (auto [Idx, V] : enumerate(Phis)) {
10254 if (isa<PoisonValue>(V)) {
10255 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10256 Operands[I][Idx] = V;
10257 continue;
10258 }
10259 auto *P = cast<PHINode>(V);
10260 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10261 BasicBlock *InBB = P->getIncomingBlock(I);
10262 if (InBB == Main->getIncomingBlock(I)) {
10263 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10264 continue;
10265 Operands[I][Idx] = P->getIncomingValue(I);
10266 continue;
10267 }
10268 auto *It = Blocks.find(InBB);
10269 if (It == Blocks.end())
10270 continue;
10271 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10272 }
10273 }
10274 for (const auto &P : Blocks) {
10275 ArrayRef<unsigned> IncomingValues = P.second;
10276 if (IncomingValues.size() <= 1)
10277 continue;
10278 unsigned BasicI = IncomingValues.consume_front();
10279 for (unsigned I : IncomingValues) {
10280 assert(all_of(enumerate(Operands[I]),
10281 [&](const auto &Data) {
10282 return !Data.value() ||
10283 Data.value() == Operands[BasicI][Data.index()];
10284 }) &&
10285 "Expected empty operands list.");
10286 Operands[I] = Operands[BasicI];
10287 }
10288 }
10289 }
10290 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10291};
10292} // namespace
10293
10294/// Returns the main/alternate instructions for the given \p VL. Unlike
10295/// getSameOpcode, it supports non-compatible instructions for better
10296/// SplitVectorize node support.
10297/// \returns the first main/alt instructions if the list contains only poisons
10298/// and instructions with only 2 opcodes. Returns a pair of nullptrs otherwise.
10299static std::pair<Instruction *, Instruction *>
10301 Instruction *MainOp = nullptr;
10302 Instruction *AltOp = nullptr;
10303 for (Value *V : VL) {
10304 if (isa<PoisonValue>(V))
10305 continue;
10306 auto *I = dyn_cast<Instruction>(V);
10307 if (!I)
10308 return {};
10309 if (!MainOp) {
10310 MainOp = I;
10311 continue;
10312 }
10313 if (MainOp->getOpcode() == I->getOpcode()) {
10314 if (I->getParent() != MainOp->getParent())
10315 return {};
10316 continue;
10317 }
10318 if (!AltOp) {
10319 AltOp = I;
10320 continue;
10321 }
10322 if (AltOp->getOpcode() == I->getOpcode()) {
10323 if (I->getParent() != AltOp->getParent())
10324 return {};
10325 continue;
10326 }
10327 return {};
10328 }
10329 if (!AltOp)
10330 return {};
10331 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10332 "Expected different main and alt instructions.");
10333 return std::make_pair(MainOp, AltOp);
10334}
10335
10336/// Checks that every instruction appears once in the list and if not, packs
10337/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10338/// unique scalars is extended by poison values to the whole register size.
10339///
10340/// \returns false if \p VL could not be uniquified, in which case \p VL is
10341/// unchanged and \p ReuseShuffleIndices is empty.
10342 static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10343 SmallVectorImpl<int> &ReuseShuffleIndices,
10344 const TargetTransformInfo &TTI,
10345 const TargetLibraryInfo &TLI,
10346 const InstructionsState &S,
10347 const BoUpSLP::EdgeInfo &UserTreeIdx,
10348 bool TryPad = false) {
10349 // Check that every instruction appears once in this bundle.
10350 SmallVector<Value *> UniqueValues;
10351 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10352 for (Value *V : VL) {
10353 if (isConstant(V)) {
10354 // Constants are always considered distinct, even if the same constant
10355 // appears multiple times in VL.
10356 ReuseShuffleIndices.emplace_back(
10357 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10358 UniqueValues.emplace_back(V);
10359 continue;
10360 }
10361 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10362 ReuseShuffleIndices.emplace_back(Res.first->second);
10363 if (Res.second)
10364 UniqueValues.emplace_back(V);
10365 }
10366
10367 // Easy case: VL has unique values and a "natural" size
10368 size_t NumUniqueScalarValues = UniqueValues.size();
10369 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10370 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10371 if (NumUniqueScalarValues == VL.size() &&
10372 (VectorizeNonPowerOf2 || IsFullVectors)) {
10373 ReuseShuffleIndices.clear();
10374 return true;
10375 }
10376
10377 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10378 if ((UserTreeIdx.UserTE &&
10379 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10381 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10382 "for nodes with padding.\n");
10383 ReuseShuffleIndices.clear();
10384 return false;
10385 }
10386
10387 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10388 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10389 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10390 return isa<UndefValue>(V) || !isConstant(V);
10391 }))) {
10392 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10393 S.getMainOp()->isSafeToRemove() &&
10394 (S.areInstructionsWithCopyableElements() ||
10395 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10396 // Find the number of elements, which forms full vectors.
10397 unsigned PWSz = getFullVectorNumberOfElements(
10398 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10399 PWSz = std::min<unsigned>(PWSz, VL.size());
10400 if (PWSz == VL.size()) {
10401 // We ended up with the same size after removing duplicates and
10402 // upgrading the resulting vector size to a "nice size". Just keep
10403 // the initial VL then.
10404 ReuseShuffleIndices.clear();
10405 } else {
10406 // Pad unique values with poison to grow the vector to a "nice" size
10407 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10408 UniqueValues.end());
10409 PaddedUniqueValues.append(
10410 PWSz - UniqueValues.size(),
10411 PoisonValue::get(UniqueValues.front()->getType()));
10412 // Check that the operations extended with poisons/copyable operations are
10413 // still valid for vectorization (div/rem are not allowed).
10414 if (!S.areInstructionsWithCopyableElements() &&
10415 !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
10416 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10417 ReuseShuffleIndices.clear();
10418 return false;
10419 }
10420 VL = std::move(PaddedUniqueValues);
10421 }
10422 return true;
10423 }
10424 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10425 ReuseShuffleIndices.clear();
10426 return false;
10427 }
10428 VL = std::move(UniqueValues);
10429 return true;
10430}
10431
10432bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10433 const InstructionsState &LocalState,
10434 SmallVectorImpl<Value *> &Op1,
10435 SmallVectorImpl<Value *> &Op2,
10436 OrdersType &ReorderIndices) const {
10437 constexpr unsigned SmallNodeSize = 4;
10438 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10439 !SplitAlternateInstructions)
10440 return false;
10441
10442 // Check if this is a duplicate of another split entry.
10443 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10444 << ".\n");
10445 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10446 if (E->isSame(VL)) {
10447 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10448 << *LocalState.getMainOp() << ".\n");
10449 return false;
10450 }
10451 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10452 if (all_of(VL, [&](Value *V) {
10453 return isa<PoisonValue>(V) || Values.contains(V);
10454 })) {
10455 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10456 return false;
10457 }
10458 }
10459
10460 ReorderIndices.assign(VL.size(), VL.size());
10461 SmallBitVector Op1Indices(VL.size());
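// Partition the scalars into the main-opcode group (Op1) and the
// alternate-opcode group (Op2), remembering each value's original position.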
10462 for (auto [Idx, V] : enumerate(VL)) {
10463 auto *I = dyn_cast<Instruction>(V);
10464 if (!I) {
10465 Op1.push_back(V);
10466 Op1Indices.set(Idx);
10467 continue;
10468 }
10469 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10470 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10471 *TLI)) ||
10472 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10473 !isAlternateInstruction(I, LocalState.getMainOp(),
10474 LocalState.getAltOp(), *TLI))) {
10475 Op1.push_back(V);
10476 Op1Indices.set(Idx);
10477 continue;
10478 }
10479 Op2.push_back(V);
10480 }
10481 Type *ScalarTy = getValueType(VL.front());
10482 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10483 unsigned Opcode0 = LocalState.getOpcode();
10484 unsigned Opcode1 = LocalState.getAltOpcode();
10485 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10486 // Enable the split node only if the values do not form a legal alternate
10487 // instruction (like X86 addsub).
10488 SmallPtrSet<Value *, 8> UOp1(llvm::from_range, Op1);
10489 SmallPtrSet<Value *, 8> UOp2(llvm::from_range, Op2);
10490 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10491 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10492 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10493 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10494 return false;
10495 // Enable split node, only if all nodes are power-of-2/full registers.
10496 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10497 for (unsigned Idx : seq<unsigned>(VL.size())) {
10498 if (Op1Indices.test(Idx)) {
10499 ReorderIndices[Op1Cnt] = Idx;
10500 ++Op1Cnt;
10501 } else {
10502 ReorderIndices[Op2Cnt] = Idx;
10503 ++Op2Cnt;
10504 }
10505 }
10506 if (isIdentityOrder(ReorderIndices))
10507 ReorderIndices.clear();
10508 SmallVector<int> Mask;
10509 if (!ReorderIndices.empty())
10510 inversePermutation(ReorderIndices, Mask);
10511 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10512 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10513 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10514 // Check for non-profitable single-register ops, which are better
10515 // represented as alternate ops.
10516 if (NumParts >= VL.size())
10517 return false;
10518 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10519 InstructionCost InsertCost = ::getShuffleCost(
10520 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10521 FixedVectorType *SubVecTy =
10522 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10523 InstructionCost NewShuffleCost =
10524 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10525 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10526 (Mask.empty() || InsertCost >= NewShuffleCost))
10527 return false;
10528 if ((LocalState.getMainOp()->isBinaryOp() &&
10529 LocalState.getAltOp()->isBinaryOp() &&
10530 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10531 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10532 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10533 (LocalState.getMainOp()->isUnaryOp() &&
10534 LocalState.getAltOp()->isUnaryOp())) {
10535 InstructionCost OriginalVecOpsCost =
10536 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10537 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10538 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10539 for (unsigned Idx : seq<unsigned>(VL.size())) {
10540 if (isa<PoisonValue>(VL[Idx]))
10541 continue;
10542 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10543 }
10544 InstructionCost OriginalCost =
10545 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10546 VecTy, OriginalMask, Kind);
10547 InstructionCost NewVecOpsCost =
10548 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10549 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10550 InstructionCost NewCost =
10551 NewVecOpsCost + InsertCost +
10552 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10553 VectorizableTree.front()->getOpcode() == Instruction::Store
10554 ? NewShuffleCost
10555 : 0);
10556 // If not profitable to split - exit.
10557 if (NewCost >= OriginalCost)
10558 return false;
10559 }
10560 return true;
10561}
10562
10563namespace {
10564/// Class accepts an incoming list of values, checks if it is able to model
10565/// "copyable" values as compatible operations, and generates the list of values
10566/// for scheduling and the list of operands for the new nodes.
10567class InstructionsCompatibilityAnalysis {
10568 DominatorTree &DT;
10569 const DataLayout &DL;
10570 const TargetTransformInfo &TTI;
10571 const TargetLibraryInfo &TLI;
10572 unsigned MainOpcode = 0;
10573 Instruction *MainOp = nullptr;
10574
10575 /// Checks if the opcode is supported as the main opcode for copyable
10576 /// elements.
10577 static bool isSupportedOpcode(const unsigned Opcode) {
10578 return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10579 }
10580
10581 /// Identifies the best candidate value, which represents the main opcode
10582 /// operation.
10583 /// Currently the best candidate is the Add instruction whose parent block
10584 /// has the highest DFS incoming number (the block that dominates the others).
10585 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10586 BasicBlock *Parent = nullptr;
10587 // Checks if the instruction has supported opcode.
10588 auto IsSupportedInstruction = [&](Instruction *I) {
10589 return I && isSupportedOpcode(I->getOpcode()) &&
10590 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10591 };
10592 // Exclude operand instructions immediately to improve compile time; they
10593 // cannot be scheduled anyway.
10594 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10595 SmallDenseSet<Value *, 8> Operands;
10596 for (Value *V : VL) {
10597 auto *I = dyn_cast<Instruction>(V);
10598 if (!I)
10599 continue;
10600 if (!DT.isReachableFromEntry(I->getParent()))
10601 continue;
10602 if (Candidates.empty()) {
10603 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10604 Parent = I->getParent();
10605 Operands.insert(I->op_begin(), I->op_end());
10606 continue;
10607 }
10608 if (Parent == I->getParent()) {
10609 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10610 Operands.insert(I->op_begin(), I->op_end());
10611 continue;
10612 }
10613 auto *NodeA = DT.getNode(Parent);
10614 auto *NodeB = DT.getNode(I->getParent());
10615 assert(NodeA && "Should only process reachable instructions");
10616 assert(NodeB && "Should only process reachable instructions");
10617 assert((NodeA == NodeB) ==
10618 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10619 "Different nodes should have different DFS numbers");
10620 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10621 Candidates.clear();
10622 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10623 Parent = I->getParent();
10624 Operands.clear();
10625 Operands.insert(I->op_begin(), I->op_end());
10626 }
10627 }
10628 unsigned BestOpcodeNum = 0;
10629 MainOp = nullptr;
10630 for (const auto &P : Candidates) {
10631 if (P.second.size() < BestOpcodeNum)
10632 continue;
10633 for (Instruction *I : P.second) {
10634 if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10635 MainOp = I;
10636 BestOpcodeNum = P.second.size();
10637 break;
10638 }
10639 }
10640 }
10641 if (MainOp)
10642 MainOpcode = MainOp->getOpcode();
10643 }
10644
10645 /// Returns the idempotent value for the \p MainOp with the detected \p
10646 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10647 /// the operand itself, since V or V == V.
10648 Value *selectBestIdempotentValue() const {
10649 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10650 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10651 !MainOp->isCommutative());
10652 }
10653
10654 /// Returns the value and operands for \p V, depending on whether it is an
10655 /// original instruction, whose actual operands should be returned, or a
10656 /// copyable element, which should be represented as an idempotent instruction.
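/// For example (illustrative names): with Add as the main opcode, an original
/// instruction `a + b` yields the operands {a, b}, while a copyable element
/// `c` yields {c, 0}, i.e. the value itself plus the idempotent operand.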
10657 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10658 if (isa<PoisonValue>(V))
10659 return {V, V};
10660 if (!S.isCopyableElement(V))
10661 return convertTo(cast<Instruction>(V), S).second;
10662 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10663 return {V, selectBestIdempotentValue()};
10664 }
10665
10666 /// Builds operands for the original instructions.
10667 void
10668 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10669 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10670
10671 unsigned ShuffleOrOp =
10672 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10673 Instruction *VL0 = S.getMainOp();
10674
10675 switch (ShuffleOrOp) {
10676 case Instruction::PHI: {
10677 auto *PH = cast<PHINode>(VL0);
10678
10679 // Keeps the reordered operands to avoid code duplication.
10680 PHIHandler Handler(DT, PH, VL);
10681 Handler.buildOperands();
10682 Operands.assign(PH->getNumOperands(), {});
10683 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10684 Operands[I].assign(Handler.getOperands(I).begin(),
10685 Handler.getOperands(I).end());
10686 return;
10687 }
10688 case Instruction::ExtractValue:
10689 case Instruction::ExtractElement:
10690 // This is a special case, as it does not gather, but at the same time
10691 // we are not extending buildTreeRec() towards the operands.
10692 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10693 return;
10694 case Instruction::InsertElement:
10695 Operands.assign(2, {VL.size(), nullptr});
10696 for (auto [Idx, V] : enumerate(VL)) {
10697 auto *IE = cast<InsertElementInst>(V);
10698 for (auto [OpIdx, Ops] : enumerate(Operands))
10699 Ops[Idx] = IE->getOperand(OpIdx);
10700 }
10701 return;
10702 case Instruction::Load:
10703 Operands.assign(
10704 1, {VL.size(),
10705 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10706 for (auto [V, Op] : zip(VL, Operands.back())) {
10707 auto *LI = dyn_cast<LoadInst>(V);
10708 if (!LI)
10709 continue;
10710 Op = LI->getPointerOperand();
10711 }
10712 return;
10713 case Instruction::ZExt:
10714 case Instruction::SExt:
10715 case Instruction::FPToUI:
10716 case Instruction::FPToSI:
10717 case Instruction::FPExt:
10718 case Instruction::PtrToInt:
10719 case Instruction::IntToPtr:
10720 case Instruction::SIToFP:
10721 case Instruction::UIToFP:
10722 case Instruction::Trunc:
10723 case Instruction::FPTrunc:
10724 case Instruction::BitCast:
10725 case Instruction::ICmp:
10726 case Instruction::FCmp:
10727 case Instruction::Select:
10728 case Instruction::FNeg:
10729 case Instruction::Add:
10730 case Instruction::FAdd:
10731 case Instruction::Sub:
10732 case Instruction::FSub:
10733 case Instruction::Mul:
10734 case Instruction::FMul:
10735 case Instruction::UDiv:
10736 case Instruction::SDiv:
10737 case Instruction::FDiv:
10738 case Instruction::URem:
10739 case Instruction::SRem:
10740 case Instruction::FRem:
10741 case Instruction::Shl:
10742 case Instruction::LShr:
10743 case Instruction::AShr:
10744 case Instruction::And:
10745 case Instruction::Or:
10746 case Instruction::Xor:
10747 case Instruction::Freeze:
10748 case Instruction::Store:
10749 case Instruction::ShuffleVector:
10750 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10751 for (auto [Idx, V] : enumerate(VL)) {
10752 auto *I = dyn_cast<Instruction>(V);
10753 if (!I) {
10754 for (auto [OpIdx, Ops] : enumerate(Operands))
10755 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10756 continue;
10757 }
10758 auto [Op, ConvertedOps] = convertTo(I, S);
10759 for (auto [OpIdx, Ops] : enumerate(Operands))
10760 Ops[Idx] = ConvertedOps[OpIdx];
10761 }
10762 return;
10763 case Instruction::GetElementPtr: {
10764 Operands.assign(2, {VL.size(), nullptr});
10765 // Need to cast all indices to the same type before vectorization to
10766 // avoid crash.
10767 // Required to be able to find correct matches between different gather
10768 // nodes and reuse the vectorized values rather than trying to gather them
10769 // again.
10770 const unsigned IndexIdx = 1;
10771 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10772 Type *Ty =
10773 all_of(VL,
10774 [&](Value *V) {
10775 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10776 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10777 })
10778 ? VL0Ty
10779 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10780 ->getPointerOperandType()
10781 ->getScalarType());
10782 for (auto [Idx, V] : enumerate(VL)) {
10783 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10784 if (!GEP) {
10785 Operands[0][Idx] = V;
10786 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10787 continue;
10788 }
10789 Operands[0][Idx] = GEP->getPointerOperand();
10790 auto *Op = GEP->getOperand(IndexIdx);
10791 auto *CI = dyn_cast<ConstantInt>(Op);
10792 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10793 CI, Ty, CI->getValue().isSignBitSet(), DL)
10794 : Op;
10795 }
10796 return;
10797 }
10798 case Instruction::Call: {
10799 auto *CI = cast<CallInst>(VL0);
10800 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
10801 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10802 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
10803 continue;
10804 auto &Ops = Operands.emplace_back();
10805 for (Value *V : VL) {
10806 auto *I = dyn_cast<Instruction>(V);
10807 Ops.push_back(I ? I->getOperand(Idx)
10808 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10809 }
10810 }
10811 return;
10812 }
10813 default:
10814 break;
10815 }
10816 llvm_unreachable("Unexpected vectorization of the instructions.");
10817 }
10818
10819public:
10820 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
10821 const TargetTransformInfo &TTI,
10822 const TargetLibraryInfo &TLI)
10823 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
10824
10825 InstructionsState
10826 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
10827 bool TryCopyableElementsVectorization,
10828 bool WithProfitabilityCheck = false,
10829 bool SkipSameCodeCheck = false) {
10830 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
10831 ? InstructionsState::invalid()
10832 : getSameOpcode(VL, TLI);
10833 if (S)
10834 return S;
10835 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
10836 return S;
10837 findAndSetMainInstruction(VL, R);
10838 if (!MainOp)
10839 return InstructionsState::invalid();
10840 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
10841 if (!WithProfitabilityCheck)
10842 return S;
10843 // Check if it is profitable to vectorize the instruction.
10844 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
10845 auto BuildCandidates =
10846 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
10847 Value *V2) {
10848 if (V1 != V2 && isa<PHINode>(V1))
10849 return;
10850 auto *I1 = dyn_cast<Instruction>(V1);
10851 auto *I2 = dyn_cast<Instruction>(V2);
10852 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
10853 I1->getParent() != I2->getParent())
10854 return;
10855 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
10856 };
10857 if (VL.size() == 2) {
10858 // Check if the operands allow better vectorization.
10859 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
10860 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
10861 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
10862 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
10863 R.findBestRootPair(Candidates1) &&
10864 R.findBestRootPair(Candidates2);
10865 if (!Res && isCommutative(MainOp)) {
10866 Candidates1.clear();
10867 Candidates2.clear();
10868 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
10869 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
10870 Res = !Candidates1.empty() && !Candidates2.empty() &&
10871 R.findBestRootPair(Candidates1) &&
10872 R.findBestRootPair(Candidates2);
10873 }
10874 if (!Res)
10875 return InstructionsState::invalid();
10876 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10877 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10878 InstructionCost VectorCost;
10879 FixedVectorType *VecTy =
10880 getWidenedType(S.getMainOp()->getType(), VL.size());
10881 switch (MainOpcode) {
10882 case Instruction::Add:
10883 case Instruction::LShr:
10884 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10885 break;
10886 default:
10887 llvm_unreachable("Unexpected instruction.");
10888 }
10889 if (VectorCost > ScalarCost)
10890 return InstructionsState::invalid();
10891 return S;
10892 }
10893 assert(Operands.size() == 2 && "Unexpected number of operands!");
10894 unsigned CopyableNum =
10895 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
10896 if (CopyableNum < VL.size() / 2)
10897 return S;
10898 // Too many phi copyables - exit.
10899 const unsigned Limit = VL.size() / 24;
10900 if ((CopyableNum >= VL.size() - Limit ||
10901 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
10902 CopyableNum >= MaxPHINumOperands) &&
10903 all_of(VL, [&](Value *V) {
10904 return isa<PHINode>(V) || !S.isCopyableElement(V);
10905 }))
10906 return InstructionsState::invalid();
10907 // Check profitability if number of copyables > VL.size() / 2.
10908 // 1. Reorder operands for better matching.
10909 if (isCommutative(MainOp)) {
10910 for (auto &Ops : Operands) {
10911 // Make instructions the first operands.
10912 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
10913 std::swap(Ops.front(), Ops.back());
10914 continue;
10915 }
10916 // Make constants the second operands.
10917 if (isa<Constant>(Ops.front())) {
10918 std::swap(Ops.front(), Ops.back());
10919 continue;
10920 }
10921 }
10922 }
10923 // 2. Check if the operands can be vectorized.
10924 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
10925 return InstructionsState::invalid();
10926 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
10927 if (allConstant(Ops) || isSplat(Ops))
10928 return true;
10929 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
10930 // single one is different.
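// For example, the operand list {a, a, a, b} has two distinct values and one
// of them occurs exactly once, so it is treated here like a splat.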
10931 constexpr unsigned Limit = 4;
10932 if (Operands.front().size() >= Limit) {
10933 SmallDenseMap<Value *, unsigned> Counters;
10934 for (Value *V : Ops) {
10935 if (isa<UndefValue>(V))
10936 continue;
10937 ++Counters[V];
10938 }
10939 if (Counters.size() == 2 &&
10940 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
10941 return C.second == 1;
10942 }))
10943 return true;
10944 }
10945 // First operand not a constant or splat? Last attempt - check for
10946 // potential vectorization.
10947 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
10948 InstructionsState OpS = Analysis.buildInstructionsState(
10949 Ops, R, /*TryCopyableElementsVectorization=*/true);
10950 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
10951 return false;
10952 unsigned CopyableNum =
10953 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
10954 return CopyableNum <= VL.size() / 2;
10955 };
10956 if (!CheckOperand(Operands.front()))
10957 return InstructionsState::invalid();
10958
10959 return S;
10960 }
10961
10962 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
10963 ArrayRef<Value *> VL) {
10964 assert(S && "Invalid state!");
10965 SmallVector<BoUpSLP::ValueList> Operands;
10966 if (S.areInstructionsWithCopyableElements()) {
10967 MainOp = S.getMainOp();
10968 MainOpcode = S.getOpcode();
10969 Operands.assign(MainOp->getNumOperands(),
10970 BoUpSLP::ValueList(VL.size(), nullptr));
10971 for (auto [Idx, V] : enumerate(VL)) {
10972 SmallVector<Value *> OperandsForValue = getOperands(S, V);
10973 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
10974 Operands[OperandIdx][Idx] = Operand;
10975 }
10976 } else {
10977 buildOriginalOperands(S, VL, Operands);
10978 }
10979 return Operands;
10980 }
10981};
10982} // namespace
10983
10984BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
10985 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
10986 bool TryCopyableElementsVectorization) const {
10987 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
10988
10989 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
10990 InstructionsState S = Analysis.buildInstructionsState(
10991 VL, *this, TryCopyableElementsVectorization,
10992 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
10993
10994 // Don't go into catchswitch blocks, which can happen with PHIs.
10995 // Such blocks can only have PHIs and the catchswitch. There is no
10996 // place to insert a shuffle if we need to, so just avoid that issue.
10997 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
10998 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
10999 // Do not try to pack to avoid extra instructions here.
11000 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11001 /*TryToFindDuplicates=*/false);
11002 }
11003
11004 // Check if this is a duplicate of another entry.
11005 if (S) {
11006 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11007 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11008 if (E->isSame(VL)) {
11009 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11010 << ".\n");
11011 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11012 }
11013 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11014 if (all_of(VL, [&](Value *V) {
11015 return isa<PoisonValue>(V) || Values.contains(V);
11016 })) {
11017 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11018 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11019 }
11020 }
11021 }
11022
11023 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11024 // a load), in which case peek through to include it in the tree, without
11025 // ballooning over-budget.
11026 if (Depth >= RecursionMaxDepth &&
11027 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11028 (match(S.getMainOp(), m_Load(m_Value())) ||
11029 all_of(VL, [&S](const Value *I) {
11030 return match(I,
11031 m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
11032 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11033 })))) {
11034 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11035 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11036 }
11037
11038 // Don't handle scalable vectors
11039 if (S && S.getOpcode() == Instruction::ExtractElement &&
11040 isa<ScalableVectorType>(
11041 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11042 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11043 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11044 }
11045
11046 // Don't handle vectors.
11047 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11048 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11049 // Do not try to pack to avoid extra instructions here.
11050 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11051 /*TryToFindDuplicates=*/false);
11052 }
11053
11054 // If all of the operands are identical or constant we have a simple solution.
11055 // If we deal with insert/extract instructions, they all must have constant
11056 // indices, otherwise we should gather them, not try to vectorize.
11057 // If this is an alternate op node with 2 elements whose operands would be
11058 // gathered, do not vectorize.
11059 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11060 if (!S || !S.isAltShuffle() || VL.size() > 2)
11061 return false;
11062 if (VectorizableTree.size() < MinTreeSize)
11063 return false;
11064 if (Depth >= RecursionMaxDepth - 1)
11065 return true;
11066 // Check if all operands are extracts, are part of a vector node, or can
11067 // build a regular vectorized node.
11068 SmallVector<unsigned, 8> InstsCount;
11069 for (Value *V : VL) {
11070 auto *I = cast<Instruction>(V);
11071 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11072 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11073 }));
11074 }
11075 bool IsCommutative =
11076 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11077 if ((IsCommutative &&
11078 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11079 (!IsCommutative &&
11080 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11081 return true;
11082 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11083 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11084 auto *I1 = cast<Instruction>(VL.front());
11085 auto *I2 = cast<Instruction>(VL.back());
11086 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11087 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11088 I2->getOperand(Op));
11089 if (static_cast<unsigned>(count_if(
11090 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11091 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11092 })) >= S.getMainOp()->getNumOperands() / 2)
11093 return false;
11094 if (S.getMainOp()->getNumOperands() > 2)
11095 return true;
11096 if (IsCommutative) {
11097 // Check permuted operands.
11098 Candidates.clear();
11099 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11100 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11101 I2->getOperand((Op + 1) % E));
11102 if (any_of(
11103 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11104 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11105 }))
11106 return false;
11107 }
11108 return true;
11109 };
11110 SmallVector<unsigned> SortedIndices;
11111 BasicBlock *BB = nullptr;
11112 bool IsScatterVectorizeUserTE =
11113 UserTreeIdx.UserTE &&
11114 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11115 bool AreAllSameBlock = S.valid();
11116 bool AreScatterAllGEPSameBlock =
11117 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11118 VL.size() > 2 &&
11119 all_of(VL,
11120 [&BB](Value *V) {
11121 auto *I = dyn_cast<GetElementPtrInst>(V);
11122 if (!I)
11123 return doesNotNeedToBeScheduled(V);
11124 if (!BB)
11125 BB = I->getParent();
11126 return BB == I->getParent() && I->getNumOperands() == 2;
11127 }) &&
11128 BB &&
11129 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11130 SortedIndices));
11131 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11132 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11133 (S &&
11134 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11135 S.getMainOp()) &&
11136 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11137 NotProfitableForVectorization(VL)) {
11138 if (!S) {
11139 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11140 "C,S,B,O, small shuffle. \n";
11141 dbgs() << "[";
11142 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11143 dbgs() << "]\n");
11144 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11145 /*TryToFindDuplicates=*/true,
11146 /*TrySplitVectorize=*/true);
11147 }
11148 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11149 dbgs() << "[";
11150 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11151 dbgs() << "]\n");
11152 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11153 }
11154
11155 // Don't vectorize ephemeral values.
11156 if (S && !EphValues.empty()) {
11157 for (Value *V : VL) {
11158 if (EphValues.count(V)) {
11159 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11160 << ") is ephemeral.\n");
11161 // Do not try to pack to avoid extra instructions here.
11162 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11163 /*TryToFindDuplicates=*/false);
11164 }
11165 }
11166 }
11167
11168 // We now know that this is a vector of instructions of the same type from
11169 // the same block.
11170
11171 // Check that none of the instructions in the bundle are already in the tree
11172 // and that the node is not unprofitable to vectorize as a small alternate
11173 // node.
11174 if (S && S.isAltShuffle()) {
11175 auto GetNumVectorizedExtracted = [&]() {
11176 APInt Extracted = APInt::getZero(VL.size());
11177 APInt Vectorized = APInt::getAllOnes(VL.size());
11178 for (auto [Idx, V] : enumerate(VL)) {
11179 auto *I = dyn_cast<Instruction>(V);
11180 if (!I || doesNotNeedToBeScheduled(I) ||
11181 all_of(I->operands(), [&](const Use &U) {
11182 return isa<ExtractElementInst>(U.get());
11183 }))
11184 continue;
11185 if (isVectorized(I))
11186 Vectorized.clearBit(Idx);
11187 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11188 Extracted.setBit(Idx);
11189 }
11190 return std::make_pair(Vectorized, Extracted);
11191 };
11192 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11193 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11194 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11195 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11196 // Rough cost estimation of whether the vector code (+ potential extracts)
11197 // is more profitable than the scalar code + buildvector.
11198 Type *ScalarTy = VL.front()->getType();
11199 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11200 InstructionCost VectorizeCostEstimate =
11202 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11203 /*Insert=*/false, /*Extract=*/true, Kind);
11204 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11205 *TTI, ScalarTy, VecTy, Vectorized,
11206 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11207 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11208 }
11209 if (PreferScalarize) {
11210 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11211 "node is not profitable.\n");
11212 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11213 }
11214 }
11215
11216 // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
11217 if (UserIgnoreList && !UserIgnoreList->empty()) {
11218 for (Value *V : VL) {
11219 if (UserIgnoreList->contains(V)) {
11220 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11221 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11222 }
11223 }
11224 }
11225
11226 // Special processing for sorted pointers for ScatterVectorize node with
11227 // constant indices only.
11228 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11229 assert(VL.front()->getType()->isPointerTy() &&
11230 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
11231 "Expected pointers only.");
11232 // Reset S to make it GetElementPtr kind of node.
11233 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11234 assert(It != VL.end() && "Expected at least one GEP.");
11235 S = getSameOpcode(*It, *TLI);
11236 }
11237
11238 // Check that all of the users of the scalars that we want to vectorize are
11239 // schedulable.
11240 Instruction *VL0 = S.getMainOp();
11241 BB = VL0->getParent();
11242
11243 if (S &&
11244 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11245 !DT->isReachableFromEntry(BB))) {
11246 // Don't go into unreachable blocks. They may contain instructions with
11247 // dependency cycles which confuse the final scheduling.
11248 // Do not vectorize EH and non-returning blocks, not profitable in most
11249 // cases.
11250 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11251 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11252 }
11253 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11254}
11255
11256void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11257 const EdgeInfo &UserTreeIdx,
11258 unsigned InterleaveFactor) {
11259 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11260
11261 SmallVector<int> ReuseShuffleIndices;
11262 SmallVector<Value *> VL(VLRef);
11263
11264 // Tries to build split node.
11265 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11266 SmallVector<Value *> Op1, Op2;
11267 OrdersType ReorderIndices;
11268 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11269 return false;
11270
11271 SmallVector<Value *> NewVL(VL.size());
11272 copy(Op1, NewVL.begin());
11273 copy(Op2, std::next(NewVL.begin(), Op1.size()));
11274 auto Invalid = ScheduleBundle::invalid();
11275 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11276 UserTreeIdx, {}, ReorderIndices);
11277 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11278 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11279 InstructionsState S = getSameOpcode(Op, *TLI);
11280 if (S && (isa<LoadInst>(S.getMainOp()) ||
11281 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11282 // Build gather node for loads, they will be gathered later.
11283 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11284 Idx == 0 ? 0 : Op1.size());
11285 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11286 } else {
11287 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11288 Idx == 0 ? 0 : Op1.size());
11289 buildTreeRec(Op, Depth, {TE, Idx});
11290 }
11291 };
11292 AddNode(Op1, 0);
11293 AddNode(Op2, 1);
11294 return true;
11295 };
11296
11297 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11298 bool AreConsts = false;
11299 for (Value *V : VL) {
11300 if (isa<PoisonValue>(V))
11301 continue;
11302 if (isa<Constant>(V)) {
11303 AreConsts = true;
11304 continue;
11305 }
11306 if (!isa<PHINode>(V))
11307 return false;
11308 }
11309 return AreConsts;
11310 };
11311 if (AreOnlyConstsWithPHIs(VL)) {
11312 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11313 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11314 return;
11315 }
11316
11317 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11318 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11319 InstructionsState S = Legality.getInstructionsState();
11320 if (!Legality.isLegal()) {
11321 if (Legality.trySplitVectorize()) {
11322 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11323 // Last chance to try to vectorize alternate node.
11324 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11325 return;
11326 }
11327 if (!S)
11328 Legality = getScalarsVectorizationLegality(
11329 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11330 if (!Legality.isLegal()) {
11331 if (Legality.tryToFindDuplicates())
11332 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11333 UserTreeIdx);
11334
11335 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11336 return;
11337 }
11338 S = Legality.getInstructionsState();
11339 }
11340
11341 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11342 if (S.isAltShuffle() && TrySplitNode(S))
11343 return;
11344
11345 // Check that every instruction appears once in this bundle.
11346 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11347 /*TryPad=*/true)) {
11348 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11349 return;
11350 }
11351
11352 // Perform specific checks for each particular instruction kind.
11353 bool IsScatterVectorizeUserTE =
11354 UserTreeIdx.UserTE &&
11355 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11356 OrdersType CurrentOrder;
11357 SmallVector<Value *> PointerOps;
11358 TreeEntry::EntryState State = getScalarsVectorizationState(
11359 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
11360 if (State == TreeEntry::NeedToGather) {
11361 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11362 return;
11363 }
11364
11365 Instruction *VL0 = S.getMainOp();
11366 BasicBlock *BB = VL0->getParent();
11367 auto &BSRef = BlocksSchedules[BB];
11368 if (!BSRef)
11369 BSRef = std::make_unique<BlockScheduling>(BB);
11370
11371 BlockScheduling &BS = *BSRef;
11372
11373 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11374 std::optional<ScheduleBundle *> BundlePtr =
11375 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11376#ifdef EXPENSIVE_CHECKS
11377 // Make sure we didn't break any internal invariants
11378 BS.verify();
11379#endif
11380 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11381 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11382 // Last chance to try to vectorize alternate node.
11383 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11384 return;
11385 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11386 NonScheduledFirst.insert(VL.front());
11387 if (S.getOpcode() == Instruction::Load &&
11388 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11389 registerNonVectorizableLoads(ArrayRef(VL));
11390 return;
11391 }
11392 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11393 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11394 ScheduleBundle Empty;
11395 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11396 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11397
11398 unsigned ShuffleOrOp =
11399 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11400 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11401 // Postpone PHI nodes creation
11402 SmallVector<unsigned> PHIOps;
11403 for (unsigned I : seq<unsigned>(Operands.size())) {
11404 ArrayRef<Value *> Op = Operands[I];
11405 if (Op.empty())
11406 continue;
11407 InstructionsState S = getSameOpcode(Op, *TLI);
11408 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11409 buildTreeRec(Op, Depth + 1, {TE, I});
11410 else
11411 PHIOps.push_back(I);
11412 }
11413 for (unsigned I : PHIOps)
11414 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11415 };
11416 switch (ShuffleOrOp) {
11417 case Instruction::PHI: {
11418 TreeEntry *TE =
11419 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11420 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11421 TE->dump());
11422
11423 TE->setOperands(Operands);
11424 CreateOperandNodes(TE, Operands);
11425 return;
11426 }
11427 case Instruction::ExtractValue:
11428 case Instruction::ExtractElement: {
11429 if (CurrentOrder.empty()) {
11430 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11431 } else {
11432 LLVM_DEBUG({
11433 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11434 "with order";
11435 for (unsigned Idx : CurrentOrder)
11436 dbgs() << " " << Idx;
11437 dbgs() << "\n";
11438 });
11439 fixupOrderingIndices(CurrentOrder);
11440 }
11441 // Insert new order with initial value 0, if it does not exist,
11442 // otherwise return the iterator to the existing one.
11443 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11444 ReuseShuffleIndices, CurrentOrder);
11445 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11446 "(ExtractValueInst/ExtractElementInst).\n";
11447 TE->dump());
11448 // This is a special case, as it does not gather, but at the same time
11449 // we are not extending buildTreeRec() towards the operands.
11450 TE->setOperands(Operands);
11451 return;
11452 }
11453 case Instruction::InsertElement: {
11454 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11455
11456 auto OrdCompare = [](const std::pair<int, int> &P1,
11457 const std::pair<int, int> &P2) {
11458 return P1.first > P2.first;
11459 };
11460 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11461 decltype(OrdCompare)>
11462 Indices(OrdCompare);
11463 for (int I = 0, E = VL.size(); I < E; ++I) {
11464 unsigned Idx = *getElementIndex(VL[I]);
11465 Indices.emplace(Idx, I);
11466 }
11467 OrdersType CurrentOrder(VL.size(), VL.size());
11468 bool IsIdentity = true;
11469 for (int I = 0, E = VL.size(); I < E; ++I) {
11470 CurrentOrder[Indices.top().second] = I;
11471 IsIdentity &= Indices.top().second == I;
11472 Indices.pop();
11473 }
11474 if (IsIdentity)
11475 CurrentOrder.clear();
11476 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11477 {}, CurrentOrder);
11478 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11479 TE->dump());
11480
11481 TE->setOperands(Operands);
11482 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11483 return;
11484 }
11485 case Instruction::Load: {
11486 // Check that a vectorized load would load the same memory as a scalar
11487 // load. For example, we don't want to vectorize loads that are smaller
11488 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11489 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11490 // from such a struct, we read/write packed bits disagreeing with the
11491 // unvectorized version.
11492 TreeEntry *TE = nullptr;
11493 fixupOrderingIndices(CurrentOrder);
11494 switch (State) {
11495 case TreeEntry::Vectorize:
11496 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11497 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11498 if (CurrentOrder.empty())
11499 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11500 TE->dump());
11501 else
11503 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11504 TE->dump());
11505 break;
11506 case TreeEntry::CompressVectorize:
11507 // Vectorizing non-consecutive loads with (masked)load + compress.
11508 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11509 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11510 LLVM_DEBUG(
11511 dbgs()
11512 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11513 TE->dump());
11514 break;
11515 case TreeEntry::StridedVectorize:
11516 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11517 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11518 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11519 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11520 TE->dump());
11521 break;
11522 case TreeEntry::ScatterVectorize:
11523 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11524 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11525 UserTreeIdx, ReuseShuffleIndices);
11526 LLVM_DEBUG(
11527 dbgs()
11528 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11529 TE->dump());
11530 break;
11531 case TreeEntry::CombinedVectorize:
11532 case TreeEntry::SplitVectorize:
11533 case TreeEntry::NeedToGather:
11534 llvm_unreachable("Unexpected loads state.");
11535 }
11536 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11537 assert(Operands.size() == 1 && "Expected a single operand only");
11538 SmallVector<int> Mask;
11539 inversePermutation(CurrentOrder, Mask);
11540 reorderScalars(Operands.front(), Mask);
11541 }
11542 TE->setOperands(Operands);
11543 if (State == TreeEntry::ScatterVectorize)
11544 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11545 return;
11546 }
11547 case Instruction::ZExt:
11548 case Instruction::SExt:
11549 case Instruction::FPToUI:
11550 case Instruction::FPToSI:
11551 case Instruction::FPExt:
11552 case Instruction::PtrToInt:
11553 case Instruction::IntToPtr:
11554 case Instruction::SIToFP:
11555 case Instruction::UIToFP:
11556 case Instruction::Trunc:
11557 case Instruction::FPTrunc:
11558 case Instruction::BitCast: {
11559 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11560 std::make_pair(std::numeric_limits<unsigned>::min(),
11561 std::numeric_limits<unsigned>::max()));
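// Record the widest and narrowest integer bit widths seen at the ends of
// ext/trunc casts in the tree; these bounds feed the later bit-width
// minimization analysis.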
11562 if (ShuffleOrOp == Instruction::ZExt ||
11563 ShuffleOrOp == Instruction::SExt) {
11564 CastMaxMinBWSizes = std::make_pair(
11565 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11566 PrevMaxBW),
11567 std::min<unsigned>(
11568 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11569 PrevMinBW));
11570 } else if (ShuffleOrOp == Instruction::Trunc) {
11571 CastMaxMinBWSizes = std::make_pair(
11572 std::max<unsigned>(
11573 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11574 PrevMaxBW),
11575 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11576 PrevMinBW));
11577 }
11578 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11579 ReuseShuffleIndices);
11580 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11581 TE->dump());
11582
11583 TE->setOperands(Operands);
11584 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11585 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11586 if (ShuffleOrOp == Instruction::Trunc) {
11587 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11588 } else if (ShuffleOrOp == Instruction::SIToFP ||
11589 ShuffleOrOp == Instruction::UIToFP) {
11590 unsigned NumSignBits =
11591 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11592 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11593 APInt Mask = DB->getDemandedBits(OpI);
11594 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11595 }
11596 if (NumSignBits * 2 >=
11597 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11598 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11599 }
11600 return;
11601 }
11602 case Instruction::ICmp:
11603 case Instruction::FCmp: {
11604 // Check that all of the compares have the same predicate.
11605 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11606 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11607 ReuseShuffleIndices);
11608 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11609 TE->dump());
11610
11611 VLOperands Ops(VL, Operands, S, *this);
11612 if (cast<CmpInst>(VL0)->isCommutative()) {
11613 // Commutative predicate - collect + sort operands of the instructions
11614 // so that each side is more likely to have the same opcode.
11616 "Commutative Predicate mismatch");
11617 Ops.reorder();
11618 Operands.front() = Ops.getVL(0);
11619 Operands.back() = Ops.getVL(1);
11620 } else {
11621 // Collect operands - commute if it uses the swapped predicate.
11622 for (auto [Idx, V] : enumerate(VL)) {
11623 if (isa<PoisonValue>(V))
11624 continue;
11625 auto *Cmp = cast<CmpInst>(V);
11626 if (Cmp->getPredicate() != P0)
11627 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11628 }
11629 }
11630 TE->setOperands(Operands);
11631 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11632 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
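// If the compared values effectively fit into half of their bit width
// (judged by the number of known sign bits), record the operand nodes as
// candidates for bit-width minimization.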
11633 if (ShuffleOrOp == Instruction::ICmp) {
11634 unsigned NumSignBits0 =
11635 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11636 if (NumSignBits0 * 2 >=
11637 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11638 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11639 unsigned NumSignBits1 =
11640 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11641 if (NumSignBits1 * 2 >=
11642 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11643 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11644 }
11645 return;
11646 }
11647 case Instruction::Select:
11648 case Instruction::FNeg:
11649 case Instruction::Add:
11650 case Instruction::FAdd:
11651 case Instruction::Sub:
11652 case Instruction::FSub:
11653 case Instruction::Mul:
11654 case Instruction::FMul:
11655 case Instruction::UDiv:
11656 case Instruction::SDiv:
11657 case Instruction::FDiv:
11658 case Instruction::URem:
11659 case Instruction::SRem:
11660 case Instruction::FRem:
11661 case Instruction::Shl:
11662 case Instruction::LShr:
11663 case Instruction::AShr:
11664 case Instruction::And:
11665 case Instruction::Or:
11666 case Instruction::Xor:
11667 case Instruction::Freeze: {
11668 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11669 ReuseShuffleIndices);
11670 LLVM_DEBUG(
11671 dbgs() << "SLP: added a new TreeEntry "
11672 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11673 TE->dump());
11674
11675 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11676 VLOperands Ops(VL, Operands, S, *this);
11677 Ops.reorder();
11678 Operands[0] = Ops.getVL(0);
11679 Operands[1] = Ops.getVL(1);
11680 }
11681 TE->setOperands(Operands);
11682 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11683 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11684 return;
11685 }
11686 case Instruction::GetElementPtr: {
11687 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11688 ReuseShuffleIndices);
11689 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11690 TE->dump());
11691 TE->setOperands(Operands);
11692
11693 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11694 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11695 return;
11696 }
11697 case Instruction::Store: {
11698 bool Consecutive = CurrentOrder.empty();
11699 if (!Consecutive)
11700 fixupOrderingIndices(CurrentOrder);
11701 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11702 ReuseShuffleIndices, CurrentOrder);
11703 if (Consecutive)
11704 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11705 TE->dump());
11706 else
11707 LLVM_DEBUG(
11708 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11709 TE->dump());
11710 TE->setOperands(Operands);
11711 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11712 return;
11713 }
11714 case Instruction::Call: {
11715 // Check if the calls are all to the same vectorizable intrinsic or
11716 // library function.
11717 CallInst *CI = cast<CallInst>(VL0);
11718 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11719
11720 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11721 ReuseShuffleIndices);
11722 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11723 TE->dump());
11724 if (isCommutative(VL0)) {
11725 VLOperands Ops(VL, Operands, S, *this);
11726 Ops.reorder();
11727 Operands[0] = Ops.getVL(0);
11728 Operands[1] = Ops.getVL(1);
11729 }
11730 TE->setOperands(Operands);
11731 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11732 // For scalar operands there is no need to create an entry since there is
11733 // nothing to vectorize.
11734 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
11735 continue;
11736 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11737 }
11738 return;
11739 }
11740 case Instruction::ShuffleVector: {
11741 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11742 ReuseShuffleIndices);
11743 if (S.isAltShuffle()) {
11744 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11745 TE->dump());
11746 } else {
11747 assert(SLPReVec && "Only supported by REVEC.");
11748 LLVM_DEBUG(
11749 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11750 TE->dump());
11751 }
11752
11753 // Reorder operands if reordering would enable vectorization.
11754 auto *CI = dyn_cast<CmpInst>(VL0);
11755 if (CI && any_of(VL, [](Value *V) {
11756 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11757 })) {
11758 auto *MainCI = cast<CmpInst>(S.getMainOp());
11759 auto *AltCI = cast<CmpInst>(S.getAltOp());
11760 CmpInst::Predicate MainP = MainCI->getPredicate();
11761 CmpInst::Predicate AltP = AltCI->getPredicate();
11762 assert(MainP != AltP &&
11763 "Expected different main/alternate predicates.");
11764 // Collect operands - commute if it uses the swapped predicate or
11765 // alternate operation.
11766 for (auto [Idx, V] : enumerate(VL)) {
11767 if (isa<PoisonValue>(V))
11768 continue;
11769 auto *Cmp = cast<CmpInst>(V);
11770
11771 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11772 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11773 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11774 } else {
11775 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11776 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11777 }
11778 }
11779 TE->setOperands(Operands);
11780 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11781 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11782 return;
11783 }
11784
11785 if (isa<BinaryOperator>(VL0) || CI) {
11786 VLOperands Ops(VL, Operands, S, *this);
11787 Ops.reorder();
11788 Operands[0] = Ops.getVL(0);
11789 Operands[1] = Ops.getVL(1);
11790 }
11791 TE->setOperands(Operands);
11792 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11793 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11794 return;
11795 }
11796 default:
11797 break;
11798 }
11799 llvm_unreachable("Unexpected vectorization of the instructions.");
11800}
11801
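// Illustrative note: an aggregate such as { [4 x i32] } maps to 4 x i32
// (N == 4), provided the widened type's store size matches the aggregate's
// store size and lies within [MinVecRegSize, MaxVecRegSize].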
11802 unsigned BoUpSLP::canMapToVector(Type *T) const {
11803 unsigned N = 1;
11804 Type *EltTy = T;
11805
11806 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
11807 if (EltTy->isEmptyTy())
11808 return 0;
11809 if (auto *ST = dyn_cast<StructType>(EltTy)) {
11810 // Check that struct is homogeneous.
11811 for (const auto *Ty : ST->elements())
11812 if (Ty != *ST->element_begin())
11813 return 0;
11814 N *= ST->getNumElements();
11815 EltTy = *ST->element_begin();
11816 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
11817 N *= AT->getNumElements();
11818 EltTy = AT->getElementType();
11819 } else {
11820 auto *VT = cast<FixedVectorType>(EltTy);
11821 N *= VT->getNumElements();
11822 EltTy = VT->getElementType();
11823 }
11824 }
11825
11826 if (!isValidElementType(EltTy))
11827 return 0;
11828 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
11829 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
11830 VTSize != DL->getTypeStoreSizeInBits(T))
11831 return 0;
11832 return N;
11833}
11834
11835bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
11836 SmallVectorImpl<unsigned> &CurrentOrder,
11837 bool ResizeAllowed) const {
11838 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
11839 assert(It != VL.end() && "Expected at least one extract instruction.");
11840 auto *E0 = cast<Instruction>(*It);
11841 assert(
11842 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
11843 "Invalid opcode");
11844 // Check if all of the extracts come from the same vector and from the
11845 // correct offset.
11846 Value *Vec = E0->getOperand(0);
11847
11848 CurrentOrder.clear();
11849
11850 // We have to extract from a vector/aggregate with the same number of elements.
11851 unsigned NElts;
11852 if (E0->getOpcode() == Instruction::ExtractValue) {
11853 NElts = canMapToVector(Vec->getType());
11854 if (!NElts)
11855 return false;
11856 // Check if load can be rewritten as load of vector.
11857 LoadInst *LI = dyn_cast<LoadInst>(Vec);
11858 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
11859 return false;
11860 } else {
11861 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
11862 }
11863
11864 unsigned E = VL.size();
11865 if (!ResizeAllowed && NElts != E)
11866 return false;
11867 SmallVector<int> Indices(E, PoisonMaskElem);
11868 unsigned MinIdx = NElts, MaxIdx = 0;
11869 for (auto [I, V] : enumerate(VL)) {
11870 auto *Inst = dyn_cast<Instruction>(V);
11871 if (!Inst)
11872 continue;
11873 if (Inst->getOperand(0) != Vec)
11874 return false;
11875 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
11876 if (isa<UndefValue>(EE->getIndexOperand()))
11877 continue;
11878 std::optional<unsigned> Idx = getExtractIndex(Inst);
11879 if (!Idx)
11880 return false;
11881 const unsigned ExtIdx = *Idx;
11882 if (ExtIdx >= NElts)
11883 continue;
11884 Indices[I] = ExtIdx;
11885 if (MinIdx > ExtIdx)
11886 MinIdx = ExtIdx;
11887 if (MaxIdx < ExtIdx)
11888 MaxIdx = ExtIdx;
11889 }
11890 if (MaxIdx - MinIdx + 1 > E)
11891 return false;
11892 if (MaxIdx + 1 <= E)
11893 MinIdx = 0;
11894
11895 // Check that all of the indices extract from the correct offset.
11896 bool ShouldKeepOrder = true;
11897 // Assign to all items the initial value E + 1 so we can check if the extract
11898 // instruction index was used already.
11899 // Also, later we can check that all the indices are used and we have a
11900 // consecutive access in the extract instructions, by checking that no
11901 // element of CurrentOrder still has value E + 1.
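// For example, extract indices {1, 0, 3, 2} over a 4-element source fill
// CurrentOrder with {1, 0, 3, 2}, while the identity indices {0, 1, 2, 3}
// keep ShouldKeepOrder true and clear CurrentOrder below.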
11902 CurrentOrder.assign(E, E);
11903 for (unsigned I = 0; I < E; ++I) {
11904 if (Indices[I] == PoisonMaskElem)
11905 continue;
11906 const unsigned ExtIdx = Indices[I] - MinIdx;
11907 if (CurrentOrder[ExtIdx] != E) {
11908 CurrentOrder.clear();
11909 return false;
11910 }
11911 ShouldKeepOrder &= ExtIdx == I;
11912 CurrentOrder[ExtIdx] = I;
11913 }
11914 if (ShouldKeepOrder)
11915 CurrentOrder.clear();
11916
11917 return ShouldKeepOrder;
11918}
11919
11920bool BoUpSLP::areAllUsersVectorized(
11921 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
11922 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
11923 all_of(I->users(), [this](User *U) {
11924 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
11925 (isa<ExtractElementInst>(U) && MustGather.contains(U));
11926 });
11927}
11928
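// For example, for the alternate bundle {a + b, c - d, e + f, g - h} with
// IsAltOp selecting the Sub instructions (and empty ReorderIndices and
// ReuseShuffleIndices), the resulting mask is <0, Sz + 1, 2, Sz + 3> ==
// <0, 5, 2, 7> for Sz == 4; OpScalars collects the Add instructions and
// AltScalars collects the Sub instructions.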
11929void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
11930 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
11931 SmallVectorImpl<Value *> *OpScalars,
11932 SmallVectorImpl<Value *> *AltScalars) const {
11933 unsigned Sz = Scalars.size();
11934 Mask.assign(Sz, PoisonMaskElem);
11935 SmallVector<int> OrderMask;
11936 if (!ReorderIndices.empty())
11937 inversePermutation(ReorderIndices, OrderMask);
11938 for (unsigned I = 0; I < Sz; ++I) {
11939 unsigned Idx = I;
11940 if (!ReorderIndices.empty())
11941 Idx = OrderMask[I];
11942 if (isa<PoisonValue>(Scalars[Idx]))
11943 continue;
11944 auto *OpInst = cast<Instruction>(Scalars[Idx]);
11945 if (IsAltOp(OpInst)) {
11946 Mask[I] = Sz + Idx;
11947 if (AltScalars)
11948 AltScalars->push_back(OpInst);
11949 } else {
11950 Mask[I] = Idx;
11951 if (OpScalars)
11952 OpScalars->push_back(OpInst);
11953 }
11954 }
11955 if (!ReuseShuffleIndices.empty()) {
11956 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
11957 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
11958 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
11959 });
11960 Mask.swap(NewMask);
11961 }
11962}
11963
11965 Instruction *AltOp,
11966 const TargetLibraryInfo &TLI) {
11967 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
11968}
11969
11970 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
11971 Instruction *AltOp,
11972 const TargetLibraryInfo &TLI) {
11973 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
11974 auto *AltCI = cast<CmpInst>(AltOp);
11975 CmpInst::Predicate MainP = MainCI->getPredicate();
11976 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
11977 assert(MainP != AltP && "Expected different main/alternate predicates.");
11978 auto *CI = cast<CmpInst>(I);
11979 if (isCmpSameOrSwapped(MainCI, CI, TLI))
11980 return false;
11981 if (isCmpSameOrSwapped(AltCI, CI, TLI))
11982 return true;
11983 CmpInst::Predicate P = CI->getPredicate();
11984 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
11985
11986 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
11987 "CmpInst expected to match either main or alternate predicate or "
11988 "their swap.");
11989 return MainP != P && MainP != SwappedP;
11990 }
11991 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
11992}
11993
11994TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
11995 assert(!Ops.empty());
11996 const auto *Op0 = Ops.front();
11997
11998 const bool IsConstant = all_of(Ops, [](Value *V) {
11999 // TODO: We should allow undef elements here
12000 return isConstant(V) && !isa<UndefValue>(V);
12001 });
12002 const bool IsUniform = all_of(Ops, [=](Value *V) {
12003 // TODO: We should allow undef elements here
12004 return V == Op0;
12005 });
12006 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12007 // TODO: We should allow undef elements here
12008 if (auto *CI = dyn_cast<ConstantInt>(V))
12009 return CI->getValue().isPowerOf2();
12010 return false;
12011 });
12012 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12013 // TODO: We should allow undef elements here
12014 if (auto *CI = dyn_cast<ConstantInt>(V))
12015 return CI->getValue().isNegatedPowerOf2();
12016 return false;
12017 });
12018
12019 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12020 if (IsConstant && IsUniform)
12021 VK = TTI::OK_UniformConstantValue;
12022 else if (IsConstant)
12023 VK = TTI::OK_NonUniformConstantValue;
12024 else if (IsUniform)
12025 VK = TTI::OK_UniformValue;
12026
12027 TTI::OperandValueProperties VP = TTI::OP_None;
12028 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12029 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12030
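// For example, Ops == {i32 4, i32 4, i32 4, i32 4} yields
// {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}, while a list of distinct
// non-constant values yields {TTI::OK_AnyValue, TTI::OP_None}.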
12031 return {VK, VP};
12032}
12033
12034namespace {
12035/// The base class for shuffle instruction emission and shuffle cost estimation.
12036class BaseShuffleAnalysis {
12037protected:
12038 Type *ScalarTy = nullptr;
12039
12040 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12041
12042 /// V is expected to be a vectorized value.
12043 /// When REVEC is disabled, there is no difference between VF and
12044 /// VNumElements.
12045 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12046 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12047 /// of 8.
12048 unsigned getVF(Value *V) const {
12049 assert(V && "V cannot be nullptr");
12050 assert(isa<FixedVectorType>(V->getType()) &&
12051 "V does not have FixedVectorType");
12052 assert(ScalarTy && "ScalarTy cannot be nullptr");
12053 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12054 unsigned VNumElements =
12055 cast<FixedVectorType>(V->getType())->getNumElements();
12056 assert(VNumElements > ScalarTyNumElements &&
12057 "the number of elements of V is not large enough");
12058 assert(VNumElements % ScalarTyNumElements == 0 &&
12059 "the number of elements of V is not a vectorized value");
12060 return VNumElements / ScalarTyNumElements;
12061 }
12062
12063 /// Checks if the mask is an identity mask.
12064 /// \param IsStrict if is true the function returns false if mask size does
12065 /// not match vector size.
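/// For example, for a <4 x ty> source the mask <0, 1, 2, 3> is an identity
/// mask; with \p IsStrict == false the resizing masks <0, 1> (an extract of
/// the low subvector) and <0, 1, 2, 3, poison, poison, poison, poison>
/// (identity submasks) are also accepted.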
12066 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12067 bool IsStrict) {
12068 int Limit = Mask.size();
12069 int VF = VecTy->getNumElements();
12070 int Index = -1;
12071 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12072 return true;
12073 if (!IsStrict) {
12074 // Consider extract subvector starting from index 0.
12075 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12076 Index == 0)
12077 return true;
12078 // All VF-size submasks are identity (e.g.
12079 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12080 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12081 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12082 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12083 ShuffleVectorInst::isIdentityMask(Slice, VF);
12084 }))
12085 return true;
12086 }
12087 return false;
12088 }
12089
12090 /// Tries to combine 2 different masks into a single one.
12091 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12092 /// change the size of the vector, \p LocalVF is the original size of the
12093 /// shuffled vector.
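/// For example, with \p LocalVF == 4, \p Mask == <1, 0, 3, 2> and
/// \p ExtMask == <2, 3, 0, 1>, the combined mask is <3, 2, 1, 0>: element I of
/// the result is Mask[ExtMask[I]] (taken modulo \p LocalVF).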
12094 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12095 ArrayRef<int> ExtMask) {
12096 unsigned VF = Mask.size();
12097 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12098 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12099 if (ExtMask[I] == PoisonMaskElem)
12100 continue;
12101 int MaskedIdx = Mask[ExtMask[I] % VF];
12102 NewMask[I] =
12103 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12104 }
12105 Mask.swap(NewMask);
12106 }
12107
12108 /// Looks through shuffles trying to reduce final number of shuffles in the
12109 /// code. The function looks through the previously emitted shuffle
12110 /// instructions and properly marks indices in the mask as undef.
12111 /// For example, given the code
12112 /// \code
12113 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12114 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12115 /// \endcode
12116 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12117 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12118 /// <0, 1, 2, 3> for the shuffle.
12119 /// If the 2 operands are of different sizes, the smaller one is resized and
12120 /// the mask is recalculated properly.
12121 /// For example, given the code
12122 /// \code
12123 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12124 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12125 /// \endcode
12126 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12127 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12128 /// <0, 1, 2, 3> for the shuffle.
12129 /// So, it tries to transform permutations to simple vector merge, if
12130 /// possible.
12131 /// \param V The input vector which must be shuffled using the given \p Mask.
12132 /// If the better candidate is found, \p V is set to this best candidate
12133 /// vector.
12134 /// \param Mask The input mask for the shuffle. If the best candidate is found
12135 /// during looking-through-shuffles attempt, it is updated accordingly.
12136 /// \param SinglePermute true if the shuffle operation is originally a
12137 /// single-value-permutation. In this case the look-through-shuffles procedure
12138 /// may look for resizing shuffles as the best candidates.
12139 /// \return true if the shuffle results in the non-resizing identity shuffle
12140 /// (and thus can be ignored), false - otherwise.
12141 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12142 bool SinglePermute) {
12143 Value *Op = V;
12144 ShuffleVectorInst *IdentityOp = nullptr;
12145 SmallVector<int> IdentityMask;
12146 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12147 // Exit if not a fixed vector type or changing size shuffle.
12148 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12149 if (!SVTy)
12150 break;
12151 // Remember the identity or broadcast mask, if it is not a resizing
12152 // shuffle. If no better candidates are found, this Op and Mask will be
12153 // used in the final shuffle.
12154 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12155 if (!IdentityOp || !SinglePermute ||
12156 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12157 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12158 IdentityMask.size()))) {
12159 IdentityOp = SV;
12160 // Store the current mask in IdentityMask so we do not lose this info
12161 // later if IdentityOp is selected as the best candidate for the
12162 // permutation.
12163 IdentityMask.assign(Mask);
12164 }
12165 }
12166 // Remember the broadcast mask. If no better candidates are found, this Op
12167 // and Mask will be used in the final shuffle.
12168 // Zero splat can be used as identity too, since it might be used with
12169 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12170 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12171 // expensive, and the analysis finds out that the source vector is just a
12172 // broadcast, this original mask can be transformed to the identity mask
12173 // <0, 1, 2, 3>.
12174 // \code
12175 // %0 = shuffle %v, poison, zeroinitializer
12176 // %res = shuffle %0, poison, <3, 1, 2, 0>
12177 // \endcode
12178 // may be transformed to
12179 // \code
12180 // %0 = shuffle %v, poison, zeroinitializer
12181 // %res = shuffle %0, poison, <0, 1, 2, 3>
12182 // \endcode
12183 if (SV->isZeroEltSplat()) {
12184 IdentityOp = SV;
12185 IdentityMask.assign(Mask);
12186 }
12187 int LocalVF = Mask.size();
12188 if (auto *SVOpTy =
12189 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12190 LocalVF = SVOpTy->getNumElements();
12191 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12192 for (auto [Idx, I] : enumerate(Mask)) {
12193 if (I == PoisonMaskElem ||
12194 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12195 continue;
12196 ExtMask[Idx] = SV->getMaskValue(I);
12197 }
12198 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12199 SV->getOperand(0),
12200 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12201 .all();
12202 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12203 SV->getOperand(1),
12204 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12205 .all();
12206 if (!IsOp1Undef && !IsOp2Undef) {
12207 // Update mask and mark undef elems.
12208 for (int &I : Mask) {
12209 if (I == PoisonMaskElem)
12210 continue;
12211 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12212 PoisonMaskElem)
12213 I = PoisonMaskElem;
12214 }
12215 break;
12216 }
12217 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12218 combineMasks(LocalVF, ShuffleMask, Mask);
12219 Mask.swap(ShuffleMask);
12220 if (IsOp2Undef)
12221 Op = SV->getOperand(0);
12222 else
12223 Op = SV->getOperand(1);
12224 }
12225 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12226 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12227 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12228 if (IdentityOp) {
12229 V = IdentityOp;
12230 assert(Mask.size() == IdentityMask.size() &&
12231 "Expected masks of same sizes.");
12232 // Clear known poison elements.
12233 for (auto [I, Idx] : enumerate(Mask))
12234 if (Idx == PoisonMaskElem)
12235 IdentityMask[I] = PoisonMaskElem;
12236 Mask.swap(IdentityMask);
12237 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12238 return SinglePermute &&
12239 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12240 /*IsStrict=*/true) ||
12241 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12242 Shuffle->isZeroEltSplat() &&
12243 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12244 all_of(enumerate(Mask), [&](const auto &P) {
12245 return P.value() == PoisonMaskElem ||
12246 Shuffle->getShuffleMask()[P.index()] == 0;
12247 })));
12248 }
12249 V = Op;
12250 return false;
12251 }
12252 V = Op;
12253 return true;
12254 }
12255
12256 /// Smart shuffle instruction emission, walks through shuffles trees and
12257 /// tries to find the best matching vector for the actual shuffle
12258 /// instruction.
12259 template <typename T, typename ShuffleBuilderTy>
12260 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12261 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12262 assert(V1 && "Expected at least one vector value.");
12263 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12264 SmallVector<int> NewMask(Mask);
12265 if (ScalarTyNumElements != 1) {
12266 assert(SLPReVec && "FixedVectorType is not expected.");
12267 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12268 Mask = NewMask;
12269 }
12270 if (V2)
12271 Builder.resizeToMatch(V1, V2);
12272 int VF = Mask.size();
12273 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12274 VF = FTy->getNumElements();
12275 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
12276 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12277 .all()) {
12278 // Peek through shuffles.
12279 Value *Op1 = V1;
12280 Value *Op2 = V2;
12281 int VF =
12282 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12283 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12284 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12285 for (int I = 0, E = Mask.size(); I < E; ++I) {
12286 if (Mask[I] < VF)
12287 CombinedMask1[I] = Mask[I];
12288 else
12289 CombinedMask2[I] = Mask[I] - VF;
12290 }
12291 Value *PrevOp1;
12292 Value *PrevOp2;
12293 do {
12294 PrevOp1 = Op1;
12295 PrevOp2 = Op2;
12296 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12297 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12298 // Check if we have 2 resizing shuffles - need to peek through operands
12299 // again.
12300 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12301 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12302 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12303 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12304 if (I == PoisonMaskElem)
12305 continue;
12306 ExtMask1[Idx] = SV1->getMaskValue(I);
12307 }
12308 SmallBitVector UseMask1 = buildUseMask(
12309 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12310 ->getNumElements(),
12311 ExtMask1, UseMask::SecondArg);
12312 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12313 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12314 if (I == PoisonMaskElem)
12315 continue;
12316 ExtMask2[Idx] = SV2->getMaskValue(I);
12317 }
12318 SmallBitVector UseMask2 = buildUseMask(
12319 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12320 ->getNumElements(),
12321 ExtMask2, UseMask::SecondArg);
12322 if (SV1->getOperand(0)->getType() ==
12323 SV2->getOperand(0)->getType() &&
12324 SV1->getOperand(0)->getType() != SV1->getType() &&
12325 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12326 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12327 Op1 = SV1->getOperand(0);
12328 Op2 = SV2->getOperand(0);
12329 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12330 int LocalVF = ShuffleMask1.size();
12331 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12332 LocalVF = FTy->getNumElements();
12333 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12334 CombinedMask1.swap(ShuffleMask1);
12335 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12336 LocalVF = ShuffleMask2.size();
12337 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12338 LocalVF = FTy->getNumElements();
12339 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12340 CombinedMask2.swap(ShuffleMask2);
12341 }
12342 }
12343 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12344 Builder.resizeToMatch(Op1, Op2);
12345 VF = std::max(cast<VectorType>(Op1->getType())
12346 ->getElementCount()
12347 .getKnownMinValue(),
12348 cast<VectorType>(Op2->getType())
12349 ->getElementCount()
12350 .getKnownMinValue());
12351 for (int I = 0, E = Mask.size(); I < E; ++I) {
12352 if (CombinedMask2[I] != PoisonMaskElem) {
12353 assert(CombinedMask1[I] == PoisonMaskElem &&
12354 "Expected undefined mask element");
12355 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12356 }
12357 }
12358 if (Op1 == Op2 &&
12359 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12360 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12361 isa<ShuffleVectorInst>(Op1) &&
12362 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12363 ArrayRef(CombinedMask1))))
12364 return Builder.createIdentity(Op1);
12365 return Builder.createShuffleVector(
12366 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12367 CombinedMask1);
12368 }
12369 if (isa<PoisonValue>(V1))
12370 return Builder.createPoison(
12371 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12372 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12373 assert(V1 && "Expected non-null value after looking through shuffles.");
12374
12375 if (!IsIdentity)
12376 return Builder.createShuffleVector(V1, NewMask);
12377 return Builder.createIdentity(V1);
12378 }
12379
12380 /// Transforms the mask \p CommonMask per the given \p Mask so that it refers
12381 /// to the result of the just-emitted shuffle.
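/// For example, if \p CommonMask and \p Mask are both <2, poison, 0, poison>
/// before the shuffle is emitted, \p CommonMask becomes <0, poison, 2, poison>
/// afterwards: every defined lane now refers to its own position in the
/// just-emitted shuffle result.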
12382 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12383 ArrayRef<int> Mask) {
12384 for (unsigned I : seq<unsigned>(CommonMask.size()))
12385 if (Mask[I] != PoisonMaskElem)
12386 CommonMask[I] = I;
12387 }
12388};
12389} // namespace
12390
12391 /// Calculate the scalar and the vector costs of vectorizing a set of GEPs.
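/// For illustration only (hypothetical IR), given adjacent scalar loads
/// \code
///   %p1 = getelementptr inbounds float, ptr %base, i64 1
///   %l0 = load float, ptr %base
///   %l1 = load float, ptr %p1
/// \endcode
/// vectorizing them into one wide load from %base makes single-use GEPs such
/// as %p1 disappear, which is the saving that the scalar vs. vector
/// pointer-chain costs computed below try to capture.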
12392static std::pair<InstructionCost, InstructionCost>
12393 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12394 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12395 Type *ScalarTy, VectorType *VecTy) {
12396 InstructionCost ScalarCost = 0;
12397 InstructionCost VecCost = 0;
12398 // Here we differentiate two cases: (1) when Ptrs represent a regular
12399 // vectorization tree node (as they are pointer arguments of scattered
12400 // loads) or (2) when Ptrs are the arguments of loads or stores being
12401 // vectorized as a plain wide unit-stride load/store since all the
12402 // loads/stores are known to be from/to adjacent locations.
12403 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12404 // Case 2: estimate costs for pointer related costs when vectorizing to
12405 // a wide load/store.
12406 // Scalar cost is estimated as a set of pointers with known relationship
12407 // between them.
12408 // For vector code we will use BasePtr as argument for the wide load/store
12409 // but we also need to account all the instructions which are going to
12410 // stay in vectorized code due to uses outside of these scalar
12411 // loads/stores.
12412 ScalarCost = TTI.getPointersChainCost(
12413 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12414 CostKind);
12415
12416 SmallVector<const Value *> PtrsRetainedInVecCode;
12417 for (Value *V : Ptrs) {
12418 if (V == BasePtr) {
12419 PtrsRetainedInVecCode.push_back(V);
12420 continue;
12421 }
12422 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12423 // For simplicity, assume Ptr stays in the vectorized code if it is not a
12424 // GEP instruction. We don't care since its cost is considered free.
12425 // TODO: We should check for any uses outside of vectorizable tree
12426 // rather than just single use.
12427 if (!Ptr || !Ptr->hasOneUse())
12428 PtrsRetainedInVecCode.push_back(V);
12429 }
12430
12431 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12432 // If all pointers stay in vectorized code then we don't have
12433 // any savings on that.
12434 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12435 }
12436 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12437 TTI::PointersChainInfo::getKnownStride(),
12438 VecTy, CostKind);
12439 } else {
12440 // Case 1: Ptrs are the arguments of loads that we are going to transform
12441 // into masked gather load intrinsic.
12442 // All the scalar GEPs will be removed as a result of vectorization.
12443 // For any external uses of some lanes extract element instructions will
12444 // be generated (which cost is estimated separately).
12445 TTI::PointersChainInfo PtrsInfo =
12446 all_of(Ptrs,
12447 [](const Value *V) {
12448 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12449 return Ptr && !Ptr->hasAllConstantIndices();
12450 })
12451 ? TTI::PointersChainInfo::getUnknownStride()
12452 : TTI::PointersChainInfo::getKnownStride();
12453
12454 ScalarCost =
12455 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12456 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12457 if (!BaseGEP) {
12458 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12459 if (It != Ptrs.end())
12460 BaseGEP = cast<GEPOperator>(*It);
12461 }
12462 if (BaseGEP) {
12463 SmallVector<const Value *> Indices(BaseGEP->indices());
12464 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12465 BaseGEP->getPointerOperand(), Indices, VecTy,
12466 CostKind);
12467 }
12468 }
12469
12470 return std::make_pair(ScalarCost, VecCost);
12471}
12472
12473void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12474 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12475 "Expected gather node without reordering.");
12476 SmallDenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12477 SmallSet<size_t, 2> LoadKeyUsed;
12478
12479 // Do not reorder the node if it is small (just 2 elements), all-constant, or
12480 // if all instructions already have the same opcode.
12481 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12482 all_of(TE.Scalars, isConstant))
12483 return;
12484
12485 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12486 return VectorizableTree[Idx]->isSame(TE.Scalars);
12487 }))
12488 return;
12489
12490 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12491 Key = hash_combine(hash_value(LI->getParent()), Key);
12492 Value *Ptr =
12493 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12494 if (LoadKeyUsed.contains(Key)) {
12495 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12496 if (LIt != LoadsMap.end()) {
12497 for (LoadInst *RLI : LIt->second) {
12498 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12499 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12500 /*StrictCheck=*/true))
12501 return hash_value(RLI->getPointerOperand());
12502 }
12503 for (LoadInst *RLI : LIt->second) {
12504 if (arePointersCompatible(RLI->getPointerOperand(),
12505 LI->getPointerOperand(), *TLI)) {
12506 hash_code SubKey = hash_value(RLI->getPointerOperand());
12507 return SubKey;
12508 }
12509 }
12510 if (LIt->second.size() > 2) {
12511 hash_code SubKey =
12512 hash_value(LIt->second.back()->getPointerOperand());
12513 return SubKey;
12514 }
12515 }
12516 }
12517 LoadKeyUsed.insert(Key);
12518 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12519 return hash_value(LI->getPointerOperand());
12520 };
12521 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12522 SmallDenseMap<Value *, SmallVector<unsigned>, 16> KeyToIndex;
12523 bool IsOrdered = true;
12524 unsigned NumInstructions = 0;
12525 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12526 // nodes.
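// For example (hypothetical scalars), a gather of {add, load, add, load} may
// be reordered to {add, add, load, load} so that each group of equal-opcode
// scalars can form its own subvector and be vectorized separately.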
12527 for (auto [I, V] : enumerate(TE.Scalars)) {
12528 size_t Key = 1, Idx = 1;
12529 if (auto *Inst = dyn_cast<Instruction>(V);
12530 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
12531 !isDeleted(Inst) && !isVectorized(V)) {
12532 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12533 /*AllowAlternate=*/false);
12534 ++NumInstructions;
12535 }
12536 auto &Container = SortedValues[Key];
12537 if (IsOrdered && !KeyToIndex.contains(V) &&
12538 !(isa<Constant, ExtractElementInst>(V) ||
12539 isVectorLikeInstWithConstOps(V)) &&
12540 ((Container.contains(Idx) &&
12541 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12542 (!Container.empty() && !Container.contains(Idx) &&
12543 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12544 IsOrdered = false;
12545 auto &KTI = KeyToIndex[V];
12546 if (KTI.empty())
12547 Container[Idx].push_back(V);
12548 KTI.push_back(I);
12549 }
12550 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12551 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12552 if (!IsOrdered && NumInstructions > 1) {
12553 unsigned Cnt = 0;
12554 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12555 for (const auto &D : SortedValues) {
12556 for (const auto &P : D.second) {
12557 unsigned Sz = 0;
12558 for (Value *V : P.second) {
12559 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12560 for (auto [K, Idx] : enumerate(Indices)) {
12561 TE.ReorderIndices[Cnt + K] = Idx;
12562 TE.Scalars[Cnt + K] = V;
12563 }
12564 Sz += Indices.size();
12565 Cnt += Indices.size();
12566 }
12567 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12568 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12569 *TTI, TE.Scalars.front()->getType(), Sz);
12570 SubVectors.emplace_back(Cnt - Sz, SubVF);
12571 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12572 DemandedElts.clearBit(I);
12573 } else if (!P.second.empty() && isConstant(P.second.front())) {
12574 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12575 DemandedElts.clearBit(I);
12576 }
12577 }
12578 }
12579 }
12580 // Reuses always require shuffles, so consider it as profitable.
12581 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12582 return;
12583 // Do simple cost estimation.
12584 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12585 InstructionCost Cost = 0;
12586 auto *ScalarTy = TE.Scalars.front()->getType();
12587 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12588 for (auto [Idx, Sz] : SubVectors) {
12589 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12590 Idx, getWidenedType(ScalarTy, Sz));
12591 }
12592 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12593 /*Insert=*/true,
12594 /*Extract=*/false, CostKind);
12595 int Sz = TE.Scalars.size();
12596 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12597 TE.ReorderIndices.end());
12598 for (unsigned I : seq<unsigned>(Sz)) {
12599 Value *V = TE.getOrdered(I);
12600 if (isa<PoisonValue>(V)) {
12601 ReorderMask[I] = PoisonMaskElem;
12602 } else if (isConstant(V) || DemandedElts[I]) {
12603 ReorderMask[I] = I + TE.ReorderIndices.size();
12604 }
12605 }
12606 Cost += ::getShuffleCost(*TTI,
12607 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12608 ? TTI::SK_PermuteTwoSrc
12609 : TTI::SK_PermuteSingleSrc,
12610 VecTy, ReorderMask);
12611 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12612 ReorderMask.assign(Sz, PoisonMaskElem);
12613 for (unsigned I : seq<unsigned>(Sz)) {
12614 Value *V = TE.getOrdered(I);
12615 if (isConstant(V)) {
12616 DemandedElts.clearBit(I);
12617 if (!isa<PoisonValue>(V))
12618 ReorderMask[I] = I;
12619 } else {
12620 ReorderMask[I] = I + Sz;
12621 }
12622 }
12623 InstructionCost BVCost =
12624 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12625 /*Insert=*/true, /*Extract=*/false, CostKind);
12626 if (!DemandedElts.isAllOnes())
12627 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12628 if (Cost >= BVCost) {
12629 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12630 reorderScalars(TE.Scalars, Mask);
12631 TE.ReorderIndices.clear();
12632 }
12633}
12634
12635 /// Check if we can convert an fadd/fsub sequence to FMA.
12636 /// \returns the cost of the FMA if conversion is possible, an invalid cost otherwise.
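/// For example (hypothetical IR), a contractable multiply feeding an add
/// \code
///   %m = fmul contract double %a, %b
///   %r = fadd contract double %m, %c
/// \endcode
/// can be lowered as a single @llvm.fmuladd(%a, %b, %c); this helper compares
/// the cost of that intrinsic against the separate fmul + fadd.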
12637 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12638 const InstructionsState &S,
12639 DominatorTree &DT, const DataLayout &DL,
12640 const TargetTransformInfo &TTI,
12641 const TargetLibraryInfo &TLI) {
12642 assert(all_of(VL,
12643 [](Value *V) {
12644 return V->getType()->getScalarType()->isFloatingPointTy();
12645 }) &&
12646 "Can only convert to FMA for floating point types");
12647 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12648
12649 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12650 FastMathFlags FMF;
12651 FMF.set();
12652 for (Value *V : VL) {
12653 auto *I = dyn_cast<Instruction>(V);
12654 if (!I)
12655 continue;
12656 if (S.isCopyableElement(I))
12657 continue;
12658 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12659 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12660 continue;
12661 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12662 FMF &= FPCI->getFastMathFlags();
12663 }
12664 return FMF.allowContract();
12665 };
12666 if (!CheckForContractable(VL))
12667 return InstructionCost::getInvalid();
12668 // The fmul also should be contractable.
12669 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12670 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12671
12672 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12673 if (!OpS.valid())
12674 return InstructionCost::getInvalid();
12675
12676 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12677 return InstructionCost::getInvalid();
12678 if (!CheckForContractable(Operands.front()))
12679 return InstructionCost::getInvalid();
12680 // Compare the costs.
12681 InstructionCost FMulPlusFAddCost = 0;
12682 InstructionCost FMACost = 0;
12683 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12684 FastMathFlags FMF;
12685 FMF.set();
12686 for (Value *V : VL) {
12687 auto *I = dyn_cast<Instruction>(V);
12688 if (!I)
12689 continue;
12690 if (!S.isCopyableElement(I))
12691 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12692 FMF &= FPCI->getFastMathFlags();
12693 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12694 }
12695 unsigned NumOps = 0;
12696 for (auto [V, Op] : zip(VL, Operands.front())) {
12697 if (S.isCopyableElement(V))
12698 continue;
12699 auto *I = dyn_cast<Instruction>(Op);
12700 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12701 if (auto *OpI = dyn_cast<Instruction>(V))
12702 FMACost += TTI.getInstructionCost(OpI, CostKind);
12703 if (I)
12704 FMACost += TTI.getInstructionCost(I, CostKind);
12705 continue;
12706 }
12707 ++NumOps;
12708 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12709 FMF &= FPCI->getFastMathFlags();
12710 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12711 }
12712 Type *Ty = VL.front()->getType();
12713 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12714 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12715 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12716}
12717
12718 void BoUpSLP::transformNodes() {
12719 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12720 BaseGraphSize = VectorizableTree.size();
12721 // Turn graph transforming mode on and off, when done.
12722 class GraphTransformModeRAAI {
12723 bool &SavedIsGraphTransformMode;
12724
12725 public:
12726 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12727 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12728 IsGraphTransformMode = true;
12729 }
12730 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12731 } TransformContext(IsGraphTransformMode);
12732 // Operands are profitable if they are:
12733 // 1. At least one constant
12734 // or
12735 // 2. Splats
12736 // or
12737 // 3. Results in good vectorization opportunity, i.e. may generate vector
12738 // nodes and reduce cost of the graph.
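// For instance (hypothetical values), operand pairs such as (%x, %x) or
// (%x, 7) are accepted directly by the checks below; other pairs rely on a
// separate profitability estimate.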
12739 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12740 const InstructionsState &S) {
12741 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12742 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12743 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12744 I2->getOperand(Op));
12745 return all_of(
12746 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12747 return all_of(Cand,
12748 [](const std::pair<Value *, Value *> &P) {
12749 return isa<Constant>(P.first) ||
12750 isa<Constant>(P.second) || P.first == P.second;
12751 }) ||
12752 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12753 });
12754 };
12755
12756 // Try to reorder gather nodes for better vectorization opportunities.
12757 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12758 TreeEntry &E = *VectorizableTree[Idx];
12759 if (E.isGather())
12760 reorderGatherNode(E);
12761 }
12762
12763 // Better to use the full gathered loads analysis, if there are only 2
12764 // gathered load nodes, each having fewer than 16 elements.
12765 constexpr unsigned VFLimit = 16;
12766 bool ForceLoadGather =
12767 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12768 return TE->isGather() && TE->hasState() &&
12769 TE->getOpcode() == Instruction::Load &&
12770 TE->getVectorFactor() < VFLimit;
12771 }) == 2;
12772
12773 // Checks if the scalars are used in another node.
12774 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12775 function_ref<bool(Value *)> CheckContainer) {
12776 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12777 if (isa<PoisonValue>(V))
12778 return true;
12779 auto *I = dyn_cast<Instruction>(V);
12780 if (!I)
12781 return false;
12782 return is_contained(TE->Scalars, I) || CheckContainer(I);
12783 });
12784 };
12785 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12786 if (E.hasState()) {
12787 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12788 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12789 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12790 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12791 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12792 return is_contained(TEs, TE);
12793 });
12794 });
12795 }))
12796 return true;
12797
12798 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12799 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12800 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12801 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12802 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12803 return is_contained(TEs, TE);
12804 });
12805 });
12806 }))
12807 return true;
12808 } else {
12809 // Check if the gather node is a full copy of a split node.
12810 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
12811 if (It != E.Scalars.end()) {
12812 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
12813 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12814 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12815 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12816 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12817 return is_contained(TEs, TE);
12818 });
12819 });
12820 }))
12821 return true;
12822 }
12823 }
12824 return false;
12825 };
12826 // The tree may grow here, so iterate over the nodes built before.
12827 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12828 TreeEntry &E = *VectorizableTree[Idx];
12829 if (E.isGather()) {
12830 ArrayRef<Value *> VL = E.Scalars;
12831 const unsigned Sz = getVectorElementSize(VL.front());
12832 unsigned MinVF = getMinVF(2 * Sz);
12833 // Do not try partial vectorization for small nodes (<= 2), nodes with the
12834 // same opcode and same parent block or all constants.
12835 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
12836 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
12837 // We use allSameOpcode instead of isAltShuffle because we don't
12838 // want to use interchangeable instruction here.
12839 !allSameOpcode(VL) || !allSameBlock(VL)) ||
12840 allConstant(VL) || isSplat(VL))
12841 continue;
12842 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
12843 continue;
12844 // Check if the node is a copy of other vector nodes.
12845 if (CheckForSameVectorNodes(E))
12846 continue;
12847 // Try to find vectorizable sequences and transform them into a series of
12848 // insertvector instructions.
12849 unsigned StartIdx = 0;
12850 unsigned End = VL.size();
12851 for (unsigned VF = getFloorFullVectorNumberOfElements(
12852 *TTI, VL.front()->getType(), VL.size() - 1);
12853 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
12854 *TTI, VL.front()->getType(), VF - 1)) {
12855 if (StartIdx + VF > End)
12856 continue;
12857 SmallVector<std::pair<unsigned, unsigned>> Slices;
12858 bool AllStrided = true;
12859 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
12860 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
12861 // If any instruction is vectorized already - do not try again.
12862 // Reuse the existing node, if it fully matches the slice.
12863 if (isVectorized(Slice.front()) &&
12864 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
12865 continue;
12866 // Constant already handled effectively - skip.
12867 if (allConstant(Slice))
12868 continue;
12869 // Do not try to vectorize small splats (less than vector register and
12870 // only with the single non-undef element).
12871 bool IsSplat = isSplat(Slice);
12872 bool IsTwoRegisterSplat = true;
12873 if (IsSplat && VF == 2) {
12874 unsigned NumRegs2VF = ::getNumberOfParts(
12875 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
12876 IsTwoRegisterSplat = NumRegs2VF == 2;
12877 }
12878 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
12879 count(Slice, Slice.front()) ==
12880 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
12881 : 1)) {
12882 if (IsSplat)
12883 continue;
12884 InstructionsState S = getSameOpcode(Slice, *TLI);
12885 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
12886 (S.getOpcode() == Instruction::Load &&
12887 areKnownNonVectorizableLoads(Slice)) ||
12888 (S.getOpcode() != Instruction::Load &&
12889 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
12890 continue;
12891 if (VF == 2) {
12892 // Try to vectorize reduced values or if all users are vectorized.
12893 // For expensive instructions extra extracts might be profitable.
12894 if ((!UserIgnoreList || E.Idx != 0) &&
12895 TTI->getInstructionCost(S.getMainOp(), CostKind) <
12896 TTI::TCC_Expensive &&
12897 !all_of(Slice, [&](Value *V) {
12898 if (isa<PoisonValue>(V))
12899 return true;
12900 return areAllUsersVectorized(cast<Instruction>(V),
12901 UserIgnoreList);
12902 }))
12903 continue;
12904 if (S.getOpcode() == Instruction::Load) {
12905 OrdersType Order;
12906 SmallVector<Value *> PointerOps;
12907 LoadsState Res =
12908 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
12909 AllStrided &= Res == LoadsState::StridedVectorize ||
12910 Res == LoadsState::ScatterVectorize ||
12911 Res == LoadsState::Gather;
12912 // Do not vectorize gathers.
12913 if (Res == LoadsState::ScatterVectorize ||
12914 Res == LoadsState::Gather) {
12915 if (Res == LoadsState::Gather) {
12916 registerNonVectorizableLoads(Slice);
12917 // If reductions and the scalars from the root node are
12918 // analyzed - mark as non-vectorizable reduction.
12919 if (UserIgnoreList && E.Idx == 0)
12920 analyzedReductionVals(Slice);
12921 }
12922 continue;
12923 }
12924 } else if (S.getOpcode() == Instruction::ExtractElement ||
12925 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
12926 TTI::TCC_Expensive &&
12927 !CheckOperandsProfitability(
12928 S.getMainOp(),
12929 cast<Instruction>(*find_if(reverse(Slice),
12930 IsaPred<Instruction>)),
12931 S))) {
12932 // Do not vectorize extractelements (handled effectively
12933 // already). Do not vectorize non-profitable instructions (with
12934 // low cost and non-vectorizable operands).
12935 continue;
12936 }
12937 }
12938 }
12939 Slices.emplace_back(Cnt, Slice.size());
12940 }
12941 // Do not try to vectorize if all slices are strided or gathered with
12942 // vector factor 2 and there are more than 2 slices. Better to handle
12943 // them in gathered loads analysis, may result in better vectorization.
12944 if (VF == 2 && AllStrided && Slices.size() > 2)
12945 continue;
12946 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
12947 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
12948 if (StartIdx == Cnt)
12949 StartIdx = Cnt + Sz;
12950 if (End == Cnt + Sz)
12951 End = Cnt;
12952 };
12953 for (auto [Cnt, Sz] : Slices) {
12954 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
12955 const TreeEntry *SameTE = nullptr;
12956 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
12957 It != Slice.end()) {
12958 // If any instruction is vectorized already - do not try again.
12959 SameTE = getSameValuesTreeEntry(*It, Slice);
12960 }
12961 unsigned PrevSize = VectorizableTree.size();
12962 [[maybe_unused]] unsigned PrevEntriesSize =
12963 LoadEntriesToVectorize.size();
12964 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
12965 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
12966 VectorizableTree[PrevSize]->isGather() &&
12967 VectorizableTree[PrevSize]->hasState() &&
12968 VectorizableTree[PrevSize]->getOpcode() !=
12969 Instruction::ExtractElement &&
12970 !isSplat(Slice)) {
12971 if (UserIgnoreList && E.Idx == 0 && VF == 2)
12972 analyzedReductionVals(Slice);
12973 VectorizableTree.pop_back();
12974 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
12975 "LoadEntriesToVectorize expected to remain the same");
12976 continue;
12977 }
12978 AddCombinedNode(PrevSize, Cnt, Sz);
12979 }
12980 }
12981 // Restore ordering, if no extra vectorization happened.
12982 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
12983 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
12984 reorderScalars(E.Scalars, Mask);
12985 E.ReorderIndices.clear();
12986 }
12987 }
12988 if (!E.hasState())
12989 continue;
12990 switch (E.getOpcode()) {
12991 case Instruction::Load: {
12992 // No need to reorder masked gather loads, just reorder the scalar
12993 // operands.
12994 if (E.State != TreeEntry::Vectorize)
12995 break;
12996 Type *ScalarTy = E.getMainOp()->getType();
12997 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
12998 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
12999 // Check if profitable to represent consecutive load + reverse as strided
13000 // load with stride -1.
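// E.g. scalar loads of a[3], a[2], a[1], a[0] can be emitted as one strided
// load starting at &a[3] with stride -1 instead of a wide load of a[0..3]
// followed by a reverse shuffle, when the target supports strided accesses.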
13001 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13002 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13003 SmallVector<int> Mask;
13004 inversePermutation(E.ReorderIndices, Mask);
13005 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13006 InstructionCost OriginalVecCost =
13007 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13008 BaseLI->getPointerAddressSpace(), CostKind,
13009 TTI::OperandValueInfo()) +
13010 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13011 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13012 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13013 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13014 if (StridedCost < OriginalVecCost)
13015 // Strided load is more profitable than consecutive load + reverse -
13016 // transform the node to strided load.
13017 E.State = TreeEntry::StridedVectorize;
13018 }
13019 break;
13020 }
13021 case Instruction::Store: {
13022 Type *ScalarTy =
13023 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13024 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13025 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13026 // Check if profitable to represent consecutive store + reverse as strided
13027 // store with stride -1.
13028 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13029 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13030 SmallVector<int> Mask;
13031 inversePermutation(E.ReorderIndices, Mask);
13032 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13033 InstructionCost OriginalVecCost =
13034 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13035 BaseSI->getPointerAddressSpace(), CostKind,
13036 TTI::OperandValueInfo()) +
13037 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13038 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13039 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13040 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13041 if (StridedCost < OriginalVecCost)
13042 // Strided store is more profitable than reverse + consecutive store -
13043 // transform the node to strided store.
13044 E.State = TreeEntry::StridedVectorize;
13045 } else if (!E.ReorderIndices.empty()) {
13046 // Check for interleaved stores.
13047 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13048 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13049 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13050 if (Mask.size() < 4)
13051 return 0u;
13052 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13053 if (ShuffleVectorInst::isInterleaveMask(
13054 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13055 TTI->isLegalInterleavedAccessType(
13056 VecTy, Factor, BaseSI->getAlign(),
13057 BaseSI->getPointerAddressSpace()))
13058 return Factor;
13059 }
13060
13061 return 0u;
13062 };
13063 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13064 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13065 if (InterleaveFactor != 0)
13066 E.setInterleave(InterleaveFactor);
13067 }
13068 break;
13069 }
13070 case Instruction::Select: {
13071 if (E.State != TreeEntry::Vectorize)
13072 break;
13073 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13074 if (MinMaxID == Intrinsic::not_intrinsic)
13075 break;
13076 // This node is a minmax node.
13077 E.CombinedOp = TreeEntry::MinMax;
13078 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13079 if (SelectOnly && CondEntry->UserTreeIndex &&
13080 CondEntry->State == TreeEntry::Vectorize) {
13081 // The condition node is part of the combined minmax node.
13082 CondEntry->State = TreeEntry::CombinedVectorize;
13083 }
13084 break;
13085 }
13086 case Instruction::FSub:
13087 case Instruction::FAdd: {
13088 // Check if possible to convert (a*b)+c to fma.
13089 if (E.State != TreeEntry::Vectorize ||
13090 !E.getOperations().isAddSubLikeOp())
13091 break;
13092 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13093 .isValid())
13094 break;
13095 // This node is a fmuladd node.
13096 E.CombinedOp = TreeEntry::FMulAdd;
13097 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13098 if (FMulEntry->UserTreeIndex &&
13099 FMulEntry->State == TreeEntry::Vectorize) {
13100 // The FMul node is part of the combined fmuladd node.
13101 FMulEntry->State = TreeEntry::CombinedVectorize;
13102 }
13103 break;
13104 }
13105 default:
13106 break;
13107 }
13108 }
13109
13110 if (LoadEntriesToVectorize.empty()) {
13111 // Single load node - exit.
13112 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13113 VectorizableTree.front()->getOpcode() == Instruction::Load)
13114 return;
13115 // Small graph with small VF - exit.
13116 constexpr unsigned SmallTree = 3;
13117 constexpr unsigned SmallVF = 2;
13118 if ((VectorizableTree.size() <= SmallTree &&
13119 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13120 (VectorizableTree.size() <= 2 && UserIgnoreList))
13121 return;
13122
13123 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13124 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13125 getCanonicalGraphSize() <= SmallTree &&
13126 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13127 [](const std::unique_ptr<TreeEntry> &TE) {
13128 return TE->isGather() && TE->hasState() &&
13129 TE->getOpcode() == Instruction::Load &&
13130 !allSameBlock(TE->Scalars);
13131 }) == 1)
13132 return;
13133 }
13134
13135 // A list of loads to be gathered during the vectorization process. We can
13136 // try to vectorize them at the end, if profitable.
13137 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13138 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13139 GatheredLoads;
13140
13141 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13142 TreeEntry &E = *TE;
13143 if (E.isGather() &&
13144 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13145 (!E.hasState() && any_of(E.Scalars,
13146 [&](Value *V) {
13147 return isa<LoadInst>(V) &&
13148 !isVectorized(V) &&
13149 !isDeleted(cast<Instruction>(V));
13150 }))) &&
13151 !isSplat(E.Scalars)) {
13152 for (Value *V : E.Scalars) {
13153 auto *LI = dyn_cast<LoadInst>(V);
13154 if (!LI)
13155 continue;
13156 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13157 continue;
13158 gatherPossiblyVectorizableLoads(
13159 *this, V, *DL, *SE, *TTI,
13160 GatheredLoads[std::make_tuple(
13161 LI->getParent(),
13162 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13163 LI->getType())]);
13164 }
13165 }
13166 }
13167 // Try to vectorize gathered loads if this is not just a gather of loads.
13168 if (!GatheredLoads.empty())
13169 tryToVectorizeGatheredLoads(GatheredLoads);
13170}
13171
13172/// Merges shuffle masks and emits final shuffle instruction, if required. It
13173 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
13174 /// where the actual shuffle instruction is generated only if it is actually
13175 /// required. Otherwise, the shuffle instruction emission is delayed till the
13176/// end of the process, to reduce the number of emitted instructions and further
13177/// analysis/transformations.
13178class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13179 bool IsFinalized = false;
13180 SmallVector<int> CommonMask;
13181 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13182 const TargetTransformInfo &TTI;
13183 InstructionCost Cost = 0;
13184 SmallDenseSet<Value *> VectorizedVals;
13185 BoUpSLP &R;
13186 SmallPtrSetImpl<Value *> &CheckedExtracts;
13187 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13188 /// While set, we are still trying to estimate the cost for the same nodes and
13189 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13190 /// This may help to better estimate the cost if the same nodes must be permuted
13191 /// and allows moving most of the long shuffle cost estimation to TTI.
13192 bool SameNodesEstimated = true;
13193
13194 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13195 if (Ty->getScalarType()->isPointerTy()) {
13196 Constant *Res = ConstantExpr::getIntToPtr(
13197 Constant::getAllOnesValue(
13198 IntegerType::get(Ty->getContext(),
13199 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13200 Ty->getScalarType());
13201 if (auto *VTy = dyn_cast<VectorType>(Ty))
13202 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13203 return Res;
13204 }
13205 return Constant::getAllOnesValue(Ty);
13206 }
13207
13208 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13209 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13210 return TTI::TCC_Free;
13211 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13212 InstructionCost GatherCost = 0;
13213 SmallVector<Value *> Gathers(VL);
13214 if (!Root && isSplat(VL)) {
13215 // Found the broadcasting of the single scalar, calculate the cost as
13216 // the broadcast.
13217 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13218 assert(It != VL.end() && "Expected at least one non-undef value.");
13219 // Add broadcast for non-identity shuffle only.
13220 bool NeedShuffle =
13221 count(VL, *It) > 1 &&
13222 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13223 if (!NeedShuffle) {
13224 if (isa<FixedVectorType>(ScalarTy)) {
13225 assert(SLPReVec && "FixedVectorType is not expected.");
13226 return TTI.getShuffleCost(
13227 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13228 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13229 cast<FixedVectorType>(ScalarTy));
13230 }
13231 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13232 CostKind, std::distance(VL.begin(), It),
13233 PoisonValue::get(VecTy), *It);
13234 }
13235
13236 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13237 transform(VL, ShuffleMask.begin(), [](Value *V) {
13238 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13239 });
13240 InstructionCost InsertCost =
13241 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13242 PoisonValue::get(VecTy), *It);
13243 return InsertCost + ::getShuffleCost(TTI,
13244 TTI::SK_Broadcast,
13245 VecTy, ShuffleMask, CostKind,
13246 /*Index=*/0, /*SubTp=*/nullptr,
13247 /*Args=*/*It);
13248 }
13249 return GatherCost +
13250 (all_of(Gathers, IsaPred<UndefValue>)
13251 ? TTI::TCC_Free
13252 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13253 ScalarTy));
13254 };
13255
13256 /// Compute the cost of creating a vector containing the extracted values from
13257 /// \p VL.
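/// For example, if every value in \p VL is an extractelement from the same wide
/// source vector, that source may be reusable directly, so the cost can be a
/// single per-register shuffle (or even free) instead of a full build-vector of
/// the extracted scalars.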
13258 InstructionCost
13259 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13260 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13261 unsigned NumParts) {
13262 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13263 unsigned NumElts =
13264 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13265 auto *EE = dyn_cast<ExtractElementInst>(V);
13266 if (!EE)
13267 return Sz;
13268 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13269 if (!VecTy)
13270 return Sz;
13271 return std::max(Sz, VecTy->getNumElements());
13272 });
13273 // FIXME: this must be moved to TTI for better estimation.
13274 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13275 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13276 SmallVectorImpl<unsigned> &Indices,
13277 SmallVectorImpl<unsigned> &SubVecSizes)
13278 -> std::optional<TTI::ShuffleKind> {
13279 if (NumElts <= EltsPerVector)
13280 return std::nullopt;
13281 int OffsetReg0 =
13282 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13283 [](int S, int I) {
13284 if (I == PoisonMaskElem)
13285 return S;
13286 return std::min(S, I);
13287 }),
13288 EltsPerVector);
13289 int OffsetReg1 = OffsetReg0;
13290 DenseSet<int> RegIndices;
13291 // Check if we are trying to permute the same single or 2 input vectors.
13292 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13293 int FirstRegId = -1;
13294 Indices.assign(1, OffsetReg0);
13295 for (auto [Pos, I] : enumerate(Mask)) {
13296 if (I == PoisonMaskElem)
13297 continue;
13298 int Idx = I - OffsetReg0;
13299 int RegId =
13300 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13301 if (FirstRegId < 0)
13302 FirstRegId = RegId;
13303 RegIndices.insert(RegId);
13304 if (RegIndices.size() > 2)
13305 return std::nullopt;
13306 if (RegIndices.size() == 2) {
13307 ShuffleKind = TTI::SK_PermuteTwoSrc;
13308 if (Indices.size() == 1) {
13309 OffsetReg1 = alignDown(
13310 std::accumulate(
13311 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13312 [&](int S, int I) {
13313 if (I == PoisonMaskElem)
13314 return S;
13315 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13316 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13317 if (RegId == FirstRegId)
13318 return S;
13319 return std::min(S, I);
13320 }),
13321 EltsPerVector);
13322 unsigned Index = OffsetReg1 % NumElts;
13323 Indices.push_back(Index);
13324 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13325 }
13326 Idx = I - OffsetReg1;
13327 }
13328 I = (Idx % NumElts) % EltsPerVector +
13329 (RegId == FirstRegId ? 0 : EltsPerVector);
13330 }
13331 return ShuffleKind;
13332 };
13333 InstructionCost Cost = 0;
13334
13335 // Process extracts in blocks of EltsPerVector to check if the source vector
13336 // operand can be re-used directly. If not, add the cost of creating a
13337 // shuffle to extract the values into a vector register.
13338 for (unsigned Part : seq<unsigned>(NumParts)) {
13339 if (!ShuffleKinds[Part])
13340 continue;
13341 ArrayRef<int> MaskSlice = Mask.slice(
13342 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13343 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13344 copy(MaskSlice, SubMask.begin());
13345 SmallVector<unsigned, 2> Indices;
13346 SmallVector<unsigned, 2> SubVecSizes;
13347 std::optional<TTI::ShuffleKind> RegShuffleKind =
13348 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13349 if (!RegShuffleKind) {
13350 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13351 !ShuffleVectorInst::isIdentityMask(
13352 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13353 Cost +=
13354 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13355 getWidenedType(ScalarTy, NumElts), MaskSlice);
13356 continue;
13357 }
13358 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13359 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13360 Cost +=
13361 ::getShuffleCost(TTI, *RegShuffleKind,
13362 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13363 }
13364 const unsigned BaseVF = getFullVectorNumberOfElements(
13365 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13366 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13367 assert((Idx + SubVecSize) <= BaseVF &&
13368 "SK_ExtractSubvector index out of range");
13369 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13370 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13371 Idx, getWidenedType(ScalarTy, SubVecSize));
13372 }
13373 // Second attempt to check, if just a permute is better estimated than
13374 // subvector extract.
13375 SubMask.assign(NumElts, PoisonMaskElem);
13376 copy(MaskSlice, SubMask.begin());
13377 InstructionCost OriginalCost = ::getShuffleCost(
13378 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13379 if (OriginalCost < Cost)
13380 Cost = OriginalCost;
13381 }
13382 return Cost;
13383 }
13384 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13385 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13386 /// elements.
13387 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13388 ArrayRef<int> Mask, unsigned Part,
13389 unsigned SliceSize) {
13390 if (SameNodesEstimated) {
13391 // Delay the cost estimation if the same nodes are reshuffling.
13392 // If we already requested the cost of reshuffling of E1 and E2 before, no
13393 // need to estimate another cost with the sub-Mask, instead include this
13394 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13395 // estimation.
13396 if ((InVectors.size() == 2 &&
13397 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13398 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13399 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13400 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13401 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13402 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13403 "Expected all poisoned elements.");
13404 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13405 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13406 return;
13407 }
13408 // Found non-matching nodes - need to estimate the cost for the matched
13409 // and transform mask.
13410 Cost += createShuffle(InVectors.front(),
13411 InVectors.size() == 1 ? nullptr : InVectors.back(),
13412 CommonMask);
13413 transformMaskAfterShuffle(CommonMask, CommonMask);
13414 } else if (InVectors.size() == 2) {
13415 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13416 transformMaskAfterShuffle(CommonMask, CommonMask);
13417 }
13418 SameNodesEstimated = false;
13419 if (!E2 && InVectors.size() == 1) {
13420 unsigned VF = E1.getVectorFactor();
13421 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13422 VF = std::max(VF, getVF(V1));
13423 } else {
13424 const auto *E = cast<const TreeEntry *>(InVectors.front());
13425 VF = std::max(VF, E->getVectorFactor());
13426 }
13427 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13428 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13429 CommonMask[Idx] = Mask[Idx] + VF;
13430 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13431 transformMaskAfterShuffle(CommonMask, CommonMask);
13432 } else {
13433 auto P = InVectors.front();
13434 Cost += createShuffle(&E1, E2, Mask);
13435 unsigned VF = Mask.size();
13436 if (Value *V1 = dyn_cast<Value *>(P)) {
13437 VF = std::max(VF,
13438 getNumElements(V1->getType()));
13439 } else {
13440 const auto *E = cast<const TreeEntry *>(P);
13441 VF = std::max(VF, E->getVectorFactor());
13442 }
13443 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13444 if (Mask[Idx] != PoisonMaskElem)
13445 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13446 Cost += createShuffle(P, InVectors.front(), CommonMask);
13447 transformMaskAfterShuffle(CommonMask, CommonMask);
13448 }
13449 }
13450
13451 class ShuffleCostBuilder {
13452 const TargetTransformInfo &TTI;
13453
13454 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13455 int Index = -1;
13456 return Mask.empty() ||
13457 (VF == Mask.size() &&
13458 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13459 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13460 Index == 0);
13461 }
13462
13463 public:
13464 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13465 ~ShuffleCostBuilder() = default;
13466 InstructionCost createShuffleVector(Value *V1, Value *,
13467 ArrayRef<int> Mask) const {
13468 // Empty mask or identity mask are free.
13469 unsigned VF =
13470 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13471 if (isEmptyOrIdentity(Mask, VF))
13472 return TTI::TCC_Free;
13473 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13474 cast<VectorType>(V1->getType()), Mask);
13475 }
13476 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13477 // Empty mask or identity mask are free.
13478 unsigned VF =
13479 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13480 if (isEmptyOrIdentity(Mask, VF))
13481 return TTI::TCC_Free;
13482 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13483 cast<VectorType>(V1->getType()), Mask);
13484 }
13485 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13486 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13487 return TTI::TCC_Free;
13488 }
13489 void resizeToMatch(Value *&, Value *&) const {}
13490 };
13491
13492 /// Smart shuffle instruction emission, walks through shuffles trees and
13493 /// tries to find the best matching vector for the actual shuffle
13494 /// instruction.
13495 InstructionCost
13496 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13497 const PointerUnion<Value *, const TreeEntry *> &P2,
13498 ArrayRef<int> Mask) {
13499 ShuffleCostBuilder Builder(TTI);
13500 SmallVector<int> CommonMask(Mask);
13501 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13502 unsigned CommonVF = Mask.size();
13503 InstructionCost ExtraCost = 0;
13504 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13505 unsigned VF) -> InstructionCost {
13506 if (E.isGather() && allConstant(E.Scalars))
13507 return TTI::TCC_Free;
13508 Type *EScalarTy = E.Scalars.front()->getType();
13509 bool IsSigned = true;
13510 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13511 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13512 IsSigned = It->second.second;
13513 }
13514 if (EScalarTy != ScalarTy) {
13515 unsigned CastOpcode = Instruction::Trunc;
13516 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13517 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13518 if (DstSz > SrcSz)
13519 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13520 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13521 getWidenedType(EScalarTy, VF),
13522 TTI::CastContextHint::None, CostKind);
13523 }
13524 return TTI::TCC_Free;
13525 };
13526 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13527 if (isa<Constant>(V))
13528 return TTI::TCC_Free;
13529 auto *VecTy = cast<VectorType>(V->getType());
13530 Type *EScalarTy = VecTy->getElementType();
13531 if (EScalarTy != ScalarTy) {
13532 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13533 unsigned CastOpcode = Instruction::Trunc;
13534 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13535 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13536 if (DstSz > SrcSz)
13537 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13538 return TTI.getCastInstrCost(
13539 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13540 VecTy, TTI::CastContextHint::None, CostKind);
13541 }
13542 return TTI::TCC_Free;
13543 };
13544 if (!V1 && !V2 && !P2.isNull()) {
13545 // Shuffle 2 entry nodes.
13546 const TreeEntry *E = cast<const TreeEntry *>(P1);
13547 unsigned VF = E->getVectorFactor();
13548 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13549 CommonVF = std::max(VF, E2->getVectorFactor());
13550 assert(all_of(Mask,
13551 [=](int Idx) {
13552 return Idx < 2 * static_cast<int>(CommonVF);
13553 }) &&
13554 "All elements in mask must be less than 2 * CommonVF.");
13555 if (E->Scalars.size() == E2->Scalars.size()) {
13556 SmallVector<int> EMask = E->getCommonMask();
13557 SmallVector<int> E2Mask = E2->getCommonMask();
13558 if (!EMask.empty() || !E2Mask.empty()) {
13559 for (int &Idx : CommonMask) {
13560 if (Idx == PoisonMaskElem)
13561 continue;
13562 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13563 Idx = EMask[Idx];
13564 else if (Idx >= static_cast<int>(CommonVF))
13565 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13566 E->Scalars.size();
13567 }
13568 }
13569 CommonVF = E->Scalars.size();
13570 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13571 GetNodeMinBWAffectedCost(*E2, CommonVF);
13572 } else {
13573 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13574 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13575 }
13576 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13577 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13578 } else if (!V1 && P2.isNull()) {
13579 // Shuffle single entry node.
13580 const TreeEntry *E = cast<const TreeEntry *>(P1);
13581 unsigned VF = E->getVectorFactor();
13582 CommonVF = VF;
13583 assert(
13584 all_of(Mask,
13585 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13586 "All elements in mask must be less than CommonVF.");
13587 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13588 SmallVector<int> EMask = E->getCommonMask();
13589 assert(!EMask.empty() && "Expected non-empty common mask.");
13590 for (int &Idx : CommonMask) {
13591 if (Idx != PoisonMaskElem)
13592 Idx = EMask[Idx];
13593 }
13594 CommonVF = E->Scalars.size();
13595 } else if (unsigned Factor = E->getInterleaveFactor();
13596 Factor > 0 && E->Scalars.size() != Mask.size() &&
13597 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13598 Factor)) {
13599 // Deinterleaved nodes are free.
13600 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13601 }
13602 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13603 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13604 // Not identity/broadcast? Try to see if the original vector is better.
13605 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13606 CommonVF == CommonMask.size() &&
13607 any_of(enumerate(CommonMask),
13608 [](const auto &&P) {
13609 return P.value() != PoisonMaskElem &&
13610 static_cast<unsigned>(P.value()) != P.index();
13611 }) &&
13612 any_of(CommonMask,
13613 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13614 SmallVector<int> ReorderMask;
13615 inversePermutation(E->ReorderIndices, ReorderMask);
13616 ::addMask(CommonMask, ReorderMask);
13617 }
13618 } else if (V1 && P2.isNull()) {
13619 // Shuffle single vector.
13620 ExtraCost += GetValueMinBWAffectedCost(V1);
13621 CommonVF = getVF(V1);
13622 assert(
13623 all_of(Mask,
13624 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13625 "All elements in mask must be less than CommonVF.");
13626 } else if (V1 && !V2) {
13627 // Shuffle vector and tree node.
13628 unsigned VF = getVF(V1);
13629 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13630 CommonVF = std::max(VF, E2->getVectorFactor());
13631 assert(all_of(Mask,
13632 [=](int Idx) {
13633 return Idx < 2 * static_cast<int>(CommonVF);
13634 }) &&
13635 "All elements in mask must be less than 2 * CommonVF.");
13636 if (E2->Scalars.size() == VF && VF != CommonVF) {
13637 SmallVector<int> E2Mask = E2->getCommonMask();
13638 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13639 for (int &Idx : CommonMask) {
13640 if (Idx == PoisonMaskElem)
13641 continue;
13642 if (Idx >= static_cast<int>(CommonVF))
13643 Idx = E2Mask[Idx - CommonVF] + VF;
13644 }
13645 CommonVF = VF;
13646 }
13647 ExtraCost += GetValueMinBWAffectedCost(V1);
13648 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13649 ExtraCost += GetNodeMinBWAffectedCost(
13650 *E2, std::min(CommonVF, E2->getVectorFactor()));
13651 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13652 } else if (!V1 && V2) {
13653 // Shuffle vector and tree node.
13654 unsigned VF = getVF(V2);
13655 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13656 CommonVF = std::max(VF, E1->getVectorFactor());
13657 assert(all_of(Mask,
13658 [=](int Idx) {
13659 return Idx < 2 * static_cast<int>(CommonVF);
13660 }) &&
13661 "All elements in mask must be less than 2 * CommonVF.");
13662 if (E1->Scalars.size() == VF && VF != CommonVF) {
13663 SmallVector<int> E1Mask = E1->getCommonMask();
13664 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13665 for (int &Idx : CommonMask) {
13666 if (Idx == PoisonMaskElem)
13667 continue;
13668 if (Idx >= static_cast<int>(CommonVF))
13669 Idx = E1Mask[Idx - CommonVF] + VF;
13670 else
13671 Idx = E1Mask[Idx];
13672 }
13673 CommonVF = VF;
13674 }
13675 ExtraCost += GetNodeMinBWAffectedCost(
13676 *E1, std::min(CommonVF, E1->getVectorFactor()));
13677 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13678 ExtraCost += GetValueMinBWAffectedCost(V2);
13679 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13680 } else {
13681 assert(V1 && V2 && "Expected both vectors.");
13682 unsigned VF = getVF(V1);
13683 CommonVF = std::max(VF, getVF(V2));
13684 assert(all_of(Mask,
13685 [=](int Idx) {
13686 return Idx < 2 * static_cast<int>(CommonVF);
13687 }) &&
13688 "All elements in mask must be less than 2 * CommonVF.");
13689 ExtraCost +=
13690 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13691 if (V1->getType() != V2->getType()) {
13692 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13693 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13694 } else {
13695 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13696 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13697 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13698 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13699 }
13700 }
13701 InVectors.front() =
13702 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13703 if (InVectors.size() == 2)
13704 InVectors.pop_back();
13705 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13706 V1, V2, CommonMask, Builder, ScalarTy);
13707 }
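// Worked example for the index adjustment above (hypothetical values): with
// two tree entries of 4 scalars each, CommonVF is 4 and a two-source
// CommonMask such as {0, 5, 2, 7} takes lanes 0 and 2 from the first entry
// (remapped through its own common mask when that is non-empty) and lanes 1
// and 3 from the second entry (remapped and offset by E->Scalars.size()).
// The sources themselves are replaced by placeholder constants of the
// widened type, since only the shuffle and cast costs matter here.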
13708
13709public:
13710 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13711 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13712 SmallPtrSetImpl<Value *> &CheckedExtracts)
13713 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13714 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13715 CheckedExtracts(CheckedExtracts) {}
13716 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13717 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13718 unsigned NumParts, bool &UseVecBaseAsInput) {
13719 UseVecBaseAsInput = false;
13720 if (Mask.empty())
13721 return nullptr;
13722 Value *VecBase = nullptr;
13723 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13724 if (!E->ReorderIndices.empty()) {
13725 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13726 E->ReorderIndices.end());
13727 reorderScalars(VL, ReorderMask);
13728 }
13729 // Check if the extracts can be considered reused if the same
13730 // extractelements were already vectorized in an earlier tree entry.
13731 bool PrevNodeFound = any_of(
13732 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13733 [&](const std::unique_ptr<TreeEntry> &TE) {
13734 return ((TE->hasState() && !TE->isAltShuffle() &&
13735 TE->getOpcode() == Instruction::ExtractElement) ||
13736 TE->isGather()) &&
13737 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13738 return VL.size() > Data.index() &&
13739 (Mask[Data.index()] == PoisonMaskElem ||
13740 isa<UndefValue>(VL[Data.index()]) ||
13741 Data.value() == VL[Data.index()]);
13742 });
13743 });
13744 SmallPtrSet<Value *, 4> UniqueBases;
13745 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13746 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13747 for (unsigned Part : seq<unsigned>(NumParts)) {
13748 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13749 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13750 for (auto [I, V] :
13751 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13752 // Ignore non-extractelement scalars.
13753 if (isa<UndefValue>(V) ||
13754 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13755 continue;
13756 // If all users of the instruction are going to be vectorized and the
13757 // instruction itself is not going to be vectorized, consider the
13758 // instruction dead and remove its cost from the final cost of the
13759 // vectorized tree.
13760 // Also, avoid adjusting the cost for extractelements with multiple uses
13761 // in different graph entries.
13762 auto *EE = cast<ExtractElementInst>(V);
13763 VecBase = EE->getVectorOperand();
13764 UniqueBases.insert(VecBase);
13765 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13766 if (!CheckedExtracts.insert(V).second ||
13767 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13768 any_of(EE->users(),
13769 [&](User *U) {
13770 return isa<GetElementPtrInst>(U) &&
13771 !R.areAllUsersVectorized(cast<Instruction>(U),
13772 &VectorizedVals);
13773 }) ||
13774 (!VEs.empty() && !is_contained(VEs, E)))
13775 continue;
13776 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13777 if (!EEIdx)
13778 continue;
13779 unsigned Idx = *EEIdx;
13780 // Take credit for instruction that will become dead.
13781 if (EE->hasOneUse() || !PrevNodeFound) {
13782 Instruction *Ext = EE->user_back();
13783 if (isa<SExtInst, ZExtInst>(Ext) &&
13784 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13785 // Use getExtractWithExtendCost() to calculate the cost of
13786 // extractelement/ext pair.
13787 Cost -= TTI.getExtractWithExtendCost(
13788 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13789 Idx, CostKind);
13790 // Add back the cost of the s|zext, which is subtracted separately.
13791 Cost += TTI.getCastInstrCost(
13792 Ext->getOpcode(), Ext->getType(), EE->getType(),
13793 TTI::getCastContextHint(Ext), CostKind, Ext);
13794 continue;
13795 }
13796 }
13797 APInt &DemandedElts =
13798 VectorOpsToExtracts
13799 .try_emplace(VecBase,
13800 APInt::getZero(getNumElements(VecBase->getType())))
13801 .first->getSecond();
13802 DemandedElts.setBit(Idx);
13803 }
13804 }
13805 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
13806 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
13807 DemandedElts, /*Insert=*/false,
13808 /*Extract=*/true, CostKind);
13809 // Check that the gather of extractelements can be represented as just a
13810 // shuffle of one or two vectors from which the scalars are extracted.
13811 // We found a group of extractelement instructions that must be gathered
13812 // into a vector and can be represented as a permutation of the elements
13813 // of one or two input vectors.
13814 // Skipped if the same extractelements were already vectorized earlier.
13815 if (!PrevNodeFound)
13816 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
13817 InVectors.assign(1, E);
13818 CommonMask.assign(Mask.begin(), Mask.end());
13819 transformMaskAfterShuffle(CommonMask, CommonMask);
13820 SameNodesEstimated = false;
13821 if (NumParts != 1 && UniqueBases.size() != 1) {
13822 UseVecBaseAsInput = true;
13823 VecBase =
13824 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13825 }
13826 return VecBase;
13827 }
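// Illustrative example (hypothetical IR): if the gathered scalars are
//   %x = extractelement <4 x i32> %v, i32 1
//   %y = extractelement <4 x i32> %v, i32 3
// and both extracts become dead once their users are vectorized, DemandedElts
// for %v gets bits 1 and 3 set and the corresponding extract scalarization
// overhead is subtracted from the cost as credit for the removed
// instructions.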
13828 /// Checks if the specified entry \p E needs to be delayed because of its
13829 /// dependency nodes.
13830 std::optional<InstructionCost>
13831 needToDelay(const TreeEntry *,
13833 // No need to delay the cost estimation during analysis.
13834 return std::nullopt;
13835 }
13836 /// Reset the builder to handle perfect diamond match.
13838 IsFinalized = false;
13839 CommonMask.clear();
13840 InVectors.clear();
13841 Cost = 0;
13842 VectorizedVals.clear();
13843 SameNodesEstimated = true;
13844 }
13845 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
13846 if (&E1 == &E2) {
13847 assert(all_of(Mask,
13848 [&](int Idx) {
13849 return Idx < static_cast<int>(E1.getVectorFactor());
13850 }) &&
13851 "Expected single vector shuffle mask.");
13852 add(E1, Mask);
13853 return;
13854 }
13855 if (InVectors.empty()) {
13856 CommonMask.assign(Mask.begin(), Mask.end());
13857 InVectors.assign({&E1, &E2});
13858 return;
13859 }
13860 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13861 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13862 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13863 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13864 const auto *It =
13865 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13866 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13867 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
13868 }
13869 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
13870 if (InVectors.empty()) {
13871 CommonMask.assign(Mask.begin(), Mask.end());
13872 InVectors.assign(1, &E1);
13873 return;
13874 }
13875 assert(!CommonMask.empty() && "Expected non-empty common mask.");
13876 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
13877 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
13878 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
13879 const auto *It =
13880 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
13881 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
13882 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
13883 if (!SameNodesEstimated && InVectors.size() == 1)
13884 InVectors.emplace_back(&E1);
13885 }
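// Example of the part bookkeeping above (hypothetical numbers): for
// Mask.size() == 8 split into NumParts == 2 registers, SliceSize is 4; if
// the first non-poison element of Mask sits at position 6, Part is
// 6 / 4 == 1, so the permutation cost is estimated only for the second
// slice of the common mask.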
13886 /// Adds 2 input vectors and the mask for their shuffling.
13887 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
13888 // This is reached only when shuffling 2 vectors of extractelements, which
13889 // is already handled in adjustExtracts.
13890 assert(InVectors.size() == 1 &&
13891 all_of(enumerate(CommonMask),
13892 [&](auto P) {
13893 if (P.value() == PoisonMaskElem)
13894 return Mask[P.index()] == PoisonMaskElem;
13895 auto *EI = cast<ExtractElementInst>(
13896 cast<const TreeEntry *>(InVectors.front())
13897 ->getOrdered(P.index()));
13898 return EI->getVectorOperand() == V1 ||
13899 EI->getVectorOperand() == V2;
13900 }) &&
13901 "Expected extractelement vectors.");
13902 }
13903 /// Adds one more input vector and the mask for its shuffling.
13904 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
13905 if (InVectors.empty()) {
13906 assert(CommonMask.empty() && !ForExtracts &&
13907 "Expected empty input mask/vectors.");
13908 CommonMask.assign(Mask.begin(), Mask.end());
13909 InVectors.assign(1, V1);
13910 return;
13911 }
13912 if (ForExtracts) {
13913 // No need to add vectors here; they were already handled in adjustExtracts.
13914 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
13915 !CommonMask.empty() &&
13916 all_of(enumerate(CommonMask),
13917 [&](auto P) {
13918 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
13919 ->getOrdered(P.index());
13920 if (P.value() == PoisonMaskElem)
13921 return P.value() == Mask[P.index()] ||
13922 isa<UndefValue>(Scalar);
13923 if (isa<Constant>(V1))
13924 return true;
13925 auto *EI = cast<ExtractElementInst>(Scalar);
13926 return EI->getVectorOperand() == V1;
13927 }) &&
13928 "Expected only tree entry for extractelement vectors.");
13929 return;
13930 }
13931 assert(!InVectors.empty() && !CommonMask.empty() &&
13932 "Expected only tree entries from extracts/reused buildvectors.");
13933 unsigned VF = getVF(V1);
13934 if (InVectors.size() == 2) {
13935 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13936 transformMaskAfterShuffle(CommonMask, CommonMask);
13937 VF = std::max<unsigned>(VF, CommonMask.size());
13938 } else if (const auto *InTE =
13939 InVectors.front().dyn_cast<const TreeEntry *>()) {
13940 VF = std::max(VF, InTE->getVectorFactor());
13941 } else {
13942 VF = std::max(
13943 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
13944 ->getNumElements());
13945 }
13946 InVectors.push_back(V1);
13947 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13948 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13949 CommonMask[Idx] = Mask[Idx] + VF;
13950 }
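// Example of the mask merge above (hypothetical values): if the vectors
// accumulated so far cover VF == 8 lanes and the incoming Mask selects lane
// 2 of V1 for a position where CommonMask is still poison, that position
// becomes 2 + VF == 10, i.e. it now refers to the newly appended V1 in the
// combined two-source mask.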
13951 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
13952 Value *Root = nullptr) {
13953 Cost += getBuildVectorCost(VL, Root);
13954 if (!Root) {
13955 // FIXME: Need to find a way to avoid use of getNullValue here.
13957 unsigned VF = VL.size();
13958 if (MaskVF != 0)
13959 VF = std::min(VF, MaskVF);
13960 Type *VLScalarTy = VL.front()->getType();
13961 for (Value *V : VL.take_front(VF)) {
13962 Type *ScalarTy = VLScalarTy->getScalarType();
13963 if (isa<PoisonValue>(V)) {
13964 Vals.push_back(PoisonValue::get(ScalarTy));
13965 continue;
13966 }
13967 if (isa<UndefValue>(V)) {
13968 Vals.push_back(UndefValue::get(ScalarTy));
13969 continue;
13970 }
13971 Vals.push_back(Constant::getNullValue(ScalarTy));
13972 }
13973 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
13974 assert(SLPReVec && "FixedVectorType is not expected.");
13975 // When REVEC is enabled, we need to expand vector types into scalar
13976 // types.
13977 Vals = replicateMask(Vals, VecTy->getNumElements());
13978 }
13979 return ConstantVector::get(Vals);
13980 }
13981 return ConstantVector::getSplat(
13982 ElementCount::getFixed(
13983 cast<FixedVectorType>(Root->getType())->getNumElements()),
13984 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
13985 }
13987 /// Finalize emission of the shuffles.
13989 ArrayRef<int> ExtMask,
13990 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
13991 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
13994 Action = {}) {
13995 IsFinalized = true;
13996 if (Action) {
13997 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
13998 if (InVectors.size() == 2)
13999 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14000 else
14001 Cost += createShuffle(Vec, nullptr, CommonMask);
14002 transformMaskAfterShuffle(CommonMask, CommonMask);
14003 assert(VF > 0 &&
14004 "Expected vector length for the final value before action.");
14005 Value *V = cast<Value *>(Vec);
14006 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14007 Cost += createShuffle(V1, V2, Mask);
14008 return V1;
14009 });
14010 InVectors.front() = V;
14011 }
14012 if (!SubVectors.empty()) {
14013 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14014 if (InVectors.size() == 2)
14015 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14016 else
14017 Cost += createShuffle(Vec, nullptr, CommonMask);
14018 transformMaskAfterShuffle(CommonMask, CommonMask);
14019 // Add subvectors permutation cost.
14020 if (!SubVectorsMask.empty()) {
14021 assert(SubVectorsMask.size() <= CommonMask.size() &&
14022 "Expected same size of masks for subvectors and common mask.");
14023 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14024 copy(SubVectorsMask, SVMask.begin());
14025 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14026 if (I2 != PoisonMaskElem) {
14027 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14028 I1 = I2 + CommonMask.size();
14029 }
14030 }
14032 getWidenedType(ScalarTy, CommonMask.size()),
14033 SVMask, CostKind);
14034 }
14035 for (auto [E, Idx] : SubVectors) {
14036 Type *EScalarTy = E->Scalars.front()->getType();
14037 bool IsSigned = true;
14038 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14039 EScalarTy =
14040 IntegerType::get(EScalarTy->getContext(), It->second.first);
14041 IsSigned = It->second.second;
14042 }
14043 if (ScalarTy != EScalarTy) {
14044 unsigned CastOpcode = Instruction::Trunc;
14045 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14046 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14047 if (DstSz > SrcSz)
14048 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14050 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14051 getWidenedType(EScalarTy, E->getVectorFactor()),
14053 }
14056 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14057 getWidenedType(ScalarTy, E->getVectorFactor()));
14058 if (!CommonMask.empty()) {
14059 std::iota(std::next(CommonMask.begin(), Idx),
14060 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14061 Idx);
14062 }
14063 }
14064 }
14065
14066 if (!ExtMask.empty()) {
14067 if (CommonMask.empty()) {
14068 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14069 } else {
14070 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14071 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14072 if (ExtMask[I] == PoisonMaskElem)
14073 continue;
14074 NewMask[I] = CommonMask[ExtMask[I]];
14075 }
14076 CommonMask.swap(NewMask);
14077 }
14078 }
14079 if (CommonMask.empty()) {
14080 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14081 return Cost;
14082 }
14083 return Cost +
14084 createShuffle(InVectors.front(),
14085 InVectors.size() == 2 ? InVectors.back() : nullptr,
14086 CommonMask);
14087 }
14088
14090 assert((IsFinalized || CommonMask.empty()) &&
14091 "Shuffle construction must be finalized.");
14092 }
14093};
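// Note on the extension-mask handling in finalize() above (hypothetical
// values): applying ExtMask on top of an existing CommonMask composes the two
// permutations as NewMask[I] = CommonMask[ExtMask[I]]. For example, with
// CommonMask = {3, 1, 0, 2} and ExtMask = {2, 2, PoisonMaskElem, 0} the
// result is {0, 0, PoisonMaskElem, 3}.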
14094
14095const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14096 unsigned Idx) const {
14097 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14098 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14099 return Op;
14100}
14101
14102TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14103 if (TE.State == TreeEntry::ScatterVectorize ||
14104 TE.State == TreeEntry::StridedVectorize)
14105 return TTI::CastContextHint::GatherScatter;
14106 if (TE.State == TreeEntry::CompressVectorize)
14107 return TTI::CastContextHint::Masked;
14108 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14109 !TE.isAltShuffle()) {
14110 if (TE.ReorderIndices.empty())
14111 return TTI::CastContextHint::Normal;
14112 SmallVector<int> Mask;
14113 inversePermutation(TE.ReorderIndices, Mask);
14114 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14115 return TTI::CastContextHint::Reversed;
14116 }
14117 return TTI::CastContextHint::None;
14118}
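// Illustrative example (hypothetical): a vectorized load whose
// ReorderIndices invert to a reverse mask such as {3, 2, 1, 0} yields
// CastContextHint::Reversed for casts fed by it; gathered or strided loads
// yield GatherScatter, compressed loads yield Masked, and in-order loads
// yield Normal.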
14119
14120InstructionCost
14121BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14122 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14123 ArrayRef<Value *> VL = E->Scalars;
14124
14125 Type *ScalarTy = getValueType(VL[0]);
14126 if (!isValidElementType(ScalarTy))
14127 return InstructionCost::getInvalid();
14129
14130 // If we have computed a smaller type for the expression, update VecTy so
14131 // that the costs will be accurate.
14132 auto It = MinBWs.find(E);
14133 Type *OrigScalarTy = ScalarTy;
14134 if (It != MinBWs.end()) {
14135 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14136 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14137 if (VecTy)
14138 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14139 }
14140 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14141 unsigned EntryVF = E->getVectorFactor();
14142 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14143
14144 if (E->isGather()) {
14145 if (allConstant(VL))
14146 return 0;
14147 if (isa<InsertElementInst>(VL[0]))
14148 return InstructionCost::getInvalid();
14149 if (isa<CmpInst>(VL.front()))
14150 ScalarTy = VL.front()->getType();
14151 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14152 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14153 }
14154 if (E->State == TreeEntry::SplitVectorize) {
14155 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14156 "Expected exactly 2 combined entries.");
14157 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14158 InstructionCost VectorCost = 0;
14159 if (E->ReorderIndices.empty()) {
14160 VectorCost = ::getShuffleCost(
14161 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14162 E->CombinedEntriesWithIndices.back().second,
14164 ScalarTy,
14165 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14166 ->getVectorFactor()));
14167 } else {
14168 unsigned CommonVF =
14169 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14170 ->getVectorFactor(),
14171 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14172 ->getVectorFactor());
14174 getWidenedType(ScalarTy, CommonVF),
14175 E->getSplitMask(), CostKind);
14176 }
14177 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14178 return VectorCost;
14179 }
14180 InstructionCost CommonCost = 0;
14182 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14183 (E->State != TreeEntry::StridedVectorize ||
14184 !isReverseOrder(E->ReorderIndices))) {
14185 SmallVector<int> NewMask;
14186 if (E->getOpcode() == Instruction::Store) {
14187 // For stores the order is actually a mask.
14188 NewMask.resize(E->ReorderIndices.size());
14189 copy(E->ReorderIndices, NewMask.begin());
14190 } else {
14191 inversePermutation(E->ReorderIndices, NewMask);
14192 }
14193 ::addMask(Mask, NewMask);
14194 }
14195 if (!E->ReuseShuffleIndices.empty())
14196 ::addMask(Mask, E->ReuseShuffleIndices);
14197 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14198 CommonCost =
14199 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14200 assert((E->State == TreeEntry::Vectorize ||
14201 E->State == TreeEntry::ScatterVectorize ||
14202 E->State == TreeEntry::StridedVectorize ||
14203 E->State == TreeEntry::CompressVectorize) &&
14204 "Unhandled state");
14205 assert(E->getOpcode() &&
14206 ((allSameType(VL) && allSameBlock(VL)) ||
14207 (E->getOpcode() == Instruction::GetElementPtr &&
14208 E->getMainOp()->getType()->isPointerTy()) ||
14209 E->hasCopyableElements()) &&
14210 "Invalid VL");
14211 Instruction *VL0 = E->getMainOp();
14212 unsigned ShuffleOrOp =
14213 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14214 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14215 ShuffleOrOp = E->CombinedOp;
14216 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14217 const unsigned Sz = UniqueValues.size();
14218 SmallBitVector UsedScalars(Sz, false);
14219 for (unsigned I = 0; I < Sz; ++I) {
14220 if (isa<Instruction>(UniqueValues[I]) &&
14221 !E->isCopyableElement(UniqueValues[I]) &&
14222 getTreeEntries(UniqueValues[I]).front() == E)
14223 continue;
14224 UsedScalars.set(I);
14225 }
14226 auto GetCastContextHint = [&](Value *V) {
14227 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14228 return getCastContextHint(*OpTEs.front());
14229 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14230 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14231 !SrcState.isAltShuffle())
14232 return TTI::CastContextHint::GatherScatter;
14233 return TTI::CastContextHint::None;
14234 };
14235 auto GetCostDiff =
14236 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14238 // Calculate the cost of this instruction.
14239 InstructionCost ScalarCost = 0;
14240 if (isa<CastInst, CallInst>(VL0)) {
14241 // For some of the instructions no need to calculate cost for each
14242 // particular instruction, we can use the cost of the single
14243 // instruction x total number of scalar instructions.
14244 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14245 } else {
14246 for (unsigned I = 0; I < Sz; ++I) {
14247 if (UsedScalars.test(I))
14248 continue;
14249 ScalarCost += ScalarEltCost(I);
14250 }
14251 }
14252
14253 InstructionCost VecCost = VectorCost(CommonCost);
14254 // Check if the current node must be resized, if the parent node is not
14255 // resized.
14256 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14257 E->Idx != 0 &&
14258 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14259 const EdgeInfo &EI = E->UserTreeIndex;
14260 if (!EI.UserTE->hasState() ||
14261 EI.UserTE->getOpcode() != Instruction::Select ||
14262 EI.EdgeIdx != 0) {
14263 auto UserBWIt = MinBWs.find(EI.UserTE);
14264 Type *UserScalarTy =
14265 (EI.UserTE->isGather() ||
14266 EI.UserTE->State == TreeEntry::SplitVectorize)
14267 ? EI.UserTE->Scalars.front()->getType()
14268 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14269 if (UserBWIt != MinBWs.end())
14270 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14271 UserBWIt->second.first);
14272 if (ScalarTy != UserScalarTy) {
14273 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14274 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14275 unsigned VecOpcode;
14276 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14277 if (BWSz > SrcBWSz)
14278 VecOpcode = Instruction::Trunc;
14279 else
14280 VecOpcode =
14281 It->second.second ? Instruction::SExt : Instruction::ZExt;
14282 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14283 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14284 CostKind);
14285 }
14286 }
14287 }
14288 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14289 ScalarCost, "Calculated costs for Tree"));
14290 return VecCost - ScalarCost;
14291 };
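// Worked example for GetCostDiff (hypothetical costs): with 4 used scalars
// costing 1 each (ScalarCost == 4) and a vector replacement costing 2 plus a
// CommonCost of 1 for the final permutation (VecCost == 3), the returned
// difference is 3 - 4 == -1, so vectorizing this node is modeled as
// profitable; a positive difference means the scalar form is cheaper.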
14292 // Calculate cost difference from vectorizing set of GEPs.
14293 // Negative value means vectorizing is profitable.
14294 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14295 assert((E->State == TreeEntry::Vectorize ||
14296 E->State == TreeEntry::StridedVectorize ||
14297 E->State == TreeEntry::CompressVectorize) &&
14298 "Entry state expected to be Vectorize, StridedVectorize or "
14299 "MaskedLoadCompressVectorize here.");
14300 InstructionCost ScalarCost = 0;
14301 InstructionCost VecCost = 0;
14302 std::tie(ScalarCost, VecCost) = getGEPCosts(
14303 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14304 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14305 "Calculated GEPs cost for Tree"));
14306
14307 return VecCost - ScalarCost;
14308 };
14309
14310 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14311 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14312 if (MinMaxID == Intrinsic::not_intrinsic)
14313 return InstructionCost::getInvalid();
14314 Type *CanonicalType = Ty;
14315 if (CanonicalType->isPtrOrPtrVectorTy())
14316 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14317 CanonicalType->getContext(),
14318 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14319
14320 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14321 {CanonicalType, CanonicalType});
14323 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14324 // If the selects are the only uses of the compares, they will be
14325 // dead and we can adjust the cost by removing their cost.
14326 if (VI && SelectOnly) {
14327 assert((!Ty->isVectorTy() || SLPReVec) &&
14328 "Expected only for scalar type.");
14329 auto *CI = cast<CmpInst>(VI->getOperand(0));
14331 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14332 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14333 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14334 }
14335 return IntrinsicCost;
14336 };
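// Illustrative example (hypothetical IR): the pair
//   %c = icmp sgt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
// is recognized as smax(%a, %b); GetMinMaxCost then uses the intrinsic cost
// and, when the compare is only used by the select, removes the compare cost
// since that instruction would become dead.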
14337 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14338 Instruction *VI) {
14339 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14340 return Cost;
14341 };
14342 switch (ShuffleOrOp) {
14343 case Instruction::PHI: {
14344 // Count reused scalars.
14345 InstructionCost ScalarCost = 0;
14347 for (Value *V : UniqueValues) {
14348 auto *PHI = dyn_cast<PHINode>(V);
14349 if (!PHI)
14350 continue;
14351
14352 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14353 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14354 Value *Op = PHI->getIncomingValue(I);
14355 Operands[I] = Op;
14356 }
14357 if (const TreeEntry *OpTE =
14358 getSameValuesTreeEntry(Operands.front(), Operands))
14359 if (CountedOps.insert(OpTE).second &&
14360 !OpTE->ReuseShuffleIndices.empty())
14361 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14362 OpTE->Scalars.size());
14363 }
14364
14365 return CommonCost - ScalarCost;
14366 }
14367 case Instruction::ExtractValue:
14368 case Instruction::ExtractElement: {
14369 APInt DemandedElts;
14370 VectorType *SrcVecTy = nullptr;
14371 auto GetScalarCost = [&](unsigned Idx) {
14372 if (isa<PoisonValue>(UniqueValues[Idx]))
14374
14375 auto *I = cast<Instruction>(UniqueValues[Idx]);
14376 if (!SrcVecTy) {
14377 if (ShuffleOrOp == Instruction::ExtractElement) {
14378 auto *EE = cast<ExtractElementInst>(I);
14379 SrcVecTy = EE->getVectorOperandType();
14380 } else {
14381 auto *EV = cast<ExtractValueInst>(I);
14382 Type *AggregateTy = EV->getAggregateOperand()->getType();
14383 unsigned NumElts;
14384 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14385 NumElts = ATy->getNumElements();
14386 else
14387 NumElts = AggregateTy->getStructNumElements();
14388 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14389 }
14390 }
14391 if (I->hasOneUse()) {
14392 Instruction *Ext = I->user_back();
14393 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14394 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14395 // Use getExtractWithExtendCost() to calculate the cost of
14396 // extractelement/ext pair.
14397 InstructionCost Cost = TTI->getExtractWithExtendCost(
14398 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14399 CostKind);
14400 // Subtract the cost of the s|zext, which is subtracted separately.
14401 Cost -= TTI->getCastInstrCost(
14402 Ext->getOpcode(), Ext->getType(), I->getType(),
14403 TTI::getCastContextHint(Ext), CostKind, Ext);
14404 return Cost;
14405 }
14406 }
14407 if (DemandedElts.isZero())
14408 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14409 DemandedElts.setBit(*getExtractIndex(I));
14410 return TTI::TCC_Free;
14411 };
14412 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14413 return CommonCost - (DemandedElts.isZero()
14416 SrcVecTy, DemandedElts, /*Insert=*/false,
14417 /*Extract=*/true, CostKind));
14418 };
14419 return GetCostDiff(GetScalarCost, GetVectorCost);
14420 }
14421 case Instruction::InsertElement: {
14422 assert(E->ReuseShuffleIndices.empty() &&
14423 "Unique insertelements only are expected.");
14424 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14425 unsigned const NumElts = SrcVecTy->getNumElements();
14426 unsigned const NumScalars = VL.size();
14427
14428 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14429
14430 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14431 unsigned OffsetBeg = *getElementIndex(VL.front());
14432 unsigned OffsetEnd = OffsetBeg;
14433 InsertMask[OffsetBeg] = 0;
14434 for (auto [I, V] : enumerate(VL.drop_front())) {
14435 unsigned Idx = *getElementIndex(V);
14436 if (OffsetBeg > Idx)
14437 OffsetBeg = Idx;
14438 else if (OffsetEnd < Idx)
14439 OffsetEnd = Idx;
14440 InsertMask[Idx] = I + 1;
14441 }
14442 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14443 if (NumOfParts > 0 && NumOfParts < NumElts)
14444 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14445 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14446 VecScalarsSz;
14447 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14448 unsigned InsertVecSz = std::min<unsigned>(
14449 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14450 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14451 bool IsWholeSubvector =
14452 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14453 // Check if we can safely insert a subvector. If it is not possible, just
14454 // generate a whole-sized vector and shuffle the source vector and the new
14455 // subvector.
14456 if (OffsetBeg + InsertVecSz > VecSz) {
14457 // Align OffsetBeg to generate correct mask.
14458 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14459 InsertVecSz = VecSz;
14460 }
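// Worked example (hypothetical sizes): inserting 4 scalars at indices 5..8
// of a 16-element destination with VecScalarsSz == 8 gives OffsetBeg == 5,
// OffsetEnd == 8, Offset == 0, VecSz == 16 and InsertVecSz == 4; since
// OffsetBeg + InsertVecSz <= VecSz, no realignment of OffsetBeg is needed
// and the inserted scalars are modeled as a 4-element subvector.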
14461
14462 APInt DemandedElts = APInt::getZero(NumElts);
14463 // TODO: Add support for Instruction::InsertValue.
14465 if (!E->ReorderIndices.empty()) {
14466 inversePermutation(E->ReorderIndices, Mask);
14467 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14468 } else {
14469 Mask.assign(VecSz, PoisonMaskElem);
14470 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14471 }
14472 bool IsIdentity = true;
14473 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14474 Mask.swap(PrevMask);
14475 for (unsigned I = 0; I < NumScalars; ++I) {
14476 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14477 DemandedElts.setBit(InsertIdx);
14478 IsIdentity &= InsertIdx - OffsetBeg == I;
14479 Mask[InsertIdx - OffsetBeg] = I;
14480 }
14481 assert(Offset < NumElts && "Failed to find vector index offset");
14482
14484 Cost -=
14485 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14486 /*Insert*/ true, /*Extract*/ false, CostKind);
14487
14488 // First cost - resize to actual vector size if not identity shuffle or
14489 // need to shift the vector.
14490 // Do not calculate the cost if the actual size is the register size and
14491 // we can merge this shuffle with the following SK_Select.
14492 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14493 if (!IsIdentity)
14495 InsertVecTy, Mask);
14496 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14497 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14498 }));
14499 // Second cost - permutation with subvector, if some elements are from the
14500 // initial vector or inserting a subvector.
14501 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14502 // subvector of ActualVecTy.
14503 SmallBitVector InMask =
14504 isUndefVector(FirstInsert->getOperand(0),
14505 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14506 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14507 if (InsertVecSz != VecSz) {
14508 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14509 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14510 CostKind, OffsetBeg - Offset, InsertVecTy);
14511 } else {
14512 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14513 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14514 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14515 I <= End; ++I)
14516 if (Mask[I] != PoisonMaskElem)
14517 Mask[I] = I + VecSz;
14518 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14519 Mask[I] =
14520 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14521 Cost +=
14522 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14523 }
14524 }
14525 return Cost;
14526 }
14527 case Instruction::ZExt:
14528 case Instruction::SExt:
14529 case Instruction::FPToUI:
14530 case Instruction::FPToSI:
14531 case Instruction::FPExt:
14532 case Instruction::PtrToInt:
14533 case Instruction::IntToPtr:
14534 case Instruction::SIToFP:
14535 case Instruction::UIToFP:
14536 case Instruction::Trunc:
14537 case Instruction::FPTrunc:
14538 case Instruction::BitCast: {
14539 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14540 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14541 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14542 unsigned Opcode = ShuffleOrOp;
14543 unsigned VecOpcode = Opcode;
14544 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14545 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14546 // Check if the values are candidates to demote.
14547 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14548 if (SrcIt != MinBWs.end()) {
14549 SrcBWSz = SrcIt->second.first;
14550 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14551 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14552 SrcVecTy =
14553 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14554 }
14555 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14556 if (BWSz == SrcBWSz) {
14557 VecOpcode = Instruction::BitCast;
14558 } else if (BWSz < SrcBWSz) {
14559 VecOpcode = Instruction::Trunc;
14560 } else if (It != MinBWs.end()) {
14561 assert(BWSz > SrcBWSz && "Invalid cast!");
14562 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14563 } else if (SrcIt != MinBWs.end()) {
14564 assert(BWSz > SrcBWSz && "Invalid cast!");
14565 VecOpcode =
14566 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14567 }
14568 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14569 !SrcIt->second.second) {
14570 VecOpcode = Instruction::UIToFP;
14571 }
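// Illustrative example (hypothetical bit widths): if both the node and its
// source were demoted to an effective i16, the vector cast degenerates to a
// free bitcast (BWSz == SrcBWSz); if only the source was demoted to i8 while
// the node stays i32, the cast is re-costed as a sext/zext from the narrow
// type, chosen by the recorded signedness, instead of the original opcode.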
14572 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14573 assert(Idx == 0 && "Expected 0 index only");
14574 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14575 VL0->getOperand(0)->getType(),
14577 };
14578 auto GetVectorCost = [=](InstructionCost CommonCost) {
14579 // Do not count cost here if minimum bitwidth is in effect and it is just
14580 // a bitcast (here it is just a noop).
14581 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14582 return CommonCost;
14583 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14584 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14585
14586 bool IsArithmeticExtendedReduction =
14587 E->Idx == 0 && UserIgnoreList &&
14588 all_of(*UserIgnoreList, [](Value *V) {
14589 auto *I = cast<Instruction>(V);
14590 return is_contained({Instruction::Add, Instruction::FAdd,
14591 Instruction::Mul, Instruction::FMul,
14592 Instruction::And, Instruction::Or,
14593 Instruction::Xor},
14594 I->getOpcode());
14595 });
14596 if (IsArithmeticExtendedReduction &&
14597 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14598 return CommonCost;
14599 return CommonCost +
14600 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14601 VecOpcode == Opcode ? VI : nullptr);
14602 };
14603 return GetCostDiff(GetScalarCost, GetVectorCost);
14604 }
14605 case Instruction::FCmp:
14606 case Instruction::ICmp:
14607 case Instruction::Select: {
14608 CmpPredicate VecPred, SwappedVecPred;
14609 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14610 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14611 match(VL0, MatchCmp))
14612 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14613 else
14614 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14617 auto GetScalarCost = [&](unsigned Idx) {
14618 if (isa<PoisonValue>(UniqueValues[Idx]))
14620
14621 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14622 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14625 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14626 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14627 !match(VI, MatchCmp)) ||
14628 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14629 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14630 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14633
14635 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14636 CostKind, getOperandInfo(VI->getOperand(0)),
14637 getOperandInfo(VI->getOperand(1)), VI);
14638 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14639 if (IntrinsicCost.isValid())
14640 ScalarCost = IntrinsicCost;
14641
14642 return ScalarCost;
14643 };
14644 auto GetVectorCost = [&](InstructionCost CommonCost) {
14645 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14646
14647 InstructionCost VecCost =
14648 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14649 CostKind, getOperandInfo(E->getOperand(0)),
14650 getOperandInfo(E->getOperand(1)), VL0);
14651 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14652 auto *CondType =
14653 getWidenedType(SI->getCondition()->getType(), VL.size());
14654 unsigned CondNumElements = CondType->getNumElements();
14655 unsigned VecTyNumElements = getNumElements(VecTy);
14656 assert(VecTyNumElements >= CondNumElements &&
14657 VecTyNumElements % CondNumElements == 0 &&
14658 "Cannot vectorize Instruction::Select");
14659 if (CondNumElements != VecTyNumElements) {
14660 // When the condition type has fewer elements than the vectorized value
14661 // type, the condition value needs to be replicated to match.
14662 VecCost += ::getShuffleCost(
14663 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14664 createReplicatedMask(VecTyNumElements / CondNumElements,
14665 CondNumElements));
14666 }
14667 }
14668 return VecCost + CommonCost;
14669 };
14670 return GetCostDiff(GetScalarCost, GetVectorCost);
14671 }
14672 case TreeEntry::MinMax: {
14673 auto GetScalarCost = [&](unsigned Idx) {
14674 return GetMinMaxCost(OrigScalarTy);
14675 };
14676 auto GetVectorCost = [&](InstructionCost CommonCost) {
14677 InstructionCost VecCost = GetMinMaxCost(VecTy);
14678 return VecCost + CommonCost;
14679 };
14680 return GetCostDiff(GetScalarCost, GetVectorCost);
14681 }
14682 case TreeEntry::FMulAdd: {
14683 auto GetScalarCost = [&](unsigned Idx) {
14684 if (isa<PoisonValue>(UniqueValues[Idx]))
14686 return GetFMulAddCost(E->getOperations(),
14687 cast<Instruction>(UniqueValues[Idx]));
14688 };
14689 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14690 FastMathFlags FMF;
14691 FMF.set();
14692 for (Value *V : E->Scalars) {
14693 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14694 FMF &= FPCI->getFastMathFlags();
14695 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14696 FMF &= FPCIOp->getFastMathFlags();
14697 }
14698 }
14699 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14700 {VecTy, VecTy, VecTy}, FMF);
14702 return VecCost + CommonCost;
14703 };
14704 return GetCostDiff(GetScalarCost, GetVectorCost);
14705 }
14706 case Instruction::FNeg:
14707 case Instruction::Add:
14708 case Instruction::FAdd:
14709 case Instruction::Sub:
14710 case Instruction::FSub:
14711 case Instruction::Mul:
14712 case Instruction::FMul:
14713 case Instruction::UDiv:
14714 case Instruction::SDiv:
14715 case Instruction::FDiv:
14716 case Instruction::URem:
14717 case Instruction::SRem:
14718 case Instruction::FRem:
14719 case Instruction::Shl:
14720 case Instruction::LShr:
14721 case Instruction::AShr:
14722 case Instruction::And:
14723 case Instruction::Or:
14724 case Instruction::Xor: {
14725 auto GetScalarCost = [&](unsigned Idx) {
14726 if (isa<PoisonValue>(UniqueValues[Idx]))
14728
14729 // We cannot retrieve the operand from UniqueValues[Idx] because an
14730 // interchangeable instruction may be used. The order and the actual
14731 // operand might differ from what is retrieved from UniqueValues[Idx].
14732 Value *Op1 = E->getOperand(0)[Idx];
14733 Value *Op2;
14735 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14736 Op2 = Op1;
14737 } else {
14738 Op2 = E->getOperand(1)[Idx];
14739 Operands.push_back(Op2);
14740 }
14744 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14745 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14746 I && (ShuffleOrOp == Instruction::FAdd ||
14747 ShuffleOrOp == Instruction::FSub)) {
14748 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14749 if (IntrinsicCost.isValid())
14750 ScalarCost = IntrinsicCost;
14751 }
14752 return ScalarCost;
14753 };
14754 auto GetVectorCost = [=](InstructionCost CommonCost) {
14755 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14756 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14757 ArrayRef<Value *> Ops = E->getOperand(I);
14758 if (all_of(Ops, [&](Value *Op) {
14759 auto *CI = dyn_cast<ConstantInt>(Op);
14760 return CI && CI->getValue().countr_one() >= It->second.first;
14761 }))
14762 return CommonCost;
14763 }
14764 }
14765 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14766 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14767 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14768 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14769 Op2Info, {}, nullptr, TLI) +
14770 CommonCost;
14771 };
14772 return GetCostDiff(GetScalarCost, GetVectorCost);
14773 }
14774 case Instruction::GetElementPtr: {
14775 return CommonCost + GetGEPCostDiff(VL, VL0);
14776 }
14777 case Instruction::Load: {
14778 auto GetScalarCost = [&](unsigned Idx) {
14779 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14780 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14781 VI->getAlign(), VI->getPointerAddressSpace(),
14783 };
14784 auto *LI0 = cast<LoadInst>(VL0);
14785 auto GetVectorCost = [&](InstructionCost CommonCost) {
14786 InstructionCost VecLdCost;
14787 switch (E->State) {
14788 case TreeEntry::Vectorize:
14789 if (unsigned Factor = E->getInterleaveFactor()) {
14790 VecLdCost = TTI->getInterleavedMemoryOpCost(
14791 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14792 LI0->getPointerAddressSpace(), CostKind);
14793
14794 } else {
14795 VecLdCost = TTI->getMemoryOpCost(
14796 Instruction::Load, VecTy, LI0->getAlign(),
14797 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14798 }
14799 break;
14800 case TreeEntry::StridedVectorize: {
14801 Align CommonAlignment =
14802 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14803 VecLdCost = TTI->getStridedMemoryOpCost(
14804 Instruction::Load, VecTy, LI0->getPointerOperand(),
14805 /*VariableMask=*/false, CommonAlignment, CostKind);
14806 break;
14807 }
14808 case TreeEntry::CompressVectorize: {
14809 bool IsMasked;
14810 unsigned InterleaveFactor;
14811 SmallVector<int> CompressMask;
14812 VectorType *LoadVecTy;
14813 SmallVector<Value *> Scalars(VL);
14814 if (!E->ReorderIndices.empty()) {
14815 SmallVector<int> Mask(E->ReorderIndices.begin(),
14816 E->ReorderIndices.end());
14817 reorderScalars(Scalars, Mask);
14818 }
14819 SmallVector<Value *> PointerOps(Scalars.size());
14820 for (auto [I, V] : enumerate(Scalars))
14821 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14822 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
14823 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
14824 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
14825 CompressMask, LoadVecTy);
14826 assert(IsVectorized && "Failed to vectorize load");
14827 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
14828 InterleaveFactor, IsMasked);
14829 Align CommonAlignment = LI0->getAlign();
14830 if (InterleaveFactor) {
14831 VecLdCost = TTI->getInterleavedMemoryOpCost(
14832 Instruction::Load, LoadVecTy, InterleaveFactor, {},
14833 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
14834 } else if (IsMasked) {
14835 VecLdCost = TTI->getMaskedMemoryOpCost(
14836 Instruction::Load, LoadVecTy, CommonAlignment,
14837 LI0->getPointerAddressSpace(), CostKind);
14838 // TODO: include this cost into CommonCost.
14840 LoadVecTy, CompressMask, CostKind);
14841 } else {
14842 VecLdCost = TTI->getMemoryOpCost(
14843 Instruction::Load, LoadVecTy, CommonAlignment,
14844 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
14845 // TODO: include this cost into CommonCost.
14847 LoadVecTy, CompressMask, CostKind);
14848 }
14849 break;
14850 }
14851 case TreeEntry::ScatterVectorize: {
14852 Align CommonAlignment =
14853 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
14854 VecLdCost = TTI->getGatherScatterOpCost(
14855 Instruction::Load, VecTy, LI0->getPointerOperand(),
14856 /*VariableMask=*/false, CommonAlignment, CostKind);
14857 break;
14858 }
14859 case TreeEntry::CombinedVectorize:
14860 case TreeEntry::SplitVectorize:
14861 case TreeEntry::NeedToGather:
14862 llvm_unreachable("Unexpected vectorization state.");
14863 }
14864 return VecLdCost + CommonCost;
14865 };
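// Example of the state-based selection above (hypothetical): four
// consecutive i32 loads are costed as a single <4 x i32> getMemoryOpCost;
// a strided pattern goes through getStridedMemoryOpCost, a gather of
// arbitrary pointers through getGatherScatterOpCost, and a compressed load
// is costed as the wider LoadVecTy load (masked if needed) plus the
// CompressMask shuffle.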
14866
14867 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
14868 // If this node generates a masked gather load then it is not a terminal
14869 // node, and the address operand cost is estimated separately.
14870 if (E->State == TreeEntry::ScatterVectorize)
14871 return Cost;
14872
14873 // Estimate the cost of the GEPs since this tree node is a terminal node.
14874 SmallVector<Value *> PointerOps(VL.size());
14875 for (auto [I, V] : enumerate(VL))
14876 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
14877 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
14878 }
14879 case Instruction::Store: {
14880 bool IsReorder = !E->ReorderIndices.empty();
14881 auto GetScalarCost = [=](unsigned Idx) {
14882 auto *VI = cast<StoreInst>(VL[Idx]);
14883 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
14884 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
14885 VI->getAlign(), VI->getPointerAddressSpace(),
14886 CostKind, OpInfo, VI);
14887 };
14888 auto *BaseSI =
14889 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
14890 auto GetVectorCost = [=](InstructionCost CommonCost) {
14891 // We know that we can merge the stores. Calculate the cost.
14892 InstructionCost VecStCost;
14893 if (E->State == TreeEntry::StridedVectorize) {
14894 Align CommonAlignment =
14895 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
14896 VecStCost = TTI->getStridedMemoryOpCost(
14897 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
14898 /*VariableMask=*/false, CommonAlignment, CostKind);
14899 } else {
14900 assert(E->State == TreeEntry::Vectorize &&
14901 "Expected either strided or consecutive stores.");
14902 if (unsigned Factor = E->getInterleaveFactor()) {
14903 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
14904 "No reused shuffles expected");
14905 CommonCost = 0;
14906 VecStCost = TTI->getInterleavedMemoryOpCost(
14907 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
14908 BaseSI->getPointerAddressSpace(), CostKind);
14909 } else {
14910 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
14911 VecStCost = TTI->getMemoryOpCost(
14912 Instruction::Store, VecTy, BaseSI->getAlign(),
14913 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
14914 }
14915 }
14916 return VecStCost + CommonCost;
14917 };
14918 SmallVector<Value *> PointerOps(VL.size());
14919 for (auto [I, V] : enumerate(VL)) {
14920 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
14921 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
14922 }
14923
14924 return GetCostDiff(GetScalarCost, GetVectorCost) +
14925 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
14926 }
14927 case Instruction::Call: {
14928 auto GetScalarCost = [&](unsigned Idx) {
14929 auto *CI = cast<CallInst>(UniqueValues[Idx]);
14932 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
14933 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14934 }
14937 CI->getFunctionType()->params(), CostKind);
14938 };
14939 auto GetVectorCost = [=](InstructionCost CommonCost) {
14940 auto *CI = cast<CallInst>(VL0);
14943 CI, ID, VecTy->getNumElements(),
14944 It != MinBWs.end() ? It->second.first : 0, TTI);
14945 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
14946 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
14947 };
14948 return GetCostDiff(GetScalarCost, GetVectorCost);
14949 }
14950 case Instruction::ShuffleVector: {
14951 if (!SLPReVec || E->isAltShuffle())
14952 assert(E->isAltShuffle() &&
14953 ((Instruction::isBinaryOp(E->getOpcode()) &&
14954 Instruction::isBinaryOp(E->getAltOpcode())) ||
14955 (Instruction::isCast(E->getOpcode()) &&
14956 Instruction::isCast(E->getAltOpcode())) ||
14957 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
14958 "Invalid Shuffle Vector Operand");
14959 // Try to find the previous shuffle node with the same operands and same
14960 // main/alternate ops.
14961 auto TryFindNodeWithEqualOperands = [=]() {
14962 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14963 if (TE.get() == E)
14964 break;
14965 if (TE->hasState() && TE->isAltShuffle() &&
14966 ((TE->getOpcode() == E->getOpcode() &&
14967 TE->getAltOpcode() == E->getAltOpcode()) ||
14968 (TE->getOpcode() == E->getAltOpcode() &&
14969 TE->getAltOpcode() == E->getOpcode())) &&
14970 TE->hasEqualOperands(*E))
14971 return true;
14972 }
14973 return false;
14974 };
14975 auto GetScalarCost = [&](unsigned Idx) {
14976 if (isa<PoisonValue>(UniqueValues[Idx]))
14978
14979 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14980 assert(E->getMatchingMainOpOrAltOp(VI) &&
14981 "Unexpected main/alternate opcode");
14982 (void)E;
14983 return TTI->getInstructionCost(VI, CostKind);
14984 };
14985 // Need to clear CommonCost since the final shuffle cost is included in the
14986 // vector cost.
14987 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
14988 // VecCost is equal to the sum of the cost of creating 2 vectors
14989 // and the cost of creating the shuffle.
14990 InstructionCost VecCost = 0;
14991 if (TryFindNodeWithEqualOperands()) {
14992 LLVM_DEBUG({
14993 dbgs() << "SLP: diamond match for alternate node found.\n";
14994 E->dump();
14995 });
14996 // No need to add new vector costs here since we're going to reuse
14997 // same main/alternate vector ops, just do different shuffling.
14998 } else if (Instruction::isBinaryOp(E->getOpcode())) {
14999 VecCost =
15000 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15001 VecCost +=
15002 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15003 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15004 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15005 VecCost = TTIRef.getCmpSelInstrCost(
15006 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15007 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15008 VL0);
15009 VecCost += TTIRef.getCmpSelInstrCost(
15010 E->getOpcode(), VecTy, MaskTy,
15011 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15012 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15013 E->getAltOp());
15014 } else {
15015 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15016 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15017 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15018 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15019 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15020 unsigned SrcBWSz =
15021 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15022 if (SrcIt != MinBWs.end()) {
15023 SrcBWSz = SrcIt->second.first;
15024 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15025 SrcTy = getWidenedType(SrcSclTy, VL.size());
15026 }
15027 if (BWSz <= SrcBWSz) {
15028 if (BWSz < SrcBWSz)
15029 VecCost =
15030 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15032 LLVM_DEBUG({
15033 dbgs()
15034 << "SLP: alternate extension, which should be truncated.\n";
15035 E->dump();
15036 });
15037 return VecCost;
15038 }
15039 }
15040 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15042 VecCost +=
15043 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15045 }
15046 SmallVector<int> Mask;
15047 E->buildAltOpShuffleMask(
15048 [&](Instruction *I) {
15049 assert(E->getMatchingMainOpOrAltOp(I) &&
15050 "Unexpected main/alternate opcode");
15051 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15052 *TLI);
15053 },
15054 Mask);
15056 FinalVecTy, Mask, CostKind);
15057 // Patterns like [fadd,fsub] can be combined into a single instruction
15058 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15059 // need to take into account their order when looking for the most used
15060 // order.
15061 unsigned Opcode0 = E->getOpcode();
15062 unsigned Opcode1 = E->getAltOpcode();
15063 SmallBitVector OpcodeMask(
15064 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
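// Illustrative example: for scalars {a0+b0, a1-b1, a2+b2, a3-b3} the
// alternate-opcode mask has the bits for the fsub lanes set. A target with a
// combined add/sub instruction (e.g. x86 addsub) may report this pattern as
// legal via isLegalAltInstr, in which case getAltInstrCost is used below if it
// is cheaper than the cost computed above.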
15065 // If this pattern is supported by the target then we consider the
15066 // order.
15067 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15068 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15069 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15070 return AltVecCost < VecCost ? AltVecCost : VecCost;
15071 }
15072 // TODO: Check the reverse order too.
15073 return VecCost;
15074 };
15075 if (SLPReVec && !E->isAltShuffle())
15076 return GetCostDiff(
15077 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15078 // If a group uses mask in order, the shufflevector can be
15079 // eliminated by instcombine. Then the cost is 0.
15080 assert(isa<ShuffleVectorInst>(VL.front()) &&
15081 "Not supported shufflevector usage.");
15082 auto *SV = cast<ShuffleVectorInst>(VL.front());
15083 unsigned SVNumElements =
15084 cast<FixedVectorType>(SV->getOperand(0)->getType())
15085 ->getNumElements();
15086 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15087 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15088 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15089 int NextIndex = 0;
15090 if (!all_of(Group, [&](Value *V) {
15091 assert(isa<ShuffleVectorInst>(V) &&
15092 "Not supported shufflevector usage.");
15093 auto *SV = cast<ShuffleVectorInst>(V);
15094 int Index;
15095 [[maybe_unused]] bool IsExtractSubvectorMask =
15096 SV->isExtractSubvectorMask(Index);
15097 assert(IsExtractSubvectorMask &&
15098 "Not supported shufflevector usage.");
15099 if (NextIndex != Index)
15100 return false;
15101 NextIndex += SV->getShuffleMask().size();
15102 return true;
15103 }))
15104 return ::getShuffleCost(
15106 calculateShufflevectorMask(E->Scalars));
15107 }
15108 return TTI::TCC_Free;
15109 });
15110 return GetCostDiff(GetScalarCost, GetVectorCost);
15111 }
15112 case Instruction::Freeze:
15113 return CommonCost;
15114 default:
15115 llvm_unreachable("Unknown instruction");
15116 }
15117}
15118
15119bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15120 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15121 << VectorizableTree.size() << " is fully vectorizable.\n");
15122
15123 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15124 SmallVector<int> Mask;
15125 return TE->isGather() &&
15126 !any_of(TE->Scalars,
15127 [this](Value *V) { return EphValues.contains(V); }) &&
15128 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15129 TE->Scalars.size() < Limit ||
15130 (((TE->hasState() &&
15131 TE->getOpcode() == Instruction::ExtractElement) ||
15132 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15133 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15134 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15135 !TE->isAltShuffle()) ||
15136 any_of(TE->Scalars, IsaPred<LoadInst>));
15137 };
15138
15139 // We only handle trees of heights 1 and 2.
15140 if (VectorizableTree.size() == 1 &&
15141 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15142 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15143 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15144 (ForReduction &&
15145 AreVectorizableGathers(VectorizableTree[0].get(),
15146 VectorizableTree[0]->Scalars.size()) &&
15147 VectorizableTree[0]->getVectorFactor() > 2)))
15148 return true;
15149
15150 if (VectorizableTree.size() != 2)
15151 return false;
15152
15153 // Handle splat and all-constants stores. Also try to vectorize tiny trees
15154 // whose second node is a gather with fewer scalar operands than the initial
15155 // tree element (it may be profitable to shuffle the second gather), or whose
15156 // scalars are extractelements that form a shuffle.
15157 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15158 AreVectorizableGathers(VectorizableTree[1].get(),
15159 VectorizableTree[0]->Scalars.size()))
15160 return true;
15161
15162 // Gathering cost would be too much for tiny trees.
15163 if (VectorizableTree[0]->isGather() ||
15164 (VectorizableTree[1]->isGather() &&
15165 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15166 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15167 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15168 return false;
15169
15170 return true;
15171}
15172
15173static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15174 TargetTransformInfo *TTI,
15175 bool MustMatchOrInst) {
15176 // Look past the root to find a source value. Arbitrarily follow the
15177 // path through operand 0 of any 'or'. Also, peek through optional
15178 // shift-left-by-multiple-of-8-bits.
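// For example (illustrative only), the walk accepts a chain such as
//   ((zext i8 %b1 to i32) << 8) | (zext i8 %b0 to i32)
// where %b0 and %b1 are loaded bytes, following operand 0 of each 'or'/'shl'
// until the zext'ed load is found.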
15179 Value *ZextLoad = Root;
15180 const APInt *ShAmtC;
15181 bool FoundOr = false;
15182 while (!isa<ConstantExpr>(ZextLoad) &&
15183 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15184 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15185 ShAmtC->urem(8) == 0))) {
15186 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15187 ZextLoad = BinOp->getOperand(0);
15188 if (BinOp->getOpcode() == Instruction::Or)
15189 FoundOr = true;
15190 }
15191 // Check if the input is an extended load of the required or/shift expression.
15192 Value *Load;
15193 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15194 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15195 return false;
15196
15197 // Require that the total load bit width is a legal integer type.
15198 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15199 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15200 Type *SrcTy = Load->getType();
15201 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15202 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15203 return false;
15204
15205 // Everything matched - assume that we can fold the whole sequence using
15206 // load combining.
15207 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15208 << *(cast<Instruction>(Root)) << "\n");
15209
15210 return true;
15211}
15212
15213bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15214 if (RdxKind != RecurKind::Or)
15215 return false;
15216
15217 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15218 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15219 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15220 /* MatchOr */ false);
15221}
15222
15223bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15224 // Peek through a final sequence of stores and check if all operations are
15225 // likely to be load-combined.
15226 unsigned NumElts = Stores.size();
15227 for (Value *Scalar : Stores) {
15228 Value *X;
15229 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15230 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15231 return false;
15232 }
15233 return true;
15234}
15235
15236bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15237 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15238 return true;
15239
15240 // Graph is empty - do nothing.
15241 if (VectorizableTree.empty()) {
15242 assert(ExternalUses.empty() && "We shouldn't have any external users");
15243
15244 return true;
15245 }
15246
15247 // No need to vectorize inserts of gathered values.
15248 if (VectorizableTree.size() == 2 &&
15249 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15250 VectorizableTree[1]->isGather() &&
15251 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15252 !(isSplat(VectorizableTree[1]->Scalars) ||
15253 allConstant(VectorizableTree[1]->Scalars))))
15254 return true;
15255
15256 // If the graph includes only PHI nodes and gathers, it is definitely not
15257 // profitable for vectorization; we can skip it if the cost threshold is the
15258 // default. The cost of vectorized PHI nodes is almost always 0 plus the cost
15259 // of the gathers/buildvectors.
15260 constexpr int Limit = 4;
15261 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15262 !VectorizableTree.empty() &&
15263 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15264 return (TE->isGather() &&
15265 (!TE->hasState() ||
15266 TE->getOpcode() != Instruction::ExtractElement) &&
15267 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15268 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15269 }))
15270 return true;
15271
15272 // Do not vectorize small tree of phis only, if all vector phis are also
15273 // gathered.
15274 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15275 VectorizableTree.size() <= Limit &&
15276 all_of(VectorizableTree,
15277 [&](const std::unique_ptr<TreeEntry> &TE) {
15278 return (TE->isGather() &&
15279 (!TE->hasState() ||
15280 TE->getOpcode() != Instruction::ExtractElement) &&
15281 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15282 Limit) ||
15283 (TE->hasState() &&
15284 (TE->getOpcode() == Instruction::InsertElement ||
15285 (TE->getOpcode() == Instruction::PHI &&
15286 all_of(TE->Scalars, [&](Value *V) {
15287 return isa<PoisonValue>(V) || MustGather.contains(V);
15288 }))));
15289 }) &&
15290 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15291 return TE->State == TreeEntry::Vectorize &&
15292 TE->getOpcode() == Instruction::PHI;
15293 }))
15294 return true;
15295
15296 // If the tree contains only phis, buildvectors, split nodes and
15297 // small nodes with reuses, we can skip it.
15298 SmallVector<const TreeEntry *> StoreLoadNodes;
15299 unsigned NumGathers = 0;
15300 constexpr int LimitTreeSize = 36;
15301 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15302 all_of(VectorizableTree,
15303 [&](const std::unique_ptr<TreeEntry> &TE) {
15304 if (!TE->isGather() && TE->hasState() &&
15305 (TE->getOpcode() == Instruction::Load ||
15306 TE->getOpcode() == Instruction::Store)) {
15307 StoreLoadNodes.push_back(TE.get());
15308 return true;
15309 }
15310 if (TE->isGather())
15311 ++NumGathers;
15312 return TE->State == TreeEntry::SplitVectorize ||
15313 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15314 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15315 VectorizableTree.size() > LimitTreeSize) ||
15316 (TE->isGather() &&
15317 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15318 (TE->hasState() &&
15319 (TE->getOpcode() == Instruction::PHI ||
15320 (TE->hasCopyableElements() &&
15321 static_cast<unsigned>(count_if(
15322 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15323 TE->Scalars.size() / 2) ||
15324 ((!TE->ReuseShuffleIndices.empty() ||
15325 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15326 TE->Scalars.size() == 2)));
15327 }) &&
15328 (StoreLoadNodes.empty() ||
15329 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15330 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15331 return TE->getOpcode() == Instruction::Store ||
15332 all_of(TE->Scalars, [&](Value *V) {
15333 return !isa<LoadInst>(V) ||
15334 areAllUsersVectorized(cast<Instruction>(V));
15335 });
15336 })))))
15337 return true;
15338
15339 // If the tree contains only a buildvector root, 2 non-buildvector nodes
15340 // (used by the root tree node) and other buildvectors, we can skip it.
15341 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15342 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15343 VectorizableTree.size() >= Limit &&
15344 count_if(ArrayRef(VectorizableTree).drop_front(),
15345 [&](const std::unique_ptr<TreeEntry> &TE) {
15346 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15347 TE->UserTreeIndex.UserTE->Idx == 0;
15348 }) == 2)
15349 return true;
15350
15351 // If the tree only vectorizes a phi node feeding an insertelement
15352 // buildvector, with all remaining nodes gathered, skip it.
15353 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15354 VectorizableTree.size() > 2 &&
15355 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15356 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15357 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15358 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15359 all_of(
15360 ArrayRef(VectorizableTree).drop_front(2),
15361 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15362 return true;
15363
15364 // We can vectorize the tree if its size is greater than or equal to the
15365 // minimum size specified by the MinTreeSize command line option.
15366 if (VectorizableTree.size() >= MinTreeSize)
15367 return false;
15368
15369 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15370 // can vectorize it if we can prove it fully vectorizable.
15371 if (isFullyVectorizableTinyTree(ForReduction))
15372 return false;
15373
15374 // Check if any of the gather nodes forms an insertelement buildvector
15375 // somewhere.
15376 bool IsAllowedSingleBVNode =
15377 VectorizableTree.size() > 1 ||
15378 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15379 !VectorizableTree.front()->isAltShuffle() &&
15380 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15381 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15382 allSameBlock(VectorizableTree.front()->Scalars));
15383 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15384 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15385 return isa<ExtractElementInst, Constant>(V) ||
15386 (IsAllowedSingleBVNode &&
15387 !V->hasNUsesOrMore(UsesLimit) &&
15388 any_of(V->users(), IsaPred<InsertElementInst>));
15389 });
15390 }))
15391 return false;
15392
15393 if (VectorizableTree.back()->isGather() &&
15394 VectorizableTree.back()->hasState() &&
15395 VectorizableTree.back()->isAltShuffle() &&
15396 VectorizableTree.back()->getVectorFactor() > 2 &&
15397 allSameBlock(VectorizableTree.back()->Scalars) &&
15398 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15400 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15401 VectorizableTree.back()->getVectorFactor()),
15402 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15403 /*Insert=*/true, /*Extract=*/false,
15405 return false;
15406
15407 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15408 // vectorizable.
15409 return true;
15410}
15411
15414 constexpr unsigned SmallTree = 3;
15415 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15416 getCanonicalGraphSize() <= SmallTree &&
15417 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15418 [](const std::unique_ptr<TreeEntry> &TE) {
15419 return TE->isGather() && TE->hasState() &&
15420 TE->getOpcode() == Instruction::Load &&
15421 !allSameBlock(TE->Scalars);
15422 }) == 1)
15423 return true;
15424 return false;
15425 }
15426 bool Res = false;
15427 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15428 TreeEntry &E = *VectorizableTree[Idx];
15429 if (E.State == TreeEntry::SplitVectorize)
15430 return false;
15431 if (!E.isGather())
15432 continue;
15433 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15434 (!E.hasState() &&
15435 all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
15436 (isa<ExtractElementInst>(E.Scalars.front()) &&
15437 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15438 return false;
15439 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15440 continue;
15441 Res = true;
15442 }
15443 return Res;
15444}
15445
15446InstructionCost BoUpSLP::getSpillCost() {
15447 // Walk from the bottom of the tree to the top, tracking which values are
15448 // live. When we see a call instruction that is not part of our tree,
15449 // query TTI to see if there is a cost to keeping values live over it
15450 // (for example, if spills and fills are required).
15451
15452 const TreeEntry *Root = VectorizableTree.front().get();
15453 if (Root->isGather())
15454 return 0;
15455
15456 InstructionCost Cost = 0;
15457 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15458 EntriesToOperands;
15459 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15460 SmallPtrSet<const Instruction *, 8> LastInstructions;
15461 for (const auto &TEPtr : VectorizableTree) {
15462 if (!TEPtr->isGather()) {
15463 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15464 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15465 LastInstructions.insert(LastInst);
15466 }
15467 if (TEPtr->UserTreeIndex)
15468 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15469 }
15470
15471 auto NoCallIntrinsic = [this](const Instruction *I) {
15472 const auto *II = dyn_cast<IntrinsicInst>(I);
15473 if (!II)
15474 return false;
15475 if (II->isAssumeLikeIntrinsic())
15476 return true;
15477 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15478 InstructionCost IntrCost =
15479 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15480 InstructionCost CallCost = TTI->getCallInstrCost(
15481 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15482 return IntrCost < CallCost;
15483 };
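// Illustrative note: NoCallIntrinsic returns true for intrinsics that are not
// expected to behave like real calls for spill purposes - assume-like
// intrinsics, or intrinsics that TTI reports as cheaper than an equivalent
// libcall (e.g. a target-lowered llvm.fabs typically does not clobber
// registers the way an actual libm call would).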
15484
15485 // Maps the last instruction of an entry to the last instruction of one of
15486 // its operand entries plus a flag. If the flag is true, there are no calls
15487 // in between these instructions.
15488 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15489 CheckedInstructions;
15490 unsigned Budget = 0;
15491 const unsigned BudgetLimit =
15492 ScheduleRegionSizeBudget / VectorizableTree.size();
15493 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15494 const Instruction *Last) {
15495 assert(First->getParent() == Last->getParent() &&
15496 "Expected instructions in same block.");
15497 if (auto It = CheckedInstructions.find(Last);
15498 It != CheckedInstructions.end()) {
15499 const Instruction *Checked = It->second.getPointer();
15500 if (Checked == First || Checked->comesBefore(First))
15501 return It->second.getInt() != 0;
15502 Last = Checked;
15503 } else if (Last == First || Last->comesBefore(First)) {
15504 return true;
15505 }
15507 ++First->getIterator().getReverse(),
15508 PrevInstIt =
15509 Last->getIterator().getReverse();
15510 SmallVector<const Instruction *> LastInstsInRange;
15511 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15512 // Debug information does not impact spill cost.
15513 // Vectorized calls, represented as vector intrinsics, do not impact spill
15514 // cost.
15515 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15516 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15517 for (const Instruction *LastInst : LastInstsInRange)
15518 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15519 return false;
15520 }
15521 if (LastInstructions.contains(&*PrevInstIt))
15522 LastInstsInRange.push_back(&*PrevInstIt);
15523
15524 ++PrevInstIt;
15525 ++Budget;
15526 }
15527 for (const Instruction *LastInst : LastInstsInRange)
15528 CheckedInstructions.try_emplace(
15529 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15530 Budget <= BudgetLimit ? 1 : 0);
15531 return Budget <= BudgetLimit;
15532 };
15533 auto AddCosts = [&](const TreeEntry *Op) {
15534 Type *ScalarTy = Op->Scalars.front()->getType();
15535 auto It = MinBWs.find(Op);
15536 if (It != MinBWs.end())
15537 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15538 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15540 if (ScalarTy->isVectorTy()) {
15541 // Handle revec dead vector instructions.
15542 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15543 }
15544 };
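// Illustrative note: when one of the checks below fails (i.e. a non-vectorized
// call may execute between an operand's definition and its use), AddCosts
// charges TTI->getCostOfKeepingLiveOverCall for the operand's widened vector
// type (e.g. <4 x float>), which typically models the spill and reload around
// the call.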
15545 // Memoize the relationship between blocks, i.e. whether there is (at least
15546 // one) non-vectorized call between the blocks. This allows skipping the
15547 // analysis of the same block paths multiple times.
15548 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15549 ParentOpParentToPreds;
15550 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15551 BasicBlock *OpParent) {
15552 auto Key = std::make_pair(Root, OpParent);
15553 if (auto It = ParentOpParentToPreds.find(Key);
15554 It != ParentOpParentToPreds.end())
15555 return It->second;
15556 SmallVector<BasicBlock *> Worklist;
15557 if (Pred)
15558 Worklist.push_back(Pred);
15559 else
15560 Worklist.append(pred_begin(Root), pred_end(Root));
15563 ParentsPairsToAdd;
15564 bool Res = false;
15565 auto Cleanup = make_scope_exit([&]() {
15566 for (const auto &KeyPair : ParentsPairsToAdd) {
15567 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15568 "Should not have been added before.");
15569 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15570 }
15571 });
15572 while (!Worklist.empty()) {
15573 BasicBlock *BB = Worklist.pop_back_val();
15574 if (BB == OpParent || !Visited.insert(BB).second)
15575 continue;
15576 auto Pair = std::make_pair(BB, OpParent);
15577 if (auto It = ParentOpParentToPreds.find(Pair);
15578 It != ParentOpParentToPreds.end()) {
15579 Res = It->second;
15580 return Res;
15581 }
15582 ParentsPairsToAdd.insert(Pair);
15583 unsigned BlockSize = BB->size();
15584 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15585 return Res;
15586 Budget += BlockSize;
15587 if (Budget > BudgetLimit)
15588 return Res;
15589 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15590 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15591 BB->getTerminator()))
15592 return Res;
15593 Worklist.append(pred_begin(BB), pred_end(BB));
15594 }
15595 Res = true;
15596 return Res;
15597 };
15598 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15599 while (!LiveEntries.empty()) {
15600 const TreeEntry *Entry = LiveEntries.pop_back_val();
15601 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15602 if (Operands.empty())
15603 continue;
15604 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15605 BasicBlock *Parent = LastInst->getParent();
15606 for (const TreeEntry *Op : Operands) {
15607 if (!Op->isGather())
15608 LiveEntries.push_back(Op);
15609 if (Entry->State == TreeEntry::SplitVectorize ||
15610 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15611 (Op->isGather() && allConstant(Op->Scalars)))
15612 continue;
15613 Budget = 0;
15614 BasicBlock *Pred = nullptr;
15615 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15616 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15617 BasicBlock *OpParent;
15618 Instruction *OpLastInst;
15619 if (Op->isGather()) {
15620 assert(Entry->getOpcode() == Instruction::PHI &&
15621 "Expected phi node only.");
15622 OpParent = cast<PHINode>(Entry->getMainOp())
15623 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15624 OpLastInst = OpParent->getTerminator();
15625 for (Value *V : Op->Scalars) {
15626 auto *Inst = dyn_cast<Instruction>(V);
15627 if (!Inst)
15628 continue;
15629 if (isVectorized(V)) {
15630 OpParent = Inst->getParent();
15631 OpLastInst = Inst;
15632 break;
15633 }
15634 }
15635 } else {
15636 OpLastInst = EntriesToLastInstruction.at(Op);
15637 OpParent = OpLastInst->getParent();
15638 }
15639 // Check the call instructions within the same basic block.
15640 if (OpParent == Parent) {
15641 if (Entry->getOpcode() == Instruction::PHI) {
15642 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15643 AddCosts(Op);
15644 continue;
15645 }
15646 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15647 AddCosts(Op);
15648 continue;
15649 }
15650 // Check for call instruction in between blocks.
15651 // 1. Check entry's block to the head.
15652 if (Entry->getOpcode() != Instruction::PHI &&
15653 !CheckForNonVecCallsInSameBlock(
15654 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15655 LastInst)) {
15656 AddCosts(Op);
15657 continue;
15658 }
15659 // 2. Check op's block from the end.
15660 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15661 OpParent->getTerminator())) {
15662 AddCosts(Op);
15663 continue;
15664 }
15665 // 3. Check the predecessors of entry's block till op's block.
15666 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15667 AddCosts(Op);
15668 continue;
15669 }
15670 }
15671 }
15672
15673 return Cost;
15674}
15675
15676/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
15677/// buildvector sequence.
15678static bool isFirstInsertElement(const InsertElementInst *IE1,
15679 const InsertElementInst *IE2) {
15680 if (IE1 == IE2)
15681 return false;
15682 const auto *I1 = IE1;
15683 const auto *I2 = IE2;
15684 const InsertElementInst *PrevI1;
15685 const InsertElementInst *PrevI2;
15686 unsigned Idx1 = *getElementIndex(IE1);
15687 unsigned Idx2 = *getElementIndex(IE2);
15688 do {
15689 if (I2 == IE1)
15690 return true;
15691 if (I1 == IE2)
15692 return false;
15693 PrevI1 = I1;
15694 PrevI2 = I2;
15695 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15696 getElementIndex(I1).value_or(Idx2) != Idx2)
15697 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15698 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15699 getElementIndex(I2).value_or(Idx1) != Idx1)
15700 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15701 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15702 llvm_unreachable("Two different buildvectors not expected.");
15703}
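// Illustrative example: for the chain
//   %i0 = insertelement <4 x float> poison, float %a, i32 0
//   %i1 = insertelement <4 x float> %i0, float %b, i32 1
// isFirstInsertElement(%i0, %i1) returns true, since walking %i1's operand-0
// chain reaches %i0 first.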
15704
15705namespace {
15706/// Returns incoming Value *, if the requested type is Value * too, or a default
15707/// value, otherwise.
15708struct ValueSelect {
15709 template <typename U>
15710 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15711 return V;
15712 }
15713 template <typename U>
15714 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15715 return U();
15716 }
15717};
15718} // namespace
15719
15720/// Does the analysis of the provided shuffle masks and performs the requested
15721/// actions on the vectors with the given shuffle masks. It tries to do it in
15722/// several steps.
15723/// 1. If the Base vector is not an undef vector, resize the very first mask to
15724/// the common VF and perform the action for 2 input vectors (including the
15725/// non-undef Base). Other shuffle masks are combined with the result of the
15726/// first stage and processed as a shuffle of 2 elements.
15727/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15728/// the action only for 1 vector with the given mask, if it is not the identity
15729/// mask.
15730/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15731/// vectors, combining the masks properly between the steps.
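/// Illustrative example for the multi-vector case: with an undef Base and two
/// masks of VF 4, {0,1,P,P} from V1 and {P,P,2,3} from V2 (P = poison), the
/// combined mask becomes {0,1,6,7} and the action is performed once on
/// {V1, V2}.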
15732template <typename T>
15733static T *performExtractsShuffleAction(
15734 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15735 function_ref<unsigned(T *)> GetVF,
15736 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15737 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15738 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15739 SmallVector<int> Mask(ShuffleMask.begin()->second);
15740 auto VMIt = std::next(ShuffleMask.begin());
15741 T *Prev = nullptr;
15742 SmallBitVector UseMask =
15743 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15744 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15745 if (!IsBaseUndef.all()) {
15746 // Base is not undef, need to combine it with the next subvectors.
15747 std::pair<T *, bool> Res =
15748 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15749 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15750 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15751 if (Mask[Idx] == PoisonMaskElem)
15752 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15753 else
15754 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15755 }
15756 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15757 assert((!V || GetVF(V) == Mask.size()) &&
15758 "Expected base vector of VF number of elements.");
15759 Prev = Action(Mask, {nullptr, Res.first});
15760 } else if (ShuffleMask.size() == 1) {
15761 // Base is undef and only 1 vector is shuffled - perform the action only for
15762 // single vector, if the mask is not the identity mask.
15763 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15764 /*ForSingleMask=*/true);
15765 if (Res.second)
15766 // Identity mask is found.
15767 Prev = Res.first;
15768 else
15769 Prev = Action(Mask, {ShuffleMask.begin()->first});
15770 } else {
15771 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15772 // shuffles step by step, combining shuffle between the steps.
15773 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15774 unsigned Vec2VF = GetVF(VMIt->first);
15775 if (Vec1VF == Vec2VF) {
15776 // No need to resize the input vectors since they are of the same size, we
15777 // can shuffle them directly.
15778 ArrayRef<int> SecMask = VMIt->second;
15779 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15780 if (SecMask[I] != PoisonMaskElem) {
15781 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15782 Mask[I] = SecMask[I] + Vec1VF;
15783 }
15784 }
15785 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15786 } else {
15787 // Vectors of different sizes - resize and reshuffle.
15788 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15789 /*ForSingleMask=*/false);
15790 std::pair<T *, bool> Res2 =
15791 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15792 ArrayRef<int> SecMask = VMIt->second;
15793 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15794 if (Mask[I] != PoisonMaskElem) {
15795 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15796 if (Res1.second)
15797 Mask[I] = I;
15798 } else if (SecMask[I] != PoisonMaskElem) {
15799 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15800 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
15801 }
15802 }
15803 Prev = Action(Mask, {Res1.first, Res2.first});
15804 }
15805 VMIt = std::next(VMIt);
15806 }
15807 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
15808 // Perform requested actions for the remaining masks/vectors.
15809 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
15810 // Shuffle other input vectors, if any.
15811 std::pair<T *, bool> Res =
15812 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15813 ArrayRef<int> SecMask = VMIt->second;
15814 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15815 if (SecMask[I] != PoisonMaskElem) {
15816 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
15817 "Multiple uses of scalars.");
15818 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
15819 } else if (Mask[I] != PoisonMaskElem) {
15820 Mask[I] = I;
15821 }
15822 }
15823 Prev = Action(Mask, {Prev, Res.first});
15824 }
15825 return Prev;
15826}
15827
15828namespace {
15829/// Data type for handling buildvector sequences with the reused scalars from
15830/// other tree entries.
15831template <typename T> struct ShuffledInsertData {
15832 /// List of insertelements to be replaced by shuffles.
15833 SmallVector<InsertElementInst *> InsertElements;
15834 /// The parent vectors and shuffle mask for the given list of inserts.
15835 MapVector<T, SmallVector<int>> ValueMasks;
15836};
15837} // namespace
15838
15839InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
15840 InstructionCost ReductionCost) {
15841 InstructionCost Cost = ReductionCost;
15842 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
15843 << VectorizableTree.size() << ".\n");
15844
15845 SmallPtrSet<Value *, 4> CheckedExtracts;
15846 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
15847 TreeEntry &TE = *VectorizableTree[I];
15848 // No need to count the cost for combined entries, they are combined and
15849 // just skip their cost.
15850 if (TE.State == TreeEntry::CombinedVectorize) {
15851 LLVM_DEBUG(
15852 dbgs() << "SLP: Skipping cost for combined node that starts with "
15853 << *TE.Scalars[0] << ".\n";
15854 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
15855 continue;
15856 }
15857 if (TE.hasState() &&
15858 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
15859 if (const TreeEntry *E =
15860 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
15861 E && E->getVectorFactor() == TE.getVectorFactor()) {
15862 // Some gather nodes might be absolutely the same as some vectorizable
15863 // nodes after reordering, need to handle it.
15864 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
15865 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15866 << "SLP: Current total cost = " << Cost << "\n");
15867 continue;
15868 }
15869 }
15870
15871 // Exclude cost of gather loads nodes which are not used. These nodes were
15872 // built as part of the final attempt to vectorize gathered loads.
15873 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
15874 "Expected gather nodes with users only.");
15875
15876 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
15877 Cost += C;
15878 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
15879 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
15880 << "SLP: Current total cost = " << Cost << "\n");
15881 }
15882
15883 if (Cost >= -SLPCostThreshold &&
15884 none_of(ExternalUses, [](const ExternalUser &EU) {
15885 return isa_and_nonnull<InsertElementInst>(EU.User);
15886 }))
15887 return Cost;
15888
15889 SmallPtrSet<Value *, 16> ExtractCostCalculated;
15890 InstructionCost ExtractCost = 0;
15892 SmallVector<APInt> DemandedElts;
15893 SmallDenseSet<Value *, 4> UsedInserts;
15895 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
15897 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
15898 // Keep track of each {Scalar, Index, User} tuple.
15899 // On AArch64, this helps in fusing a mov instruction, associated with
15900 // extractelement, with fmul in the backend so that extractelement is free.
15901 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
15902 for (ExternalUser &EU : ExternalUses) {
15903 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
15904 }
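// Illustrative note: these tuples are forwarded to getVectorInstrCost below so
// the target can, e.g., treat an extract that feeds an AArch64 fmul-by-lane as
// free because the lane read is folded into the user instruction.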
15905 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
15906 for (ExternalUser &EU : ExternalUses) {
15907 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
15908 << EU.E.Idx << " in lane " << EU.Lane << "\n");
15909 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
15910 else dbgs() << " User: nullptr\n");
15911 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
15912
15913 // Uses by ephemeral values are free (because the ephemeral value will be
15914 // removed prior to code generation, and so the extraction will be
15915 // removed as well).
15916 if (EphValues.count(EU.User))
15917 continue;
15918
15919 // Check if the scalar for the given user or all users is accounted already.
15920 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
15921 (EU.User &&
15922 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
15923 continue;
15924
15925 // Skip scalars used in unreachable blocks, in EH pads (rarely executed), or
15926 // in blocks terminated with an unreachable instruction.
15927 if (BasicBlock *UserParent =
15928 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
15929 UserParent &&
15930 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
15931 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
15932 continue;
15933
15934 // We only add extract cost once for the same scalar.
15935 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
15936 !ExtractCostCalculated.insert(EU.Scalar).second)
15937 continue;
15938
15939 // No extract cost for vector "scalar" if REVEC is disabled
15940 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
15941 continue;
15942
15943 // If found user is an insertelement, do not calculate extract cost but try
15944 // to detect it as a final shuffled/identity match.
15945 // TODO: what if a user is insertvalue when REVEC is enabled?
15946 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
15947 VU && VU->getOperand(1) == EU.Scalar) {
15948 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
15949 if (!UsedInserts.insert(VU).second)
15950 continue;
15951 std::optional<unsigned> InsertIdx = getElementIndex(VU);
15952 if (InsertIdx) {
15953 const TreeEntry *ScalarTE = &EU.E;
15954 auto *It = find_if(
15955 ShuffledInserts,
15956 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
15957 // Checks if 2 insertelements are from the same buildvector.
15958 InsertElementInst *VecInsert = Data.InsertElements.front();
15959 return areTwoInsertFromSameBuildVector(
15960 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
15961 Value *Op0 = II->getOperand(0);
15962 if (isVectorized(II) && !isVectorized(Op0))
15963 return nullptr;
15964 return Op0;
15965 });
15966 });
15967 int VecId = -1;
15968 if (It == ShuffledInserts.end()) {
15969 auto &Data = ShuffledInserts.emplace_back();
15970 Data.InsertElements.emplace_back(VU);
15971 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
15972 VecId = ShuffledInserts.size() - 1;
15973 auto It = MinBWs.find(ScalarTE);
15974 if (It != MinBWs.end() &&
15975 VectorCasts
15976 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
15977 .second) {
15978 unsigned BWSz = It->second.first;
15979 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
15980 unsigned VecOpcode;
15981 if (DstBWSz < BWSz)
15982 VecOpcode = Instruction::Trunc;
15983 else
15984 VecOpcode =
15985 It->second.second ? Instruction::SExt : Instruction::ZExt;
15988 VecOpcode, FTy,
15989 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
15990 FTy->getNumElements()),
15992 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
15993 << " for extending externally used vector with "
15994 "non-equal minimum bitwidth.\n");
15995 Cost += C;
15996 }
15997 } else {
15998 if (isFirstInsertElement(VU, It->InsertElements.front()))
15999 It->InsertElements.front() = VU;
16000 VecId = std::distance(ShuffledInserts.begin(), It);
16001 }
16002 int InIdx = *InsertIdx;
16003 SmallVectorImpl<int> &Mask =
16004 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16005 if (Mask.empty())
16006 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16007 Mask[InIdx] = EU.Lane;
16008 DemandedElts[VecId].setBit(InIdx);
16009 continue;
16010 }
16011 }
16012 }
16013
16015 // If we plan to rewrite the tree in a smaller type, we will need to sign
16016 // extend the extracted value back to the original type. Here, we account
16017 // for the extract and the added cost of the sign extend if needed.
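// Illustrative example: if a 32-bit tree was narrowed to i16 via MinBWs, an
// external use of lane L is costed as an extractelement from <VF x i16> plus
// a sext/zext of the i16 result back to i32 (see getExtractWithExtendCost).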
16018 InstructionCost ExtraCost = TTI::TCC_Free;
16019 auto *ScalarTy = EU.Scalar->getType();
16020 const unsigned BundleWidth = EU.E.getVectorFactor();
16021 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16022 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16023 const TreeEntry *Entry = &EU.E;
16024 auto It = MinBWs.find(Entry);
16025 if (It != MinBWs.end()) {
16026 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16027 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16028 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16029 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16030 ? Instruction::ZExt
16031 : Instruction::SExt;
16032 VecTy = getWidenedType(MinTy, BundleWidth);
16033 ExtraCost =
16034 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16035 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16036 << ExtraCost << "\n");
16037 } else {
16038 ExtraCost =
16039 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16040 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16041 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16042 << *VecTy << ": " << ExtraCost << "\n");
16043 }
16044 // Leave the scalar instructions as is if they are cheaper than extracts.
16045 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16046 Entry->getOpcode() == Instruction::Load) {
16047 // Checks if the user of the external scalar is phi in loop body.
16048 auto IsPhiInLoop = [&](const ExternalUser &U) {
16049 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16050 auto *I = cast<Instruction>(U.Scalar);
16051 const Loop *L = LI->getLoopFor(Phi->getParent());
16052 return L && (Phi->getParent() == I->getParent() ||
16053 L == LI->getLoopFor(I->getParent()));
16054 }
16055 return false;
16056 };
16057 if (!ValueToExtUses) {
16058 ValueToExtUses.emplace();
16059 for (const auto &P : enumerate(ExternalUses)) {
16060 // Ignore phis in loops.
16061 if (IsPhiInLoop(P.value()))
16062 continue;
16063
16064 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16065 }
16066 }
16067 // Can use original instruction, if no operands vectorized or they are
16068 // marked as externally used already.
16069 auto *Inst = cast<Instruction>(EU.Scalar);
16070 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16071 auto OperandIsScalar = [&](Value *V) {
16072 if (!isVectorized(V)) {
16073 // Some extractelements might not be vectorized, but
16074 // transformed into a shuffle and removed from the function;
16075 // consider that here.
16076 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16077 return !EE->hasOneUse() || !MustGather.contains(EE);
16078 return true;
16079 }
16080 return ValueToExtUses->contains(V);
16081 };
16082 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16083 bool CanBeUsedAsScalarCast = false;
16084 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16085 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16086 Op && all_of(Op->operands(), OperandIsScalar)) {
16087 InstructionCost OpCost =
16088 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16090 : 0;
16091 if (ScalarCost + OpCost <= ExtraCost) {
16092 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16093 ScalarCost += OpCost;
16094 }
16095 }
16096 }
16097 if (CanBeUsedAsScalar) {
16098 bool KeepScalar = ScalarCost <= ExtraCost;
16099 // Try to keep the original scalar if the user is a phi node from the same
16100 // block as the root phis currently being vectorized. This helps to keep
16101 // better ordering info for the PHIs being vectorized.
16102 bool IsProfitablePHIUser =
16103 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16104 VectorizableTree.front()->Scalars.size() > 2)) &&
16105 VectorizableTree.front()->hasState() &&
16106 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16107 !Inst->hasNUsesOrMore(UsesLimit) &&
16108 none_of(Inst->users(),
16109 [&](User *U) {
16110 auto *PHIUser = dyn_cast<PHINode>(U);
16111 return (!PHIUser ||
16112 PHIUser->getParent() !=
16113 cast<Instruction>(
16114 VectorizableTree.front()->getMainOp())
16115 ->getParent()) &&
16116 !isVectorized(U);
16117 }) &&
16118 count_if(Entry->Scalars, [&](Value *V) {
16119 return ValueToExtUses->contains(V);
16120 }) <= 2;
16121 if (IsProfitablePHIUser) {
16122 KeepScalar = true;
16123 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16124 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16125 (!GatheredLoadsEntriesFirst.has_value() ||
16126 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16127 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16128 return ValueToExtUses->contains(V);
16129 });
16130 auto It = ExtractsCount.find(Entry);
16131 if (It != ExtractsCount.end()) {
16132 assert(ScalarUsesCount >= It->getSecond().size() &&
16133 "Expected total number of external uses not less than "
16134 "number of scalar uses.");
16135 ScalarUsesCount -= It->getSecond().size();
16136 }
16137 // Keep the original scalar if the number of externally used instructions in
16138 // the same entry is not a power of 2. It may help to do some extra
16139 // vectorization for now.
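// Illustrative example (assuming the cost conditions above hold): if 3 of 4
// lanes of this entry have external uses, ScalarUsesCount == 3 is not a power
// of 2, so the original scalar is kept; with exactly 2 or 4 such uses the
// extractelement is preferred instead.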
16140 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16141 }
16142 if (KeepScalar) {
16143 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16144 for (Value *V : Inst->operands()) {
16145 auto It = ValueToExtUses->find(V);
16146 if (It != ValueToExtUses->end()) {
16147 // Replace all uses to avoid compiler crash.
16148 ExternalUses[It->second].User = nullptr;
16149 }
16150 }
16151 ExtraCost = ScalarCost;
16152 if (!IsPhiInLoop(EU))
16153 ExtractsCount[Entry].insert(Inst);
16154 if (CanBeUsedAsScalarCast) {
16155 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16156 // Update the users of the operands of the cast operand to avoid
16157 // compiler crash.
16158 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16159 for (Value *V : IOp->operands()) {
16160 auto It = ValueToExtUses->find(V);
16161 if (It != ValueToExtUses->end()) {
16162 // Replace all uses to avoid compiler crash.
16163 ExternalUses[It->second].User = nullptr;
16164 }
16165 }
16166 }
16167 }
16168 }
16169 }
16170 }
16171
16172 ExtractCost += ExtraCost;
16173 }
16174 // Insert externals for extract of operands of casts to be emitted as scalars
16175 // instead of extractelement.
16176 for (Value *V : ScalarOpsFromCasts) {
16177 ExternalUsesAsOriginalScalar.insert(V);
16178 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16179 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16180 TEs.front()->findLaneForValue(V));
16181 }
16182 }
16183 // Add reduced value cost, if resized.
16184 if (!VectorizedVals.empty()) {
16185 const TreeEntry &Root = *VectorizableTree.front();
16186 auto BWIt = MinBWs.find(&Root);
16187 if (BWIt != MinBWs.end()) {
16188 Type *DstTy = Root.Scalars.front()->getType();
16189 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16190 unsigned SrcSz =
16191 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16192 if (OriginalSz != SrcSz) {
16193 unsigned Opcode = Instruction::Trunc;
16194 if (OriginalSz > SrcSz)
16195 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16196 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16197 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16198 assert(SLPReVec && "Only supported by REVEC.");
16199 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16200 }
16201 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16204 }
16205 }
16206 }
16207
16208 Cost += ExtractCost;
16209 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16210 bool ForSingleMask) {
16211 InstructionCost C = 0;
16212 unsigned VF = Mask.size();
16213 unsigned VecVF = TE->getVectorFactor();
16214 bool HasLargeIndex =
16215 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16216 if ((VF != VecVF && HasLargeIndex) ||
16218
16219 if (HasLargeIndex) {
16220 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16221 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16222 OrigMask.begin());
16224 getWidenedType(TE->getMainOp()->getType(), VecVF),
16225 OrigMask);
16226 LLVM_DEBUG(
16227 dbgs() << "SLP: Adding cost " << C
16228 << " for final shuffle of insertelement external users.\n";
16229 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16230 Cost += C;
16231 return std::make_pair(TE, true);
16232 }
16233
16234 if (!ForSingleMask) {
16235 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16236 for (unsigned I = 0; I < VF; ++I) {
16237 if (Mask[I] != PoisonMaskElem)
16238 ResizeMask[Mask[I]] = Mask[I];
16239 }
16240 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16243 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16244 LLVM_DEBUG(
16245 dbgs() << "SLP: Adding cost " << C
16246 << " for final shuffle of insertelement external users.\n";
16247 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16248
16249 Cost += C;
16250 }
16251 }
16252 return std::make_pair(TE, false);
16253 };
16254 // Calculate the cost of the reshuffled vectors, if any.
16255 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16256 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16257 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16258 unsigned VF = 0;
16259 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16260 ArrayRef<const TreeEntry *> TEs) {
16261 assert((TEs.size() == 1 || TEs.size() == 2) &&
16262 "Expected exactly 1 or 2 tree entries.");
16263 if (TEs.size() == 1) {
16264 if (VF == 0)
16265 VF = TEs.front()->getVectorFactor();
16266 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16267 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16268 !all_of(enumerate(Mask), [=](const auto &Data) {
16269 return Data.value() == PoisonMaskElem ||
16270 (Data.index() < VF &&
16271 static_cast<int>(Data.index()) == Data.value());
16272 })) {
16275 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16276 << " for final shuffle of insertelement "
16277 "external users.\n";
16278 TEs.front()->dump();
16279 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16280 Cost += C;
16281 }
16282 } else {
16283 if (VF == 0) {
16284 if (TEs.front() &&
16285 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16286 VF = TEs.front()->getVectorFactor();
16287 else
16288 VF = Mask.size();
16289 }
16290 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16293 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16294 << " for final shuffle of vector node and external "
16295 "insertelement users.\n";
16296 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16297 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16298 Cost += C;
16299 }
16300 VF = Mask.size();
16301 return TEs.back();
16302 };
16303 (void)performExtractsShuffleAction<const TreeEntry>(
16304 MutableArrayRef(Vector.data(), Vector.size()), Base,
16305 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16306 EstimateShufflesCost);
16307 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16308 cast<FixedVectorType>(
16309 ShuffledInserts[I].InsertElements.front()->getType()),
16310 DemandedElts[I],
16311 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16312 Cost -= InsertCost;
16313 }
16314
16315 // Add the cost for reduced value resize (if required).
16316 if (ReductionBitWidth != 0) {
16317 assert(UserIgnoreList && "Expected reduction tree.");
16318 const TreeEntry &E = *VectorizableTree.front();
16319 auto It = MinBWs.find(&E);
16320 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16321 unsigned SrcSize = It->second.first;
16322 unsigned DstSize = ReductionBitWidth;
16323 unsigned Opcode = Instruction::Trunc;
16324 if (SrcSize < DstSize) {
16325 bool IsArithmeticExtendedReduction =
16326 all_of(*UserIgnoreList, [](Value *V) {
16327 auto *I = cast<Instruction>(V);
16328 return is_contained({Instruction::Add, Instruction::FAdd,
16329 Instruction::Mul, Instruction::FMul,
16330 Instruction::And, Instruction::Or,
16331 Instruction::Xor},
16332 I->getOpcode());
16333 });
16334 if (IsArithmeticExtendedReduction)
16335 Opcode =
16336 Instruction::BitCast; // Handle it by getExtendedReductionCost
16337 else
16338 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16339 }
16340 if (Opcode != Instruction::BitCast) {
16341 auto *SrcVecTy =
16342 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16343 auto *DstVecTy =
16344 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16345 TTI::CastContextHint CCH = getCastContextHint(E);
16346 InstructionCost CastCost;
16347 switch (E.getOpcode()) {
16348 case Instruction::SExt:
16349 case Instruction::ZExt:
16350 case Instruction::Trunc: {
16351 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16352 CCH = getCastContextHint(*OpTE);
16353 break;
16354 }
16355 default:
16356 break;
16357 }
16358 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16360 Cost += CastCost;
16361 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16362 << " for final resize for reduction from " << SrcVecTy
16363 << " to " << DstVecTy << "\n";
16364 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16365 }
16366 }
16367 }
16368
16369 std::optional<InstructionCost> SpillCost;
16370 if (Cost < -SLPCostThreshold) {
16371 SpillCost = getSpillCost();
16372 Cost += *SpillCost;
16373 }
16374#ifndef NDEBUG
16375 SmallString<256> Str;
16376 {
16377 raw_svector_ostream OS(Str);
16378 OS << "SLP: Spill Cost = ";
16379 if (SpillCost)
16380 OS << *SpillCost;
16381 else
16382 OS << "<skipped>";
16383 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16384 << "SLP: Total Cost = " << Cost << ".\n";
16385 }
16386 LLVM_DEBUG(dbgs() << Str);
16387 if (ViewSLPTree)
16388 ViewGraph(this, "SLP" + F->getName(), false, Str);
16389#endif
16390
16391 return Cost;
16392}
16393
16394/// Tries to find extractelement instructions with constant indices from a
16395/// fixed vector type and gather such instructions into a group, which is
16396/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
16397/// attempt was successful, the matched scalars are replaced by poison values
16398/// in \p VL for future analysis.
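/// Illustrative example: for VL = {extract %v[0], %x, extract %v[2], %y} the
/// matched extracts are replaced by poison in \p VL and the resulting mask is
/// roughly {0, poison, 2, poison}, describing a single-source shuffle of %v.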
16399std::optional<TTI::ShuffleKind>
16400BoUpSLP::tryToGatherSingleRegisterExtractElements(
16401 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16402 // Scan list of gathered scalars for extractelements that can be represented
16403 // as shuffles.
16404 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16405 SmallVector<int> UndefVectorExtracts;
16406 for (int I = 0, E = VL.size(); I < E; ++I) {
16407 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16408 if (!EI) {
16409 if (isa<UndefValue>(VL[I]))
16410 UndefVectorExtracts.push_back(I);
16411 continue;
16412 }
16413 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16414 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16415 continue;
16416 std::optional<unsigned> Idx = getExtractIndex(EI);
16417 // Undefined index.
16418 if (!Idx) {
16419 UndefVectorExtracts.push_back(I);
16420 continue;
16421 }
16422 if (Idx >= VecTy->getNumElements()) {
16423 UndefVectorExtracts.push_back(I);
16424 continue;
16425 }
16426 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16427 ExtractMask.reset(*Idx);
16428 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16429 UndefVectorExtracts.push_back(I);
16430 continue;
16431 }
16432 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16433 }
16434 // Sort the vector operands by the maximum number of uses in extractelements.
16435 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16436 VectorOpToIdx.takeVector();
16437 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16438 return P1.second.size() > P2.second.size();
16439 });
16440 // Find the best pair of the vectors or a single vector.
16441 const int UndefSz = UndefVectorExtracts.size();
16442 unsigned SingleMax = 0;
16443 unsigned PairMax = 0;
16444 if (!Vectors.empty()) {
16445 SingleMax = Vectors.front().second.size() + UndefSz;
16446 if (Vectors.size() > 1) {
16447 auto *ItNext = std::next(Vectors.begin());
16448 PairMax = SingleMax + ItNext->second.size();
16449 }
16450 }
16451 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16452 return std::nullopt;
16453 // Check if better to perform a shuffle of 2 vectors or just of a single
16454 // vector.
16455 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16456 SmallVector<Value *> GatheredExtracts(
16457 VL.size(), PoisonValue::get(VL.front()->getType()));
16458 if (SingleMax >= PairMax && SingleMax) {
16459 for (int Idx : Vectors.front().second)
16460 std::swap(GatheredExtracts[Idx], VL[Idx]);
16461 } else if (!Vectors.empty()) {
16462 for (unsigned Idx : {0, 1})
16463 for (int Idx : Vectors[Idx].second)
16464 std::swap(GatheredExtracts[Idx], VL[Idx]);
16465 }
16466 // Add extracts from undefs too.
16467 for (int Idx : UndefVectorExtracts)
16468 std::swap(GatheredExtracts[Idx], VL[Idx]);
16469 // Check that gather of extractelements can be represented as just a
16470 // shuffle of a single/two vectors the scalars are extracted from.
16471 std::optional<TTI::ShuffleKind> Res =
16472 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16473 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16474 // TODO: try to check other subsets if possible.
16475 // Restore the original VL if attempt was not successful.
16476 copy(SavedVL, VL.begin());
16477 return std::nullopt;
16478 }
16479 // Restore unused scalars from mask, if some of the extractelements were not
16480 // selected for shuffle.
16481 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16482 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16483 isa<UndefValue>(GatheredExtracts[I])) {
16484 std::swap(VL[I], GatheredExtracts[I]);
16485 continue;
16486 }
16487 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16488 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16489 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16490 is_contained(UndefVectorExtracts, I))
16491 continue;
16492 }
16493 return Res;
16494}
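// Illustrative sketch (standard C++ only, not part of the pass): the single-
// versus two-vector choice made above. Per source vector we count how many
// gather lanes its extracts cover; extracts from undef count towards every
// candidate. The function name and inputs below are hypothetical.
#include <algorithm>
#include <cstddef>
#include <vector>
static bool preferSingleSourceSketch(std::vector<std::size_t> UsesPerVector,
                                     std::size_t UndefUses) {
  std::sort(UsesPerVector.rbegin(), UsesPerVector.rend());
  std::size_t SingleMax =
      UsesPerVector.empty() ? 0 : UsesPerVector.front() + UndefUses;
  std::size_t PairMax =
      UsesPerVector.size() > 1 ? SingleMax + UsesPerVector[1] : 0;
  // Mirrors "SingleMax >= PairMax && SingleMax": keep a single source when it
  // alone covers at least as many gather lanes as the best pair would.
  return SingleMax >= PairMax && SingleMax != 0;
}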
16495
16496/// Tries to find extractelement instructions with constant indices from a
16497/// fixed vector type and gathers such instructions into a group, which can
16498/// likely be recognized as a shuffle of 1 or 2 input vectors. If this attempt
16499/// was successful, the matched scalars are replaced by poison values in \p VL
16500/// for future analysis.
16501SmallVector<std::optional<TTI::ShuffleKind>>
16502BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16503 SmallVectorImpl<int> &Mask,
16504 unsigned NumParts) const {
16505 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16506 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16507 Mask.assign(VL.size(), PoisonMaskElem);
16508 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16509 for (unsigned Part : seq<unsigned>(NumParts)) {
16510 // Scan list of gathered scalars for extractelements that can be represented
16511 // as shuffles.
16512 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16513 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16514 SmallVector<int> SubMask;
16515 std::optional<TTI::ShuffleKind> Res =
16516 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16517 ShufflesRes[Part] = Res;
16518 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16519 }
16520 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16521 return Res.has_value();
16522 }))
16523 ShufflesRes.clear();
16524 return ShufflesRes;
16525}
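// Illustrative sketch (standard C++, hypothetical names): the per-part loop
// above walks VL in register-sized slices and stitches each per-part sub-mask
// back into the full mask at offset Part * SliceSize. This sketch only shows
// the slice/stitch pattern with a plain ceiling split; the real
// getPartNumElems/getNumElems helpers may choose slice sizes differently.
#include <algorithm>
#include <cstddef>
#include <vector>
static std::vector<std::vector<int>>
splitIntoPartsSketch(std::size_t Size, std::size_t NumParts) {
  std::size_t SliceSize = (Size + NumParts - 1) / NumParts; // plain ceil split
  std::vector<std::vector<int>> Parts(NumParts);
  for (std::size_t Part = 0; Part < NumParts; ++Part) {
    std::size_t Begin = Part * SliceSize;
    std::size_t End = std::min(Size, Begin + SliceSize);
    for (std::size_t I = Begin; I < End; ++I)
      Parts[Part].push_back(static_cast<int>(I)); // indices of this slice
  }
  return Parts;
}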
16526
16527std::optional<TargetTransformInfo::ShuffleKind>
16528BoUpSLP::isGatherShuffledSingleRegisterEntry(
16529 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16530 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16531 Entries.clear();
16532 // TODO: currently checking only for Scalars in the tree entry, need to count
16533 // reused elements too for better cost estimation.
16534 auto GetUserEntry = [&](const TreeEntry *TE) {
16535 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16536 TE = TE->UserTreeIndex.UserTE;
16537 if (TE == VectorizableTree.front().get())
16538 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16539 return TE->UserTreeIndex;
16540 };
16541 auto HasGatherUser = [&](const TreeEntry *TE) {
16542 while (TE->Idx != 0 && TE->UserTreeIndex) {
16543 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16544 return true;
16545 TE = TE->UserTreeIndex.UserTE;
16546 }
16547 return false;
16548 };
16549 const EdgeInfo TEUseEI = GetUserEntry(TE);
16550 if (!TEUseEI)
16551 return std::nullopt;
16552 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16553 const BasicBlock *TEInsertBlock = nullptr;
16554 // Main node of PHI entries keeps the correct order of operands/incoming
16555 // blocks.
16556 if (auto *PHI = dyn_cast_or_null<PHINode>(
16557 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16558 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16559 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16560 TEInsertPt = TEInsertBlock->getTerminator();
16561 } else {
16562 TEInsertBlock = TEInsertPt->getParent();
16563 }
16564 if (!DT->isReachableFromEntry(TEInsertBlock))
16565 return std::nullopt;
16566 auto *NodeUI = DT->getNode(TEInsertBlock);
16567 assert(NodeUI && "Should only process reachable instructions");
16568 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16569 auto CheckOrdering = [&](const Instruction *InsertPt) {
16570 // Argument InsertPt is an instruction where vector code for some other
16571 // tree entry (one that shares one or more scalars with TE) is going to be
16572 // generated. This lambda returns true if insertion point of vector code
16573 // for the TE dominates that point (otherwise dependency is the other way
16574 // around). The other node is not limited to be of a gather kind. Gather
16575 // nodes are not scheduled and their vector code is inserted before their
16576 // first user. If user is PHI, that is supposed to be at the end of a
16577 // predecessor block. Otherwise it is the last instruction among scalars of
16578 // the user node. So, instead of checking dependency between instructions
16579 // themselves, we check dependency between their insertion points for vector
16580 // code (since each scalar instruction ends up as a lane of a vector
16581 // instruction).
16582 const BasicBlock *InsertBlock = InsertPt->getParent();
16583 auto *NodeEUI = DT->getNode(InsertBlock);
16584 if (!NodeEUI)
16585 return false;
16586 assert((NodeUI == NodeEUI) ==
16587 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16588 "Different nodes should have different DFS numbers");
16589 // Check the order of the gather nodes users.
16590 if (TEInsertPt->getParent() != InsertBlock &&
16591 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16592 return false;
16593 if (TEInsertPt->getParent() == InsertBlock &&
16594 TEInsertPt->comesBefore(InsertPt))
16595 return false;
16596 return true;
16597 };
16598 // Find all tree entries used by the gathered values. If no common entries
16599 // found - not a shuffle.
16600 // Here we build a set of tree nodes for each gathered value and try to
16601 // find the intersection between these sets. If we have at least one common
16602 // tree node for each gathered value - we have just a permutation of the
16603 // single vector. If we have 2 different sets, we're in a situation where we
16604 // have a permutation of 2 input vectors.
16605 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16606 SmallDenseMap<Value *, int> UsedValuesEntry;
16607 SmallPtrSet<const Value *, 16> VisitedValue;
16608 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16609 // The node is reused - exit.
16610 if ((TEPtr->getVectorFactor() != VL.size() &&
16611 TEPtr->Scalars.size() != VL.size()) ||
16612 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16613 return false;
16614 UsedTEs.clear();
16615 UsedTEs.emplace_back().insert(TEPtr);
16616 for (Value *V : VL) {
16617 if (isConstant(V))
16618 continue;
16619 UsedValuesEntry.try_emplace(V, 0);
16620 }
16621 return true;
16622 };
16623 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16624 unsigned EdgeIdx) {
16625 const TreeEntry *Ptr1 = User1;
16626 const TreeEntry *Ptr2 = User2;
16627 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16628 while (Ptr2) {
16629 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16630 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16631 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16632 }
16633 while (Ptr1) {
16634 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16635 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16636 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16637 return Idx < It->second;
16638 }
16639 return false;
16640 };
16641 for (Value *V : VL) {
16642 if (isConstant(V) || !VisitedValue.insert(V).second)
16643 continue;
16644 // Build a list of tree entries where V is used.
16645 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16646 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16647 if (TEPtr == TE || TEPtr->Idx == 0)
16648 continue;
16649 assert(any_of(TEPtr->Scalars,
16650 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16651 "Must contain at least single gathered value.");
16652 assert(TEPtr->UserTreeIndex &&
16653 "Expected only single user of a gather node.");
16654 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16655
16656 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16657 UseEI.UserTE->hasState())
16658 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16659 : nullptr;
16660 Instruction *InsertPt =
16661 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16662 : &getLastInstructionInBundle(UseEI.UserTE);
16663 if (TEInsertPt == InsertPt) {
16664 // Check nodes, which might be emitted first.
16665 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16666 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16667 TEUseEI.UserTE->isAltShuffle()) &&
16668 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16669 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16670 (UseEI.UserTE->hasState() &&
16671 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16672 !UseEI.UserTE->isAltShuffle()) ||
16673 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16674 continue;
16675 }
16676
16677 // If the schedulable insertion point is used in multiple entries - just
16678 // exit, no known ordering at this point, available only after real
16679 // scheduling.
16680 if (!doesNotNeedToBeScheduled(InsertPt) &&
16681 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16682 continue;
16683 // If the users are the PHI nodes with the same incoming blocks - skip.
16684 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16685 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16686 UseEI.UserTE->State == TreeEntry::Vectorize &&
16687 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16688 TEUseEI.UserTE != UseEI.UserTE)
16689 continue;
16690 // If 2 gathers are operands of the same entry (regardless of whether
16691 // user is PHI or else), compare operands indices, use the earlier one
16692 // as the base.
16693 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16694 continue;
16695 // If the user instruction is used for some reason in different
16696 // vectorized nodes - make it depend on index.
16697 if (TEUseEI.UserTE != UseEI.UserTE &&
16698 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16699 HasGatherUser(TEUseEI.UserTE)))
16700 continue;
16701 // If the user node is the operand of the other user node - skip.
16702 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16703 continue;
16704 }
16705
16706 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16707 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16708 UseEI.UserTE->doesNotNeedToSchedule() &&
16709 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16710 continue;
16711 // Check if the user node of the TE comes after user node of TEPtr,
16712 // otherwise TEPtr depends on TE.
16713 if ((TEInsertBlock != InsertPt->getParent() ||
16714 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16715 !CheckOrdering(InsertPt))
16716 continue;
16717 // The node is reused - exit.
16718 if (CheckAndUseSameNode(TEPtr))
16719 break;
16720 VToTEs.insert(TEPtr);
16721 }
16722 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16723 const auto *It = find_if(
16724 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16725 if (It != VTEs.end()) {
16726 const TreeEntry *VTE = *It;
16727 if (none_of(TE->CombinedEntriesWithIndices,
16728 [&](const auto &P) { return P.first == VTE->Idx; })) {
16729 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16730 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16731 continue;
16732 }
16733 // The node is reused - exit.
16734 if (CheckAndUseSameNode(VTE))
16735 break;
16736 VToTEs.insert(VTE);
16737 }
16738 }
16739 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16740 const TreeEntry *VTE = VTEs.front();
16741 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16742 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16743 VTEs = VTEs.drop_front();
16744 // Iterate through all vectorized nodes.
16745 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16746 return MTE->State == TreeEntry::Vectorize;
16747 });
16748 if (MIt == VTEs.end())
16749 continue;
16750 VTE = *MIt;
16751 }
16752 if (none_of(TE->CombinedEntriesWithIndices,
16753 [&](const auto &P) { return P.first == VTE->Idx; })) {
16754 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16755 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16756 continue;
16757 }
16758 // The node is reused - exit.
16759 if (CheckAndUseSameNode(VTE))
16760 break;
16761 VToTEs.insert(VTE);
16762 }
16763 if (VToTEs.empty())
16764 continue;
16765 if (UsedTEs.empty()) {
16766 // The first iteration, just insert the list of nodes to vector.
16767 UsedTEs.push_back(VToTEs);
16768 UsedValuesEntry.try_emplace(V, 0);
16769 } else {
16770 // Need to check if there are any previously used tree nodes which use V.
16771 // If there are no such nodes, consider that we have another input
16772 // vector.
16773 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16774 unsigned Idx = 0;
16775 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16776 // Do we have a non-empty intersection of previously listed tree entries
16777 // and tree entries using current V?
16778 set_intersect(VToTEs, Set);
16779 if (!VToTEs.empty()) {
16780 // Yes, write the new subset and continue analysis for the next
16781 // scalar.
16782 Set.swap(VToTEs);
16783 break;
16784 }
16785 VToTEs = SavedVToTEs;
16786 ++Idx;
16787 }
16788 // No non-empty intersection found - need to add a second set of possible
16789 // source vectors.
16790 if (Idx == UsedTEs.size()) {
16791 // If the number of input vectors is greater than 2 - not a permutation,
16792 // fallback to the regular gather.
16793 // TODO: support multiple reshuffled nodes.
16794 if (UsedTEs.size() == 2)
16795 continue;
16796 UsedTEs.push_back(SavedVToTEs);
16797 Idx = UsedTEs.size() - 1;
16798 }
16799 UsedValuesEntry.try_emplace(V, Idx);
16800 }
16801 }
16802
16803 if (UsedTEs.empty()) {
16804 Entries.clear();
16805 return std::nullopt;
16806 }
16807
16808 unsigned VF = 0;
16809 if (UsedTEs.size() == 1) {
16810 // Keep the order to avoid non-determinism.
16811 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
16812 UsedTEs.front().end());
16813 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16814 return TE1->Idx < TE2->Idx;
16815 });
16816 // Try to find the perfect match in another gather node at first.
16817 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
16818 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
16819 });
16820 if (It != FirstEntries.end() &&
16821 ((*It)->getVectorFactor() == VL.size() ||
16822 ((*It)->getVectorFactor() == TE->Scalars.size() &&
16823 TE->ReuseShuffleIndices.size() == VL.size() &&
16824 (*It)->isSame(TE->Scalars)))) {
16825 Entries.push_back(*It);
16826 if ((*It)->getVectorFactor() == VL.size()) {
16827 std::iota(std::next(Mask.begin(), Part * VL.size()),
16828 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
16829 } else {
16830 SmallVector<int> CommonMask = TE->getCommonMask();
16831 copy(CommonMask, Mask.begin());
16832 }
16833 // Clear undef scalars.
16834 for (unsigned I : seq<unsigned>(VL.size()))
16835 if (isa<PoisonValue>(VL[I]))
16836 Mask[Part * VL.size() + I] = PoisonMaskElem;
16837 return TargetTransformInfo::SK_PermuteSingleSrc;
16838 }
16839 // No perfect match, just shuffle, so choose the first tree node from the
16840 // tree.
16841 Entries.push_back(FirstEntries.front());
16842 // Update mapping between values and corresponding tree entries.
16843 for (auto &P : UsedValuesEntry)
16844 P.second = 0;
16845 VF = FirstEntries.front()->getVectorFactor();
16846 } else {
16847 // Try to find nodes with the same vector factor.
16848 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
16849 // Keep the order of tree nodes to avoid non-determinism.
16850 DenseMap<unsigned, const TreeEntry *> VFToTE;
16851 for (const TreeEntry *TE : UsedTEs.front()) {
16852 unsigned VF = TE->getVectorFactor();
16853 auto It = VFToTE.find(VF);
16854 if (It != VFToTE.end()) {
16855 if (It->second->Idx > TE->Idx)
16856 It->getSecond() = TE;
16857 continue;
16858 }
16859 VFToTE.try_emplace(VF, TE);
16860 }
16861 // Same, keep the order to avoid non-determinism.
16862 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
16863 UsedTEs.back().end());
16864 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
16865 return TE1->Idx < TE2->Idx;
16866 });
16867 for (const TreeEntry *TE : SecondEntries) {
16868 auto It = VFToTE.find(TE->getVectorFactor());
16869 if (It != VFToTE.end()) {
16870 VF = It->first;
16871 Entries.push_back(It->second);
16872 Entries.push_back(TE);
16873 break;
16874 }
16875 }
16876 // No 2 source vectors with the same vector factor - just choose 2 with max
16877 // index.
16878 if (Entries.empty()) {
16879 Entries.push_back(*llvm::max_element(
16880 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
16881 return TE1->Idx < TE2->Idx;
16882 }));
16883 Entries.push_back(SecondEntries.front());
16884 VF = std::max(Entries.front()->getVectorFactor(),
16885 Entries.back()->getVectorFactor());
16886 } else {
16887 VF = Entries.front()->getVectorFactor();
16888 }
16889 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
16890 for (const TreeEntry *E : Entries)
16891 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
16892 E->Scalars.end());
16893 // Update mapping between values and corresponding tree entries.
16894 for (auto &P : UsedValuesEntry) {
16895 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
16896 if (ValuesToEntries[Idx].contains(P.first)) {
16897 P.second = Idx;
16898 break;
16899 }
16900 }
16901 }
16902
16903 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
16904 // Checks if the 2 PHIs are compatible in terms of high possibility to be
16905 // vectorized.
16906 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
16907 auto *PHI = cast<PHINode>(V);
16908 auto *PHI1 = cast<PHINode>(V1);
16909 // Check that all incoming values are compatible/from same parent (if they
16910 // are instructions).
16911 // The incoming values are compatible if they all are constants, or
16912 // instruction with the same/alternate opcodes from the same basic block.
16913 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
16914 Value *In = PHI->getIncomingValue(I);
16915 Value *In1 = PHI1->getIncomingValue(I);
16916 if (isConstant(In) && isConstant(In1))
16917 continue;
16918 if (!getSameOpcode({In, In1}, *TLI))
16919 return false;
16920 if (cast<Instruction>(In)->getParent() !=
16921 cast<Instruction>(In1)->getParent())
16922 return false;
16923 }
16924 return true;
16925 };
16926 // Check if the value can be ignored during analysis for shuffled gathers.
16927 // We assume it is better to ignore instructions which do not form splats,
16928 // are not vectorized/not extractelements (these instructions will be handled
16929 // by the extractelements processing) or may form a vector node in the future.
16930 auto MightBeIgnored = [=](Value *V) {
16931 auto *I = dyn_cast<Instruction>(V);
16932 return I && !IsSplatOrUndefs && !isVectorized(I) &&
16933 !isVectorLikeInstWithConstOps(I) &&
16934 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
16935 };
16936 // Check that the neighbor instruction may form a full vector node with the
16937 // current instruction V. It is possible, if they have same/alternate opcode
16938 // and same parent basic block.
16939 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
16940 Value *V1 = VL[Idx];
16941 bool UsedInSameVTE = false;
16942 auto It = UsedValuesEntry.find(V1);
16943 if (It != UsedValuesEntry.end())
16944 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
16945 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
16946 getSameOpcode({V, V1}, *TLI) &&
16947 cast<Instruction>(V)->getParent() ==
16948 cast<Instruction>(V1)->getParent() &&
16949 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
16950 };
16951 // Build a shuffle mask for better cost estimation and vector emission.
16952 SmallBitVector UsedIdxs(Entries.size());
16953 SmallVector<std::pair<unsigned, int>> EntryLanes;
16954 for (int I = 0, E = VL.size(); I < E; ++I) {
16955 Value *V = VL[I];
16956 auto It = UsedValuesEntry.find(V);
16957 if (It == UsedValuesEntry.end())
16958 continue;
16959 // Do not try to shuffle scalars, if they are constants, or instructions
16960 // that can be vectorized as a result of the following vector build
16961 // vectorization.
16962 if (isConstant(V) || (MightBeIgnored(V) &&
16963 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
16964 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
16965 continue;
16966 unsigned Idx = It->second;
16967 EntryLanes.emplace_back(Idx, I);
16968 UsedIdxs.set(Idx);
16969 }
16970 // Iterate through all shuffled scalars and select entries, which can be used
16971 // for final shuffle.
16972 SmallVector<const TreeEntry *> TempEntries;
16973 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
16974 if (!UsedIdxs.test(I))
16975 continue;
16976 // Fix the entry number for the given scalar. If it is the first entry, set
16977 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
16978 // These indices are used when calculating final shuffle mask as the vector
16979 // offset.
16980 for (std::pair<unsigned, int> &Pair : EntryLanes)
16981 if (Pair.first == I)
16982 Pair.first = TempEntries.size();
16983 TempEntries.push_back(Entries[I]);
16984 }
16985 Entries.swap(TempEntries);
16986 if (EntryLanes.size() == Entries.size() &&
16987 !VL.equals(ArrayRef(TE->Scalars)
16988 .slice(Part * VL.size(),
16989 std::min<int>(VL.size(), TE->Scalars.size())))) {
16990 // We may have here 1 or 2 entries only. If the number of scalars is equal
16991 // to the number of entries, no need to do the analysis, it is not very
16992 // profitable. Since VL is not the same as TE->Scalars, it means we already
16993 // have some shuffles before. Cut off not profitable case.
16994 Entries.clear();
16995 return std::nullopt;
16996 }
16997 // Build the final mask, check for the identity shuffle, if possible.
16998 bool IsIdentity = Entries.size() == 1;
16999 // Pair.first is the offset to the vector, while Pair.second is the index of
17000 // scalar in the list.
17001 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17002 unsigned Idx = Part * VL.size() + Pair.second;
17003 Mask[Idx] =
17004 Pair.first * VF +
17005 (ForOrder ? std::distance(
17006 Entries[Pair.first]->Scalars.begin(),
17007 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17008 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17009 IsIdentity &= Mask[Idx] == Pair.second;
17010 }
17011 if (ForOrder || IsIdentity || Entries.empty()) {
17012 switch (Entries.size()) {
17013 case 1:
17014 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17015 return TargetTransformInfo::SK_PermuteSingleSrc;
17016 break;
17017 case 2:
17018 if (EntryLanes.size() > 2 || VL.size() <= 2)
17019 return TargetTransformInfo::SK_PermuteTwoSrc;
17020 break;
17021 default:
17022 break;
17023 }
17024 } else if (!isa<VectorType>(VL.front()->getType()) &&
17025 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17026 // Do the cost estimation to check if the shuffle is more beneficial than a buildvector.
17027 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17028 std::next(Mask.begin(), (Part + 1) * VL.size()));
17029 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17030 for (int Idx : SubMask) {
17031 if (Idx == PoisonMaskElem)
17032 continue;
17033 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17034 MinElement = Idx;
17035 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17036 MaxElement = Idx;
17037 }
17038 assert(MaxElement >= 0 && MinElement >= 0 &&
17039 MaxElement % VF >= MinElement % VF &&
17040 "Expected at least single element.");
17041 unsigned NewVF = std::max<unsigned>(
17042 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17043 (MaxElement % VF) -
17044 (MinElement % VF) + 1));
17045 if (NewVF < VF) {
17046 for (int &Idx : SubMask) {
17047 if (Idx == PoisonMaskElem)
17048 continue;
17049 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17050 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17051 }
17052 } else {
17053 NewVF = VF;
17054 }
17055
17056 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17057 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17058 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17059 auto GetShuffleCost = [&,
17060 &TTI = *TTI](ArrayRef<int> Mask,
17061 ArrayRef<const TreeEntry *> Entries,
17062 VectorType *VecTy) -> InstructionCost {
17063 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17064 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17065 Mask, Entries.front()->getInterleaveFactor()))
17066 return TTI::TCC_Free;
17067 return ::getShuffleCost(TTI,
17068 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17069 : TTI::SK_PermuteSingleSrc,
17070 VecTy, Mask, CostKind);
17071 };
17072 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17073 InstructionCost FirstShuffleCost = 0;
17074 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17075 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17076 FirstShuffleCost = ShuffleCost;
17077 } else {
17078 // Transform mask to include only first entry.
17079 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17080 bool IsIdentity = true;
17081 for (auto [I, Idx] : enumerate(FirstMask)) {
17082 if (Idx >= static_cast<int>(NewVF)) {
17083 Idx = PoisonMaskElem;
17084 } else {
17085 DemandedElts.clearBit(I);
17086 if (Idx != PoisonMaskElem)
17087 IsIdentity &= static_cast<int>(I) == Idx;
17088 }
17089 }
17090 if (!IsIdentity)
17091 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17092 FirstShuffleCost += getScalarizationOverhead(
17093 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17094 /*Extract=*/false, CostKind);
17095 }
17096 InstructionCost SecondShuffleCost = 0;
17097 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17098 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17099 SecondShuffleCost = ShuffleCost;
17100 } else {
17101 // Transform mask to include only the second entry.
17102 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17103 bool IsIdentity = true;
17104 for (auto [I, Idx] : enumerate(SecondMask)) {
17105 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17106 Idx = PoisonMaskElem;
17107 } else {
17108 DemandedElts.clearBit(I);
17109 if (Idx != PoisonMaskElem) {
17110 Idx -= NewVF;
17111 IsIdentity &= static_cast<int>(I) == Idx;
17112 }
17113 }
17114 }
17115 if (!IsIdentity)
17116 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17117 SecondShuffleCost += getScalarizationOverhead(
17118 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17119 /*Extract=*/false, CostKind);
17120 }
17121 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17122 for (auto [I, Idx] : enumerate(SubMask))
17123 if (Idx == PoisonMaskElem)
17124 DemandedElts.clearBit(I);
17125 InstructionCost BuildVectorCost = getScalarizationOverhead(
17126 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17127 /*Extract=*/false, CostKind);
17128 const TreeEntry *BestEntry = nullptr;
17129 if (FirstShuffleCost < ShuffleCost) {
17130 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17131 std::next(Mask.begin(), (Part + 1) * VL.size()),
17132 [&](int &Idx) {
17133 if (Idx >= static_cast<int>(VF))
17134 Idx = PoisonMaskElem;
17135 });
17136 BestEntry = Entries.front();
17137 ShuffleCost = FirstShuffleCost;
17138 }
17139 if (SecondShuffleCost < ShuffleCost) {
17140 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17141 std::next(Mask.begin(), (Part + 1) * VL.size()),
17142 [&](int &Idx) {
17143 if (Idx < static_cast<int>(VF))
17144 Idx = PoisonMaskElem;
17145 else
17146 Idx -= VF;
17147 });
17148 BestEntry = Entries[1];
17149 ShuffleCost = SecondShuffleCost;
17150 }
17151 if (BuildVectorCost >= ShuffleCost) {
17152 if (BestEntry) {
17153 Entries.clear();
17154 Entries.push_back(BestEntry);
17155 }
17156 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17157 : TargetTransformInfo::SK_PermuteSingleSrc;
17158 }
17159 }
17160 Entries.clear();
17161 // Clear the corresponding mask elements.
17162 std::fill(std::next(Mask.begin(), Part * VL.size()),
17163 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17164 return std::nullopt;
17165}
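// Illustrative sketch (standard C++, not part of the pass): how the final mask
// entries above are encoded. Each selected lane stores "entry offset * VF +
// lane within that entry", so up to two source entries share one flat shuffle
// mask. The per-part offset (Part * VL.size()) is omitted here; names are
// hypothetical.
#include <cstddef>
#include <utility>
#include <vector>
static std::vector<int>
buildFinalMaskSketch(std::size_t VLSize, std::size_t VF,
                     const std::vector<std::pair<unsigned, int>> &EntryLanes,
                     const std::vector<int> &LaneInEntry) {
  std::vector<int> Mask(VLSize, -1); // -1 stands in for PoisonMaskElem
  for (std::size_t K = 0; K < EntryLanes.size(); ++K) {
    unsigned EntryIdx = EntryLanes[K].first; // which of the (at most 2) entries
    int Lane = EntryLanes[K].second;         // position of the scalar in VL
    Mask[Lane] = static_cast<int>(EntryIdx * VF) + LaneInEntry[K];
  }
  return Mask;
}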
17166
17167SmallVector<std::optional<TTI::ShuffleKind>>
17168BoUpSLP::isGatherShuffledEntry(
17169 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17170 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17171 bool ForOrder) {
17172 assert(NumParts > 0 && NumParts < VL.size() &&
17173 "Expected positive number of registers.");
17174 Entries.clear();
17175 // No need to check for the topmost gather node.
17176 if (TE == VectorizableTree.front().get() &&
17177 (!GatheredLoadsEntriesFirst.has_value() ||
17178 none_of(ArrayRef(VectorizableTree).drop_front(),
17179 [](const std::unique_ptr<TreeEntry> &TE) {
17180 return !TE->isGather();
17181 })))
17182 return {};
17183 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17184 // implemented yet.
17185 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17186 return {};
17187 Mask.assign(VL.size(), PoisonMaskElem);
17188 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17189 "Expected only single user of the gather node.");
17190 assert(VL.size() % NumParts == 0 &&
17191 "Number of scalars must be divisible by NumParts.");
17192 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17193 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17194 (TE->Idx == 0 ||
17195 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17196 isSplat(TE->Scalars) ||
17197 (TE->hasState() &&
17198 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17199 return {};
17200 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17201 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17202 for (unsigned Part : seq<unsigned>(NumParts)) {
17203 ArrayRef<Value *> SubVL =
17204 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17205 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17206 std::optional<TTI::ShuffleKind> SubRes =
17207 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17208 ForOrder);
17209 if (!SubRes)
17210 SubEntries.clear();
17211 Res.push_back(SubRes);
17212 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17213 SubEntries.front()->getVectorFactor() == VL.size() &&
17214 (SubEntries.front()->isSame(TE->Scalars) ||
17215 SubEntries.front()->isSame(VL))) {
17216 SmallVector<const TreeEntry *> LocalSubEntries;
17217 LocalSubEntries.swap(SubEntries);
17218 Entries.clear();
17219 Res.clear();
17220 std::iota(Mask.begin(), Mask.end(), 0);
17221 // Clear undef scalars.
17222 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17223 if (isa<PoisonValue>(VL[I]))
17224 Mask[I] = PoisonMaskElem;
17225 Entries.emplace_back(1, LocalSubEntries.front());
17226 Res.push_back(TTI::SK_PermuteSingleSrc);
17227 return Res;
17228 }
17229 }
17230 if (all_of(Res,
17231 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17232 Entries.clear();
17233 return {};
17234 }
17235 return Res;
17236}
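// Illustrative sketch (standard C++, not part of the pass): the early-out
// above that collapses all parts into a single identity permute when one tree
// entry already provides the whole vector; lanes holding poison scalars keep
// the poison marker (-1 here). Names are hypothetical.
#include <cstddef>
#include <vector>
static std::vector<int>
identityMaskWithPoisonSketch(const std::vector<bool> &IsPoisonLane) {
  std::vector<int> Mask(IsPoisonLane.size());
  for (std::size_t I = 0; I < Mask.size(); ++I)
    Mask[I] = IsPoisonLane[I] ? -1 : static_cast<int>(I);
  return Mask;
}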
17237
17238InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17239 Type *ScalarTy) const {
17240 const unsigned VF = VL.size();
17241 auto *VecTy = getWidenedType(ScalarTy, VF);
17242 // Find the cost of inserting/extracting values from the vector.
17243 // Check if the same elements are inserted several times and count them as
17244 // shuffle candidates.
17245 APInt DemandedElements = APInt::getZero(VF);
17246 InstructionCost Cost;
17247 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17248 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17249 DemandedElements.setBit(I);
17250 if (V->getType() != ScalarTy)
17251 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17252 TTI::CastContextHint::None, CostKind);
17253 };
17254 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17255 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17256 for (auto [I, V] : enumerate(VL)) {
17257 // No need to shuffle duplicates for constants.
17258 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17259 continue;
17260
17261 if (isConstant(V)) {
17262 ConstantShuffleMask[I] = I + VF;
17263 continue;
17264 }
17265 EstimateInsertCost(I, V);
17266 }
17267 // FIXME: add a cost for constant vector materialization.
17268 bool IsAnyNonUndefConst =
17269 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17270 // 1. Shuffle input source vector and constant vector.
17271 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17272 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
17273 ConstantShuffleMask);
17274 }
17275
17276 // 2. Insert unique non-constants.
17277 if (!DemandedElements.isZero())
17278 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17279 /*Insert=*/true,
17280 /*Extract=*/false, CostKind,
17281 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17282 return Cost;
17283}
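// Illustrative sketch (standard C++, not part of the pass): the lane
// classification used above. Non-undef constants are blended in via a
// two-source shuffle mask (lane I + VF), while unique non-constant scalars are
// marked as demanded insertelement lanes; the ForPoisonSrc special case and the
// extra cast cost for mismatched types are omitted. Names are hypothetical.
#include <cstddef>
#include <vector>
struct GatherLanesSketch {
  std::vector<int> ConstantShuffleMask; // I to keep a lane, I + VF for a const
  std::vector<bool> DemandedElements;   // lanes that need an insertelement
};
static GatherLanesSketch
classifyGatherLanesSketch(const std::vector<int> &Kind) {
  // Kind[I]: 0 = undef/poison, 1 = non-undef constant, 2 = non-constant scalar.
  const int VF = static_cast<int>(Kind.size());
  GatherLanesSketch R;
  R.DemandedElements.assign(Kind.size(), false);
  for (int I = 0; I < VF; ++I) {
    R.ConstantShuffleMask.push_back(I); // identity lane by default
    if (Kind[I] == 1)
      R.ConstantShuffleMask[I] = I + VF; // take the lane from the const vector
    else if (Kind[I] == 2)
      R.DemandedElements[I] = true; // pays for inserting this scalar
  }
  return R;
}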
17284
17285Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17286 auto It = EntryToLastInstruction.find(E);
17287 if (It != EntryToLastInstruction.end())
17288 return *cast<Instruction>(It->second);
17289 Instruction *Res = nullptr;
17290 // Get the basic block this bundle is in. All instructions in the bundle
17291 // should be in this block (except for extractelement-like instructions with
17292 // constant indices or gathered loads or copyables).
17293 Instruction *Front;
17294 unsigned Opcode;
17295 if (E->hasState()) {
17296 Front = E->getMainOp();
17297 Opcode = E->getOpcode();
17298 } else {
17299 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17300 Opcode = Front->getOpcode();
17301 }
17302 auto *BB = Front->getParent();
17303 assert(
17304 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17305 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17306 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17307 all_of(E->Scalars,
17308 [=](Value *V) -> bool {
17309 if (Opcode == Instruction::GetElementPtr &&
17310 !isa<GetElementPtrInst>(V))
17311 return true;
17312 auto *I = dyn_cast<Instruction>(V);
17313 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17314 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17315 })) &&
17316 "Expected gathered loads or GEPs or instructions from same basic "
17317 "block.");
17318
17319 auto FindLastInst = [&]() {
17320 Instruction *LastInst = Front;
17321 for (Value *V : E->Scalars) {
17322 auto *I = dyn_cast<Instruction>(V);
17323 if (!I)
17324 continue;
17325 if (E->isCopyableElement(I))
17326 continue;
17327 if (LastInst->getParent() == I->getParent()) {
17328 if (LastInst->comesBefore(I))
17329 LastInst = I;
17330 continue;
17331 }
17332 assert(((Opcode == Instruction::GetElementPtr &&
17333 !isa<GetElementPtrInst>(I)) ||
17334 E->State == TreeEntry::SplitVectorize ||
17335 (isVectorLikeInstWithConstOps(LastInst) &&
17336 isVectorLikeInstWithConstOps(I)) ||
17337 (GatheredLoadsEntriesFirst.has_value() &&
17338 Opcode == Instruction::Load && E->isGather() &&
17339 E->Idx < *GatheredLoadsEntriesFirst)) &&
17340 "Expected vector-like or non-GEP in GEP node insts only.");
17341 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17342 LastInst = I;
17343 continue;
17344 }
17345 if (!DT->isReachableFromEntry(I->getParent()))
17346 continue;
17347 auto *NodeA = DT->getNode(LastInst->getParent());
17348 auto *NodeB = DT->getNode(I->getParent());
17349 assert(NodeA && "Should only process reachable instructions");
17350 assert(NodeB && "Should only process reachable instructions");
17351 assert((NodeA == NodeB) ==
17352 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17353 "Different nodes should have different DFS numbers");
17354 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17355 LastInst = I;
17356 }
17357 BB = LastInst->getParent();
17358 return LastInst;
17359 };
17360
17361 auto FindFirstInst = [&]() {
17362 Instruction *FirstInst = Front;
17363 for (Value *V : E->Scalars) {
17364 auto *I = dyn_cast<Instruction>(V);
17365 if (!I)
17366 continue;
17367 if (E->isCopyableElement(I))
17368 continue;
17369 if (FirstInst->getParent() == I->getParent()) {
17370 if (I->comesBefore(FirstInst))
17371 FirstInst = I;
17372 continue;
17373 }
17374 assert(((Opcode == Instruction::GetElementPtr &&
17375 !isa<GetElementPtrInst>(I)) ||
17376 (isVectorLikeInstWithConstOps(FirstInst) &&
17377 isVectorLikeInstWithConstOps(I))) &&
17378 "Expected vector-like or non-GEP in GEP node insts only.");
17379 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17380 FirstInst = I;
17381 continue;
17382 }
17383 if (!DT->isReachableFromEntry(I->getParent()))
17384 continue;
17385 auto *NodeA = DT->getNode(FirstInst->getParent());
17386 auto *NodeB = DT->getNode(I->getParent());
17387 assert(NodeA && "Should only process reachable instructions");
17388 assert(NodeB && "Should only process reachable instructions");
17389 assert((NodeA == NodeB) ==
17390 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17391 "Different nodes should have different DFS numbers");
17392 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17393 FirstInst = I;
17394 }
17395 return FirstInst;
17396 };
17397
17398 if (E->State == TreeEntry::SplitVectorize) {
17399 Res = FindLastInst();
17400 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17401 for (auto *E : Entries) {
17402 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17403 if (!I)
17404 I = &getLastInstructionInBundle(E);
17405 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17406 Res = I;
17407 }
17408 }
17409 EntryToLastInstruction.try_emplace(E, Res);
17410 return *Res;
17411 }
17412
17413 // Set the insert point for gathered loads to the very first load.
17414 if (GatheredLoadsEntriesFirst.has_value() &&
17415 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17416 Opcode == Instruction::Load) {
17417 Res = FindFirstInst();
17418 EntryToLastInstruction.try_emplace(E, Res);
17419 return *Res;
17420 }
17421
17422 // Set the insert point to the beginning of the basic block if the entry
17423 // should not be scheduled.
17424 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17425 if (E->isGather())
17426 return nullptr;
17427 // We previously found that the instructions do not need to be scheduled.
17428 const auto *It = BlocksSchedules.find(BB);
17429 if (It == BlocksSchedules.end())
17430 return nullptr;
17431 for (Value *V : E->Scalars) {
17432 auto *I = dyn_cast<Instruction>(V);
17433 if (!I || isa<PHINode>(I) ||
17434 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17435 continue;
17436 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17437 if (Bundles.empty())
17438 continue;
17439 const auto *It = find_if(
17440 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17441 if (It != Bundles.end())
17442 return *It;
17443 }
17444 return nullptr;
17445 };
17446 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17447 if (!E->isGather() && !Bundle) {
17448 if ((Opcode == Instruction::GetElementPtr &&
17449 any_of(E->Scalars,
17450 [](Value *V) {
17451 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17452 })) ||
17453 all_of(E->Scalars, [&](Value *V) {
17454 return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
17455 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17456 }))
17457 Res = FindLastInst();
17458 else
17459 Res = FindFirstInst();
17460 EntryToLastInstruction.try_emplace(E, Res);
17461 return *Res;
17462 }
17463
17464 // Find the last instruction. The common case should be that BB has been
17465 // scheduled, and the last instruction is VL.back(). So we start with
17466 // VL.back() and iterate over schedule data until we reach the end of the
17467 // bundle. The end of the bundle is marked by null ScheduleData.
17468 if (Bundle) {
17469 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17470 Res = Bundle->getBundle().back()->getInst();
17471 EntryToLastInstruction.try_emplace(E, Res);
17472 return *Res;
17473 }
17474
17475 // LastInst can still be null at this point if there's either not an entry
17476 // for BB in BlocksSchedules or there's no ScheduleData available for
17477 // VL.back(). This can be the case if buildTreeRec aborts for various
17478 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17479 // size is reached, etc.). ScheduleData is initialized in the scheduling
17480 // "dry-run".
17481 //
17482 // If this happens, we can still find the last instruction by brute force. We
17483 // iterate forwards from Front (inclusive) until we either see all
17484 // instructions in the bundle or reach the end of the block. If Front is the
17485 // last instruction in program order, LastInst will be set to Front, and we
17486 // will visit all the remaining instructions in the block.
17487 //
17488 // One of the reasons we exit early from buildTreeRec is to place an upper
17489 // bound on compile-time. Thus, taking an additional compile-time hit here is
17490 // not ideal. However, this should be exceedingly rare since it requires that
17491 // we both exit early from buildTreeRec and that the bundle be out-of-order
17492 // (causing us to iterate all the way to the end of the block).
17493 if (!Res)
17494 Res = FindLastInst();
17495 assert(Res && "Failed to find last instruction in bundle");
17496 EntryToLastInstruction.try_emplace(E, Res);
17497 return *Res;
17498}
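// Illustrative sketch (standard C++, hypothetical data): the "which instruction
// is last" choice made by FindLastInst above. Within one block, program order
// decides; across blocks, the instruction whose block has the larger
// dominator-tree DFS-in number is treated as later. The unreachable-block
// special cases are omitted.
struct InstPosSketch {
  int Block;     // stand-in for the parent basic block
  int OrderInBB; // stand-in for Instruction::comesBefore() ordering
  int DFSNumIn;  // stand-in for the dominator-tree DFS-in number of the block
};
static InstPosSketch laterOfSketch(InstPosSketch A, InstPosSketch B) {
  if (A.Block == B.Block)                 // same block: program order decides
    return A.OrderInBB < B.OrderInBB ? B : A;
  return A.DFSNumIn < B.DFSNumIn ? B : A; // different blocks: later DFS-in wins
}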
17499
17500void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17501 auto *Front = E->getMainOp();
17502 Instruction *LastInst = &getLastInstructionInBundle(E);
17503 assert(LastInst && "Failed to find last instruction in bundle");
17504 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17505 // If the instruction is PHI, set the insert point after all the PHIs.
17506 bool IsPHI = isa<PHINode>(LastInst);
17507 if (IsPHI) {
17508 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17509 if (LastInstIt != LastInst->getParent()->end() &&
17510 LastInstIt->getParent()->isLandingPad())
17511 LastInstIt = std::next(LastInstIt);
17512 }
17513 if (IsPHI ||
17514 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17515 E->doesNotNeedToSchedule()) ||
17516 (GatheredLoadsEntriesFirst.has_value() &&
17517 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17518 E->getOpcode() == Instruction::Load)) {
17519 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17520 } else {
17521 // Set the insertion point after the last instruction in the bundle. Set the
17522 // debug location to Front.
17523 Builder.SetInsertPoint(
17524 LastInst->getParent(),
17525 LastInst->getNextNode()->getIterator());
17526 }
17527 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17528}
17529
17530Value *BoUpSLP::gather(
17531 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17532 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17533 // List of instructions/lanes from current block and/or the blocks which are
17534 // part of the current loop. These instructions will be inserted at the end to
17535 // make it possible to optimize loops and hoist invariant instructions out of
17536 // the loop's body with better chances for success.
17537 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17538 SmallSet<int, 4> PostponedIndices;
17539 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
17540 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17541 SmallPtrSet<BasicBlock *, 4> Visited;
17542 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17543 InsertBB = InsertBB->getSinglePredecessor();
17544 return InsertBB && InsertBB == InstBB;
17545 };
17546 for (int I = 0, E = VL.size(); I < E; ++I) {
17547 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17548 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17549 isVectorized(Inst) ||
17550 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17551 PostponedIndices.insert(I).second)
17552 PostponedInsts.emplace_back(Inst, I);
17553 }
17554
17555 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17556 Type *Ty) {
17557 Value *Scalar = V;
17558 if (Scalar->getType() != Ty) {
17559 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17560 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17561 Value *V = Scalar;
17562 if (auto *CI = dyn_cast<CastInst>(Scalar);
17563 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
17564 Value *Op = CI->getOperand(0);
17565 if (auto *IOp = dyn_cast<Instruction>(Op);
17566 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17567 V = Op;
17568 }
17569 Scalar = Builder.CreateIntCast(
17570 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17571 }
17572
17573 Instruction *InsElt;
17574 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17575 assert(SLPReVec && "FixedVectorType is not expected.");
17576 Vec =
17577 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17578 auto *II = dyn_cast<Instruction>(Vec);
17579 if (!II)
17580 return Vec;
17581 InsElt = II;
17582 } else {
17583 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17584 InsElt = dyn_cast<InsertElementInst>(Vec);
17585 if (!InsElt)
17586 return Vec;
17587 }
17588 GatherShuffleExtractSeq.insert(InsElt);
17589 CSEBlocks.insert(InsElt->getParent());
17590 // Add to our 'need-to-extract' list.
17591 if (isa<Instruction>(V)) {
17592 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17593 // Find which lane we need to extract.
17594 User *UserOp = nullptr;
17595 if (Scalar != V) {
17596 if (auto *SI = dyn_cast<Instruction>(Scalar))
17597 UserOp = SI;
17598 } else {
17599 if (V->getType()->isVectorTy()) {
17600 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17601 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17602 // Find shufflevector, caused by resize.
17603 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17604 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17605 if (SV->getOperand(0) == V)
17606 return SV;
17607 if (SV->getOperand(1) == V)
17608 return SV;
17609 }
17610 return nullptr;
17611 };
17612 InsElt = nullptr;
17613 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17614 InsElt = User;
17615 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17616 InsElt = User;
17617 assert(InsElt &&
17618 "Failed to find shufflevector, caused by resize.");
17619 }
17620 }
17621 UserOp = InsElt;
17622 }
17623 if (UserOp) {
17624 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17625 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17626 }
17627 }
17628 }
17629 return Vec;
17630 };
17631 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17632 Value *Vec = PoisonValue::get(VecTy);
17633 SmallVector<int> NonConsts;
17634 SmallVector<int> Mask(VL.size());
17635 std::iota(Mask.begin(), Mask.end(), 0);
17636 Value *OriginalRoot = Root;
17637 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17638 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17639 SV->getOperand(0)->getType() == VecTy) {
17640 Root = SV->getOperand(0);
17641 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17642 }
17643 // Insert constant values at first.
17644 for (int I = 0, E = VL.size(); I < E; ++I) {
17645 if (PostponedIndices.contains(I))
17646 continue;
17647 if (!isConstant(VL[I])) {
17648 NonConsts.push_back(I);
17649 continue;
17650 }
17651 if (isa<PoisonValue>(VL[I]))
17652 continue;
17653 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17654 Mask[I] = I + E;
17655 }
17656 if (Root) {
17657 if (isa<PoisonValue>(Vec)) {
17658 Vec = OriginalRoot;
17659 } else {
17660 Vec = CreateShuffle(Root, Vec, Mask);
17661 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17662 OI && OI->use_empty() &&
17663 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17664 return TE->VectorizedValue == OI;
17665 }))
17666 eraseInstruction(OI);
17667 }
17668 }
17669 // Insert non-constant values.
17670 for (int I : NonConsts)
17671 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17672 // Append instructions which are/may be part of the loop at the end to make
17673 // it possible to hoist non-loop-based instructions.
17674 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17675 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17676
17677 return Vec;
17678}
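// Illustrative sketch (standard C++, not part of the pass): the two-phase
// insertion order used above. Constants are inserted first (and blended with
// Root when one exists), remaining unique scalars follow, and scalars that are
// postponed (e.g. because they live inside the current loop) are emitted last
// so loop-invariant code can still be hoisted. Names are hypothetical.
#include <cstddef>
#include <vector>
static std::vector<std::size_t>
insertionOrderSketch(const std::vector<bool> &IsConstant,
                     const std::vector<bool> &IsPostponed) {
  std::vector<std::size_t> Order, NonConst, Postponed;
  for (std::size_t I = 0; I < IsConstant.size(); ++I) {
    if (IsPostponed[I])
      Postponed.push_back(I); // loop-resident scalars go last
    else if (IsConstant[I])
      Order.push_back(I);     // constants are inserted first
    else
      NonConst.push_back(I);  // remaining unique non-constant scalars
  }
  Order.insert(Order.end(), NonConst.begin(), NonConst.end());
  Order.insert(Order.end(), Postponed.begin(), Postponed.end());
  return Order;
}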
17679
17680/// Merges shuffle masks and emits the final shuffle instruction, if required.
17681/// It supports shuffling of 2 input vectors. It implements lazy shuffle
17682/// emission: the actual shuffle instruction is generated only if it is really
17683/// required. Otherwise, the shuffle instruction emission is delayed till the
17684/// end of the process, to reduce the number of emitted instructions and further
17685/// analysis/transformations.
17686/// The class will also look through the previously emitted shuffle instructions
17687/// and properly mark indices in the mask as undef.
17688/// For example, given the code
17689/// \code
17690/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17691/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17692/// \endcode
17693/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17694/// look through %s1 and %s2 and emit
17695/// \code
17696/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17697/// \endcode
17698/// instead.
17699/// If 2 operands are of different size, the smallest one will be resized and
17700/// the mask recalculated properly.
17701/// For example, given the code
17702/// \code
17703/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17704/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17705/// \endcode
17706/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17707/// look through %s1 and %s2 and emit
17708/// \code
17709/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17710/// \endcode
17711/// instead.
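/// As a hedged illustration of the mask composition itself (the values below
/// follow the first example above, where both %s1 and %s2 use the inner mask
/// <1, 0> over 2-element inputs): a request for lane i of %s1 is rewritten to
/// lane s1_mask[i] of %0, so the requested mask <1, 0, 3, 2> over (%s1, %s2)
/// becomes <0, 1, 2, 3> over (%0, %1):
/// \code
///   // Composed[I] = Inner[Outer[I] % 2] + 2 * (Outer[I] / 2)
///   int Outer[4] = {1, 0, 3, 2};
///   int Inner[2] = {1, 0}; // the same mask is used for %s1 and %s2
///   int Composed[4];
///   for (int I = 0; I < 4; ++I)
///     Composed[I] = Inner[Outer[I] % 2] + 2 * (Outer[I] / 2);
///   // Composed == {0, 1, 2, 3}
/// \endcode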
17712class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17713 bool IsFinalized = false;
17714 /// Combined mask for all applied operands and masks. It is built during
17715 /// analysis and actual emission of shuffle vector instructions.
17716 SmallVector<int> CommonMask;
17717 /// List of operands for the shuffle vector instruction. It holds at most 2
17718 /// operands; if a 3rd one is going to be added, the first 2 are combined into
17719 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
17720 /// resulting shuffle and the second operand is set to be the newly added
17721 /// operand. The \p CommonMask is transformed in the proper way after that.
17722 SmallVector<Value *, 2> InVectors;
17723 IRBuilderBase &Builder;
17724 BoUpSLP &R;
17725
17726 class ShuffleIRBuilder {
17727 IRBuilderBase &Builder;
17728 /// Holds all of the instructions that we gathered.
17729 SetVector<Instruction *> &GatherShuffleExtractSeq;
17730 /// A list of blocks that we are going to CSE.
17731 DenseSet<BasicBlock *> &CSEBlocks;
17732 /// Data layout.
17733 const DataLayout &DL;
17734
17735 public:
17736 ShuffleIRBuilder(IRBuilderBase &Builder,
17737 SetVector<Instruction *> &GatherShuffleExtractSeq,
17738 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17739 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17740 CSEBlocks(CSEBlocks), DL(DL) {}
17741 ~ShuffleIRBuilder() = default;
17742 /// Creates shufflevector for the 2 operands with the given mask.
17743 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17744 if (V1->getType() != V2->getType()) {
17745 assert(V1->getType()->isIntOrIntVectorTy() &&
17746 V2->getType()->isIntOrIntVectorTy() &&
17747 "Expected integer vector types only.");
17748 if (V1->getType() != V2->getType()) {
17749 if (cast<VectorType>(V2->getType())
17750 ->getElementType()
17751 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17752 ->getElementType()
17753 ->getIntegerBitWidth())
17754 V2 = Builder.CreateIntCast(
17755 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17756 else
17757 V1 = Builder.CreateIntCast(
17758 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17759 }
17760 }
17761 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17762 if (auto *I = dyn_cast<Instruction>(Vec)) {
17763 GatherShuffleExtractSeq.insert(I);
17764 CSEBlocks.insert(I->getParent());
17765 }
17766 return Vec;
17767 }
17768 /// Creates permutation of the single vector operand with the given mask, if
17769 /// it is not identity mask.
17770 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
17771 if (Mask.empty())
17772 return V1;
17773 unsigned VF = Mask.size();
17774 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
17775 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
17776 return V1;
17777 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
17778 if (auto *I = dyn_cast<Instruction>(Vec)) {
17779 GatherShuffleExtractSeq.insert(I);
17780 CSEBlocks.insert(I->getParent());
17781 }
17782 return Vec;
17783 }
17784 Value *createIdentity(Value *V) { return V; }
17785 Value *createPoison(Type *Ty, unsigned VF) {
17786 return PoisonValue::get(getWidenedType(Ty, VF));
17787 }
17788 /// Resizes 2 input vectors to match in size, if they are not equal
17789 /// yet. The smaller vector is resized to the size of the larger vector.
17790 void resizeToMatch(Value *&V1, Value *&V2) {
17791 if (V1->getType() == V2->getType())
17792 return;
17793 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
17794 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
17795 int VF = std::max(V1VF, V2VF);
17796 int MinVF = std::min(V1VF, V2VF);
17797 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
17798 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
17799 0);
17800 Value *&Op = MinVF == V1VF ? V1 : V2;
17801 Op = Builder.CreateShuffleVector(Op, IdentityMask);
17802 if (auto *I = dyn_cast<Instruction>(Op)) {
17803 GatherShuffleExtractSeq.insert(I);
17804 CSEBlocks.insert(I->getParent());
17805 }
17806 if (MinVF == V1VF)
17807 V1 = Op;
17808 else
17809 V2 = Op;
17810 }
17811 };
17812
17813 /// Smart shuffle instruction emission, walks through shuffles trees and
17814 /// tries to find the best matching vector for the actual shuffle
17815 /// instruction.
17816 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
17817 assert(V1 && "Expected at least one vector value.");
17818 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
17819 R.CSEBlocks, *R.DL);
17820 return BaseShuffleAnalysis::createShuffle<Value *>(
17821 V1, V2, Mask, ShuffleBuilder, ScalarTy);
17822 }
17823
17824 /// Cast value \p V to the vector type with the same number of elements, but
17825 /// the base type \p ScalarTy.
17826 Value *castToScalarTyElem(Value *V,
17827 std::optional<bool> IsSigned = std::nullopt) {
17828 auto *VecTy = cast<VectorType>(V->getType());
17829 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
17830 if (VecTy->getElementType() == ScalarTy->getScalarType())
17831 return V;
17832 return Builder.CreateIntCast(
17833 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
17834 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
17835 }
17836
17837 public:
17838 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
17839 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
17840
17841 /// Adjusts extractelements after reusing them.
17842 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
17843 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
17844 unsigned NumParts, bool &UseVecBaseAsInput) {
17845 UseVecBaseAsInput = false;
17846 SmallPtrSet<Value *, 4> UniqueBases;
17847 Value *VecBase = nullptr;
17848 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
17849 if (!E->ReorderIndices.empty()) {
17850 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
17851 E->ReorderIndices.end());
17852 reorderScalars(VL, ReorderMask);
17853 }
17854 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17855 int Idx = Mask[I];
17856 if (Idx == PoisonMaskElem)
17857 continue;
17858 auto *EI = cast<ExtractElementInst>(VL[I]);
17859 VecBase = EI->getVectorOperand();
17860 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
17861 VecBase = TEs.front()->VectorizedValue;
17862 assert(VecBase && "Expected vectorized value.");
17863 UniqueBases.insert(VecBase);
17864 // If the single use of the extractelement is vectorized - the
17865 // extractelement itself can be deleted.
17866 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
17867 (NumParts != 1 && count(VL, EI) > 1) ||
17868 any_of(EI->users(), [&](User *U) {
17869 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
17870 return UTEs.empty() || UTEs.size() > 1 ||
17871 (isa<GetElementPtrInst>(U) &&
17872 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
17873 (!UTEs.empty() &&
17874 count_if(R.VectorizableTree,
17875 [&](const std::unique_ptr<TreeEntry> &TE) {
17876 return TE->UserTreeIndex.UserTE ==
17877 UTEs.front() &&
17878 is_contained(VL, EI);
17879 }) != 1);
17880 }))
17881 continue;
17882 R.eraseInstruction(EI);
17883 }
17884 if (NumParts == 1 || UniqueBases.size() == 1) {
17885 assert(VecBase && "Expected vectorized value.");
17886 return castToScalarTyElem(VecBase);
17887 }
17888 UseVecBaseAsInput = true;
17889 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
17890 for (auto [I, Idx] : enumerate(Mask))
17891 if (Idx != PoisonMaskElem)
17892 Idx = I;
17893 };
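// Editorial example: TransformToIdentity turns <-1, 5, 2, -1> into
// <-1, 1, 2, -1>, i.e. every used lane is redirected to its own position.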
17894 // Perform a multi-register vector shuffle, joining the parts into a single
17895 // virtual long vector.
17896 // Each part needs to be shuffled independently and then all the parts are
17897 // inserted into a long virtual vector register, forming the original vector.
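// Editorial example (illustrative): with NumParts == 2 and a 16-element mask,
// lanes 0..7 are shuffled out of the (at most two) extract bases of part 0 and
// lanes 8..15 out of the bases of part 1; the partial results are then joined
// by one more two-source shuffle into the final 16-element vector.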
17898 Value *Vec = nullptr;
17899 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17900 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17901 for (unsigned Part : seq<unsigned>(NumParts)) {
17902 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
17903 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
17904 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
17905 constexpr int MaxBases = 2;
17906 SmallVector<Value *, MaxBases> Bases(MaxBases);
17907 auto VLMask = zip(SubVL, SubMask);
17908 const unsigned VF = std::accumulate(
17909 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
17910 if (std::get<1>(D) == PoisonMaskElem)
17911 return S;
17912 Value *VecOp =
17913 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
17914 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
17915 !TEs.empty())
17916 VecOp = TEs.front()->VectorizedValue;
17917 assert(VecOp && "Expected vectorized value.");
17918 const unsigned Size =
17919 cast<FixedVectorType>(VecOp->getType())->getNumElements();
17920 return std::max(S, Size);
17921 });
17922 for (const auto [V, I] : VLMask) {
17923 if (I == PoisonMaskElem)
17924 continue;
17925 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
17926 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
17927 VecOp = TEs.front()->VectorizedValue;
17928 assert(VecOp && "Expected vectorized value.");
17929 VecOp = castToScalarTyElem(VecOp);
17930 Bases[I / VF] = VecOp;
17931 }
17932 if (!Bases.front())
17933 continue;
17934 Value *SubVec;
17935 if (Bases.back()) {
17936 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
17937 TransformToIdentity(SubMask);
17938 } else {
17939 SubVec = Bases.front();
17940 }
17941 if (!Vec) {
17942 Vec = SubVec;
17943 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
17944 [&](unsigned P) {
17945 ArrayRef<int> SubMask =
17946 Mask.slice(P * SliceSize,
17947 getNumElems(Mask.size(),
17948 SliceSize, P));
17949 return all_of(SubMask, [](int Idx) {
17950 return Idx == PoisonMaskElem;
17951 });
17952 })) &&
17953 "Expected first part or all previous parts masked.");
17954 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17955 } else {
17956 unsigned NewVF =
17957 cast<FixedVectorType>(Vec->getType())->getNumElements();
17958 if (Vec->getType() != SubVec->getType()) {
17959 unsigned SubVecVF =
17960 cast<FixedVectorType>(SubVec->getType())->getNumElements();
17961 NewVF = std::max(NewVF, SubVecVF);
17962 }
17963 // Adjust SubMask.
17964 for (int &Idx : SubMask)
17965 if (Idx != PoisonMaskElem)
17966 Idx += NewVF;
17967 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
17968 Vec = createShuffle(Vec, SubVec, VecMask);
17969 TransformToIdentity(VecMask);
17970 }
17971 }
17972 copy(VecMask, Mask.begin());
17973 return Vec;
17974 }
17975 /// Checks if the specified entry \p E needs to be delayed because of its
17976 /// dependency nodes.
17977 std::optional<Value *>
17978 needToDelay(const TreeEntry *E,
17980 // No need to delay emission if all deps are ready.
17981 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
17982 return all_of(
17983 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
17984 }))
17985 return std::nullopt;
17986 // Postpone gather emission, will be emitted after the end of the
17987 // process to keep correct order.
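// Editorial note: the aligned load from a poison pointer below is only a
// type-correct placeholder for the postponed gather; it is replaced by the
// real gather sequence once all dependent entries have been vectorized.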
17988 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
17989 return Builder.CreateAlignedLoad(
17990 ResVecTy,
17991 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
17992 MaybeAlign());
17993 }
17994 /// Reset the builder to handle perfect diamond match.
17995 void resetForSameNode() {
17996 IsFinalized = false;
17997 CommonMask.clear();
17998 InVectors.clear();
17999 }
18000 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
18001 /// shuffling.
18002 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18003 Value *V1 = E1.VectorizedValue;
18004 if (V1->getType()->isIntOrIntVectorTy())
18005 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
18006 if (isa<PoisonValue>(V))
18007 return false;
18008 return !isKnownNonNegative(
18009 V, SimplifyQuery(*R.DL));
18010 }));
18011 Value *V2 = E2.VectorizedValue;
18012 if (V2->getType()->isIntOrIntVectorTy())
18013 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
18014 if (isa<PoisonValue>(V))
18015 return false;
18016 return !isKnownNonNegative(
18017 V, SimplifyQuery(*R.DL));
18018 }));
18019 add(V1, V2, Mask);
18020 }
18021 /// Adds a single input vector (in the form of a tree entry) and the mask for
18022 /// its shuffling.
18023 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18024 Value *V1 = E1.VectorizedValue;
18025 if (V1->getType()->isIntOrIntVectorTy())
18026 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
18027 if (isa<PoisonValue>(V))
18028 return false;
18029 return !isKnownNonNegative(
18030 V, SimplifyQuery(*R.DL));
18031 }));
18032 add(V1, Mask);
18033 }
18034 /// Adds 2 input vectors and the mask for their shuffling.
18035 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18036 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18037 assert(isa<FixedVectorType>(V1->getType()) &&
18038 isa<FixedVectorType>(V2->getType()) &&
18039 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18040 V1 = castToScalarTyElem(V1);
18041 V2 = castToScalarTyElem(V2);
18042 if (InVectors.empty()) {
18043 InVectors.push_back(V1);
18044 InVectors.push_back(V2);
18045 CommonMask.assign(Mask.begin(), Mask.end());
18046 return;
18047 }
18048 Value *Vec = InVectors.front();
18049 if (InVectors.size() == 2) {
18050 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18051 transformMaskAfterShuffle(CommonMask, CommonMask);
18052 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18053 Mask.size()) {
18054 Vec = createShuffle(Vec, nullptr, CommonMask);
18055 transformMaskAfterShuffle(CommonMask, CommonMask);
18056 }
18057 V1 = createShuffle(V1, V2, Mask);
18058 unsigned VF = std::max(getVF(V1), getVF(Vec));
18059 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18060 if (Mask[Idx] != PoisonMaskElem)
18061 CommonMask[Idx] = Idx + VF;
18062 InVectors.front() = Vec;
18063 if (InVectors.size() == 2)
18064 InVectors.back() = V1;
18065 else
18066 InVectors.push_back(V1);
18067 }
18068 /// Adds one more input vector and the mask for its shuffling.
18069 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18070 assert(isa<FixedVectorType>(V1->getType()) &&
18071 "castToScalarTyElem expects V1 to be FixedVectorType");
18072 V1 = castToScalarTyElem(V1);
18073 if (InVectors.empty()) {
18074 InVectors.push_back(V1);
18075 CommonMask.assign(Mask.begin(), Mask.end());
18076 return;
18077 }
18078 const auto *It = find(InVectors, V1);
18079 if (It == InVectors.end()) {
18080 if (InVectors.size() == 2 ||
18081 InVectors.front()->getType() != V1->getType()) {
18082 Value *V = InVectors.front();
18083 if (InVectors.size() == 2) {
18084 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18085 transformMaskAfterShuffle(CommonMask, CommonMask);
18086 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18087 CommonMask.size()) {
18088 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18089 transformMaskAfterShuffle(CommonMask, CommonMask);
18090 }
18091 unsigned VF = std::max(CommonMask.size(), Mask.size());
18092 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18093 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18094 CommonMask[Idx] = V->getType() != V1->getType()
18095 ? Idx + VF
18096 : Mask[Idx] + getVF(V1);
18097 if (V->getType() != V1->getType())
18098 V1 = createShuffle(V1, nullptr, Mask);
18099 InVectors.front() = V;
18100 if (InVectors.size() == 2)
18101 InVectors.back() = V1;
18102 else
18103 InVectors.push_back(V1);
18104 return;
18105 }
18106 // Check if the second vector is really required, i.e. if the used elements
18107 // are not all already covered by the first one.
18108 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18109 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18110 InVectors.push_back(V1);
18111 break;
18112 }
18113 }
18114 unsigned VF = 0;
18115 for (Value *V : InVectors)
18116 VF = std::max(VF, getVF(V));
18117 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18118 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18119 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18120 }
18121 /// Adds one more input vector and the mask for its shuffling.
18122 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18123 SmallVector<int> NewMask;
18124 inversePermutation(Order, NewMask);
18125 add(V1, NewMask);
18126 }
18127 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18128 Value *Root = nullptr) {
18129 return R.gather(VL, Root, ScalarTy,
18130 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18131 return createShuffle(V1, V2, Mask);
18132 });
18133 }
18134 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18135 /// Finalize emission of the shuffles.
18136 /// \param Action the action (if any) to be performed before the final
18137 /// application of the \p ExtMask mask.
18138 Value *finalize(
18139 ArrayRef<int> ExtMask,
18140 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18141 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18142 function_ref<void(Value *&, SmallVectorImpl<int> &,
18143 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18144 Action = {}) {
18145 IsFinalized = true;
18146 if (Action) {
18147 Value *Vec = InVectors.front();
18148 if (InVectors.size() == 2) {
18149 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18150 InVectors.pop_back();
18151 } else {
18152 Vec = createShuffle(Vec, nullptr, CommonMask);
18153 }
18154 transformMaskAfterShuffle(CommonMask, CommonMask);
18155 assert(VF > 0 &&
18156 "Expected vector length for the final value before action.");
18157 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18158 if (VecVF < VF) {
18159 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18160 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18161 Vec = createShuffle(Vec, nullptr, ResizeMask);
18162 }
18163 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18164 return createShuffle(V1, V2, Mask);
18165 });
18166 InVectors.front() = Vec;
18167 }
18168 if (!SubVectors.empty()) {
18169 Value *Vec = InVectors.front();
18170 if (InVectors.size() == 2) {
18171 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18172 InVectors.pop_back();
18173 } else {
18174 Vec = createShuffle(Vec, nullptr, CommonMask);
18175 }
18176 transformMaskAfterShuffle(CommonMask, CommonMask);
18177 auto CreateSubVectors = [&](Value *Vec,
18178 SmallVectorImpl<int> &CommonMask) {
18179 for (auto [E, Idx] : SubVectors) {
18180 Value *V = E->VectorizedValue;
18181 if (V->getType()->isIntOrIntVectorTy())
18182 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
18183 if (isa<PoisonValue>(V))
18184 return false;
18185 return !isKnownNonNegative(
18186 V, SimplifyQuery(*R.DL));
18187 }));
18188 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18189 // Use the scalar version of ScalarTy to correctly handle shuffles
18190 // for revectorization. The revectorization mode operates on the
18191 // vectors, but here we need to operate on the scalars, because the
18192 // masks were already transformed for the vector elements and we don't
18193 // need to apply this transformation again.
18194 Type *OrigScalarTy = ScalarTy;
18195 ScalarTy = ScalarTy->getScalarType();
18196 Vec = createInsertVector(
18197 Builder, Vec, V, InsertionIndex,
18198 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18199 _3));
18200 ScalarTy = OrigScalarTy;
18201 if (!CommonMask.empty()) {
18202 std::iota(std::next(CommonMask.begin(), Idx),
18203 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18204 Idx);
18205 }
18206 }
18207 return Vec;
18208 };
18209 if (SubVectorsMask.empty()) {
18210 Vec = CreateSubVectors(Vec, CommonMask);
18211 } else {
18212 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18213 copy(SubVectorsMask, SVMask.begin());
18214 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18215 if (I2 != PoisonMaskElem) {
18216 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18217 I1 = I2 + CommonMask.size();
18218 }
18219 }
18220 Value *InsertVec =
18221 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18222 Vec = createShuffle(InsertVec, Vec, SVMask);
18223 transformMaskAfterShuffle(CommonMask, SVMask);
18224 }
18225 InVectors.front() = Vec;
18226 }
18227
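// Editorial example: the block below composes ExtMask on top of the
// accumulated CommonMask; e.g. with CommonMask == <2, 0, 1, 3> and
// ExtMask == <1, 1, -1, 0> the combined mask becomes <0, 0, -1, 2>.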
18228 if (!ExtMask.empty()) {
18229 if (CommonMask.empty()) {
18230 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18231 } else {
18232 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18233 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18234 if (ExtMask[I] == PoisonMaskElem)
18235 continue;
18236 NewMask[I] = CommonMask[ExtMask[I]];
18237 }
18238 CommonMask.swap(NewMask);
18239 }
18240 }
18241 if (CommonMask.empty()) {
18242 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18243 return InVectors.front();
18244 }
18245 if (InVectors.size() == 2)
18246 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18247 return createShuffle(InVectors.front(), nullptr, CommonMask);
18248 }
18249
18250 ~ShuffleInstructionBuilder() {
18251 assert((IsFinalized || CommonMask.empty()) &&
18252 "Shuffle construction must be finalized.");
18253 }
18254};
18255
18256Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18257 return vectorizeTree(getOperandEntry(E, NodeIdx));
18258}
18259
18260template <typename BVTy, typename ResTy, typename... Args>
18261ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18262 Args &...Params) {
18263 assert(E->isGather() && "Expected gather node.");
18264 unsigned VF = E->getVectorFactor();
18265
18266 bool NeedFreeze = false;
18267 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18268 // Clear values, to be replaced by insertvector instructions.
18269 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18270 for_each(MutableArrayRef(GatheredScalars)
18271 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18272 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18273 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18274 E->CombinedEntriesWithIndices.size());
18275 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18276 [&](const auto &P) {
18277 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18278 });
18279 // Build a mask out of the reorder indices and reorder scalars per this
18280 // mask.
18281 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18282 E->ReorderIndices.end());
18283 if (!ReorderMask.empty())
18284 reorderScalars(GatheredScalars, ReorderMask);
18285 SmallVector<int> SubVectorsMask;
18286 inversePermutation(E->ReorderIndices, SubVectorsMask);
18287 // Transform non-clustered elements in the mask to poison (-1).
18288 // "Clustered" operations will be reordered using this mask later.
18289 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18290 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18291 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18292 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18293 } else {
18294 SubVectorsMask.clear();
18295 }
18296 SmallVector<Value *> StoredGS(GatheredScalars);
18297 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18298 unsigned I, unsigned SliceSize,
18299 bool IsNotPoisonous) {
18300 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18301 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18302 }))
18303 return false;
18304 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18305 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18306 if (UserTE->getNumOperands() != 2)
18307 return false;
18308 if (!IsNotPoisonous) {
18309 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18310 [=](const std::unique_ptr<TreeEntry> &TE) {
18311 return TE->UserTreeIndex.UserTE == UserTE &&
18312 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18313 });
18314 if (It == VectorizableTree.end())
18315 return false;
18316 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18317 if (!(*It)->ReorderIndices.empty()) {
18318 inversePermutation((*It)->ReorderIndices, ReorderMask);
18319 reorderScalars(GS, ReorderMask);
18320 }
18321 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18322 Value *V0 = std::get<0>(P);
18323 Value *V1 = std::get<1>(P);
18324 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18325 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18326 is_contained(E->Scalars, V1));
18327 }))
18328 return false;
18329 }
18330 int Idx;
18331 if ((Mask.size() < InputVF &&
18332 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18333 Idx == 0) ||
18334 (Mask.size() == InputVF &&
18335 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18336 std::iota(
18337 std::next(Mask.begin(), I * SliceSize),
18338 std::next(Mask.begin(),
18339 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18340 0);
18341 } else {
18342 unsigned IVal =
18343 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18344 std::fill(
18345 std::next(Mask.begin(), I * SliceSize),
18346 std::next(Mask.begin(),
18347 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18348 IVal);
18349 }
18350 return true;
18351 };
18352 BVTy ShuffleBuilder(ScalarTy, Params...);
18353 ResTy Res = ResTy();
18354 SmallVector<int> Mask;
18355 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18356 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18357 Value *ExtractVecBase = nullptr;
18358 bool UseVecBaseAsInput = false;
18359 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18360 SmallVector<SmallVector<const TreeEntry *>> Entries;
18361 Type *OrigScalarTy = GatheredScalars.front()->getType();
18362 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18363 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18364 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18365 // Check for gathered extracts.
18366 bool Resized = false;
18367 ExtractShuffles =
18368 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18369 if (!ExtractShuffles.empty()) {
18370 SmallVector<const TreeEntry *> ExtractEntries;
18371 for (auto [Idx, I] : enumerate(ExtractMask)) {
18372 if (I == PoisonMaskElem)
18373 continue;
18374 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18375 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18376 !TEs.empty())
18377 ExtractEntries.append(TEs.begin(), TEs.end());
18378 }
18379 if (std::optional<ResTy> Delayed =
18380 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18381 // Delay emission of gathers which are not ready yet.
18382 PostponedGathers.insert(E);
18383 // Postpone gather emission, will be emitted after the end of the
18384 // process to keep correct order.
18385 return *Delayed;
18386 }
18387 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18388 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18389 ExtractVecBase = VecBase;
18390 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18391 if (VF == VecBaseTy->getNumElements() &&
18392 GatheredScalars.size() != VF) {
18393 Resized = true;
18394 GatheredScalars.append(VF - GatheredScalars.size(),
18395 PoisonValue::get(OrigScalarTy));
18396 NumParts =
18397 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18398 }
18399 }
18400 }
18401 // Gather extracts after we check for full matched gathers only.
18402 if (!ExtractShuffles.empty() || !E->hasState() ||
18403 E->getOpcode() != Instruction::Load ||
18404 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18405 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18406 any_of(E->Scalars,
18407 [this](Value *V) {
18408 return isa<LoadInst>(V) && isVectorized(V);
18409 })) ||
18410 (E->hasState() && E->isAltShuffle()) ||
18411 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18412 isSplat(E->Scalars) ||
18413 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18414 GatherShuffles =
18415 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18416 }
18417 if (!GatherShuffles.empty()) {
18418 if (std::optional<ResTy> Delayed =
18419 ShuffleBuilder.needToDelay(E, Entries)) {
18420 // Delay emission of gathers which are not ready yet.
18421 PostponedGathers.insert(E);
18422 // Postpone gather emission, will be emitted after the end of the
18423 // process to keep correct order.
18424 return *Delayed;
18425 }
18426 if (GatherShuffles.size() == 1 &&
18427 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18428 Entries.front().front()->isSame(E->Scalars)) {
18429 // Perfect match in the graph, will reuse the previously vectorized
18430 // node. Cost is 0.
18431 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18432 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18433 // Restore the mask for previous partially matched values.
18434 Mask.resize(E->Scalars.size());
18435 const TreeEntry *FrontTE = Entries.front().front();
18436 if (FrontTE->ReorderIndices.empty() &&
18437 ((FrontTE->ReuseShuffleIndices.empty() &&
18438 E->Scalars.size() == FrontTE->Scalars.size()) ||
18439 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18440 std::iota(Mask.begin(), Mask.end(), 0);
18441 } else {
18442 for (auto [I, V] : enumerate(E->Scalars)) {
18443 if (isa<PoisonValue>(V)) {
18444 Mask[I] = PoisonMaskElem;
18445 continue;
18446 }
18447 Mask[I] = FrontTE->findLaneForValue(V);
18448 }
18449 }
18450 // Reset the builder(s) to correctly handle perfect diamond matched
18451 // nodes.
18452 ShuffleBuilder.resetForSameNode();
18453 ShuffleBuilder.add(*FrontTE, Mask);
18454 // Full matched entry found, no need to insert subvectors.
18455 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18456 return Res;
18457 }
18458 if (!Resized) {
18459 if (GatheredScalars.size() != VF &&
18460 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18461 return any_of(TEs, [&](const TreeEntry *TE) {
18462 return TE->getVectorFactor() == VF;
18463 });
18464 }))
18465 GatheredScalars.append(VF - GatheredScalars.size(),
18466 PoisonValue::get(OrigScalarTy));
18467 }
18468 // Remove shuffled elements from list of gathers.
18469 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18470 if (Mask[I] != PoisonMaskElem)
18471 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18472 }
18473 }
18474 }
18475 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18476 SmallVectorImpl<int> &ReuseMask,
18477 bool IsRootPoison) {
18478 // For splats we can emit broadcasts instead of gathers, so try to find
18479 // such sequences.
18480 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18481 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18482 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18483 SmallVector<int> UndefPos;
18484 DenseMap<Value *, unsigned> UniquePositions;
18485 // Gather unique non-const values and all constant values.
18486 // For repeated values, just shuffle them.
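// Editorial example: for Scalars = {%a, %b, %a, %b} (not a splat) only the
// first occurrence of each value is kept, so Scalars becomes
// {%a, %b, poison, poison} and ReuseMask becomes <0, 1, 0, 1>.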
18487 int NumNonConsts = 0;
18488 int SinglePos = 0;
18489 for (auto [I, V] : enumerate(Scalars)) {
18490 if (isa<UndefValue>(V)) {
18491 if (!isa<PoisonValue>(V)) {
18492 ReuseMask[I] = I;
18493 UndefPos.push_back(I);
18494 }
18495 continue;
18496 }
18497 if (isConstant(V)) {
18498 ReuseMask[I] = I;
18499 continue;
18500 }
18501 ++NumNonConsts;
18502 SinglePos = I;
18503 Value *OrigV = V;
18504 Scalars[I] = PoisonValue::get(OrigScalarTy);
18505 if (IsSplat) {
18506 Scalars.front() = OrigV;
18507 ReuseMask[I] = 0;
18508 } else {
18509 const auto Res = UniquePositions.try_emplace(OrigV, I);
18510 Scalars[Res.first->second] = OrigV;
18511 ReuseMask[I] = Res.first->second;
18512 }
18513 }
18514 if (NumNonConsts == 1) {
18515 // Restore single insert element.
18516 if (IsSplat) {
18517 ReuseMask.assign(VF, PoisonMaskElem);
18518 std::swap(Scalars.front(), Scalars[SinglePos]);
18519 if (!UndefPos.empty() && UndefPos.front() == 0)
18520 Scalars.front() = UndefValue::get(OrigScalarTy);
18521 }
18522 ReuseMask[SinglePos] = SinglePos;
18523 } else if (!UndefPos.empty() && IsSplat) {
18524 // For undef values, try to replace them with a simple broadcast.
18525 // We can do it if the broadcasted value is guaranteed to be
18526 // non-poisonous, or by freezing the incoming scalar value first.
18527 auto *It = find_if(Scalars, [this, E](Value *V) {
18528 return !isa<UndefValue>(V) &&
18529 (isGuaranteedNotToBePoison(V, AC) ||
18530 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18531 // Check if the value already used in the same operation in
18532 // one of the nodes already.
18533 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18534 is_contained(E->UserTreeIndex.UserTE->Scalars,
18535 U.getUser());
18536 })));
18537 });
18538 if (It != Scalars.end()) {
18539 // Replace undefs by the non-poisoned scalars and emit broadcast.
18540 int Pos = std::distance(Scalars.begin(), It);
18541 for (int I : UndefPos) {
18542 // Set the undef position to the non-poisoned scalar.
18543 ReuseMask[I] = Pos;
18544 // Replace the undef by poison; in the mask it has already been
18545 // replaced by the non-poisoned scalar.
18546 if (I != Pos)
18547 Scalars[I] = PoisonValue::get(OrigScalarTy);
18548 }
18549 } else {
18550 // Replace undefs by poisons, emit the broadcast and then emit a
18551 // freeze.
18552 for (int I : UndefPos) {
18553 ReuseMask[I] = PoisonMaskElem;
18554 if (isa<UndefValue>(Scalars[I]))
18555 Scalars[I] = PoisonValue::get(OrigScalarTy);
18556 }
18557 NeedFreeze = true;
18558 }
18559 }
18560 };
18561 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18562 bool IsNonPoisoned = true;
18563 bool IsUsedInExpr = true;
18564 Value *Vec1 = nullptr;
18565 if (!ExtractShuffles.empty()) {
18566 // A gather of extractelements can be represented as just a shuffle of
18567 // the one or two vectors the scalars are extracted from.
18568 // Find the input vectors.
18569 Value *Vec2 = nullptr;
18570 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18571 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18572 ExtractMask[I] = PoisonMaskElem;
18573 }
18574 if (UseVecBaseAsInput) {
18575 Vec1 = ExtractVecBase;
18576 } else {
18577 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18578 if (ExtractMask[I] == PoisonMaskElem)
18579 continue;
18580 if (isa<UndefValue>(StoredGS[I]))
18581 continue;
18582 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18583 Value *VecOp = EI->getVectorOperand();
18584 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18585 !TEs.empty() && TEs.front()->VectorizedValue)
18586 VecOp = TEs.front()->VectorizedValue;
18587 if (!Vec1) {
18588 Vec1 = VecOp;
18589 } else if (Vec1 != VecOp) {
18590 assert((!Vec2 || Vec2 == VecOp) &&
18591 "Expected only 1 or 2 vectors shuffle.");
18592 Vec2 = VecOp;
18593 }
18594 }
18595 }
18596 if (Vec2) {
18597 IsUsedInExpr = false;
18598 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18599 isGuaranteedNotToBePoison(Vec2, AC);
18600 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18601 } else if (Vec1) {
18602 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18603 IsUsedInExpr &= FindReusedSplat(
18604 ExtractMask,
18605 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18606 ExtractMask.size(), IsNotPoisonedVec);
18607 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18608 IsNonPoisoned &= IsNotPoisonedVec;
18609 } else {
18610 IsUsedInExpr = false;
18611 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18612 /*ForExtracts=*/true);
18613 }
18614 }
18615 if (!GatherShuffles.empty()) {
18616 unsigned SliceSize =
18617 getPartNumElems(E->Scalars.size(),
18618 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18619 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18620 for (const auto [I, TEs] : enumerate(Entries)) {
18621 if (TEs.empty()) {
18622 assert(!GatherShuffles[I] &&
18623 "No shuffles with empty entries list expected.");
18624 continue;
18625 }
18626 assert((TEs.size() == 1 || TEs.size() == 2) &&
18627 "Expected shuffle of 1 or 2 entries.");
18628 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18629 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18630 VecMask.assign(VecMask.size(), PoisonMaskElem);
18631 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18632 if (TEs.size() == 1) {
18633 bool IsNotPoisonedVec =
18634 TEs.front()->VectorizedValue
18635 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18636 : true;
18637 IsUsedInExpr &=
18638 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18639 SliceSize, IsNotPoisonedVec);
18640 ShuffleBuilder.add(*TEs.front(), VecMask);
18641 IsNonPoisoned &= IsNotPoisonedVec;
18642 } else {
18643 IsUsedInExpr = false;
18644 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18645 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18646 IsNonPoisoned &=
18647 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18648 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18649 }
18650 }
18651 }
18652 // Try to figure out the best way to combine the values: build a shuffle and
18653 // insert elements, or just build several shuffles.
18654 // Insert non-constant scalars.
18655 SmallVector<Value *> NonConstants(GatheredScalars);
18656 int EMSz = ExtractMask.size();
18657 int MSz = Mask.size();
18658 // Try to build a constant vector and shuffle with it only if currently we
18659 // have a single permutation and more than one scalar constant.
18660 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18661 bool IsIdentityShuffle =
18662 ((UseVecBaseAsInput ||
18663 all_of(ExtractShuffles,
18664 [](const std::optional<TTI::ShuffleKind> &SK) {
18665 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18666 TTI::SK_PermuteSingleSrc;
18667 })) &&
18668 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18669 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18670 (!GatherShuffles.empty() &&
18671 all_of(GatherShuffles,
18672 [](const std::optional<TTI::ShuffleKind> &SK) {
18673 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18674 TTI::SK_PermuteSingleSrc;
18675 }) &&
18676 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18677 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18678 bool EnoughConstsForShuffle =
18679 IsSingleShuffle &&
18680 (none_of(GatheredScalars,
18681 [](Value *V) {
18682 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18683 }) ||
18684 any_of(GatheredScalars,
18685 [](Value *V) {
18686 return isa<Constant>(V) && !isa<UndefValue>(V);
18687 })) &&
18688 (!IsIdentityShuffle ||
18689 (GatheredScalars.size() == 2 &&
18690 any_of(GatheredScalars,
18691 [](Value *V) { return !isa<UndefValue>(V); })) ||
18692 count_if(GatheredScalars, [](Value *V) {
18693 return isa<Constant>(V) && !isa<PoisonValue>(V);
18694 }) > 1);
18695 // NonConstants contains just the non-constant values; GatheredScalars
18696 // contains only the constants used to build the final vector, which is then shuffled.
18697 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18698 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18699 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18700 else
18701 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18702 }
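// Editorial example: for GatheredScalars = {%a, 3, %b, 5} with
// EnoughConstsForShuffle, the loop above leaves {poison, 3, poison, 5} in
// GatheredScalars and {%a, poison, %b, poison} in NonConstants; the constants
// and the variables are built into separate vectors that are combined by the
// final shuffle below.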
18703 // Generate constants for final shuffle and build a mask for them.
18704 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18705 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18706 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18707 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18708 ShuffleBuilder.add(BV, BVMask);
18709 }
18710 if (all_of(NonConstants, [=](Value *V) {
18711 return isa<PoisonValue>(V) ||
18712 (IsSingleShuffle && ((IsIdentityShuffle &&
18713 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18714 }))
18715 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18716 SubVectorsMask);
18717 else
18718 Res = ShuffleBuilder.finalize(
18719 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18720 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18721 bool IsSplat = isSplat(NonConstants);
18722 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18723 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18724 auto CheckIfSplatIsProfitable = [&]() {
18725 // Estimate the cost of splatting + shuffle and compare with
18726 // insert + shuffle.
18727 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18728 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18729 if (isa<ExtractElementInst>(V) || isVectorized(V))
18730 return false;
18731 InstructionCost SplatCost = TTI->getVectorInstrCost(
18732 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18733 PoisonValue::get(VecTy), V);
18734 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18735 for (auto [Idx, I] : enumerate(BVMask))
18736 if (I != PoisonMaskElem)
18737 NewMask[Idx] = Mask.size();
18738 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18739 NewMask, CostKind);
18740 InstructionCost BVCost = TTI->getVectorInstrCost(
18741 Instruction::InsertElement, VecTy, CostKind,
18742 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18743 Vec, V);
18744 // Shuffle required?
18745 if (count(BVMask, PoisonMaskElem) <
18746 static_cast<int>(BVMask.size() - 1)) {
18747 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18748 for (auto [Idx, I] : enumerate(BVMask))
18749 if (I != PoisonMaskElem)
18750 NewMask[Idx] = I;
18751 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
18752 VecTy, NewMask, CostKind);
18753 }
18754 return SplatCost <= BVCost;
18755 };
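// Editorial note: the profitability check above feeds the branch below, which
// selects between two strategies for the remaining non-constant lanes: the
// default path inserts them into Vec one by one (a gather with Vec as the
// root), while the splat path materializes the value once, broadcasts it with
// a splat mask and blends the broadcast into Vec with a single two-source
// shuffle.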
18756 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18757 for (auto [Idx, I] : enumerate(BVMask))
18758 if (I != PoisonMaskElem)
18759 Mask[Idx] = I;
18760 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18761 } else {
18762 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18763 SmallVector<Value *> Values(NonConstants.size(),
18764 PoisonValue::get(ScalarTy));
18765 Values[0] = V;
18766 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18767 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18768 transform(BVMask, SplatMask.begin(), [](int I) {
18769 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18770 });
18771 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18772 BV = CreateShuffle(BV, nullptr, SplatMask);
18773 for (auto [Idx, I] : enumerate(BVMask))
18774 if (I != PoisonMaskElem)
18775 Mask[Idx] = BVMask.size() + Idx;
18776 Vec = CreateShuffle(Vec, BV, Mask);
18777 for (auto [Idx, I] : enumerate(Mask))
18778 if (I != PoisonMaskElem)
18779 Mask[Idx] = Idx;
18780 }
18781 });
18782 } else if (!allConstant(GatheredScalars)) {
18783 // Gather unique scalars and all constants.
18784 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18785 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18786 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18787 ShuffleBuilder.add(BV, ReuseMask);
18788 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18789 SubVectorsMask);
18790 } else {
18791 // Gather all constants.
18792 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
18793 for (auto [I, V] : enumerate(GatheredScalars)) {
18794 if (!isa<PoisonValue>(V))
18795 Mask[I] = I;
18796 }
18797 Value *BV = ShuffleBuilder.gather(GatheredScalars);
18798 ShuffleBuilder.add(BV, Mask);
18799 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18800 SubVectorsMask);
18801 }
18802
18803 if (NeedFreeze)
18804 Res = ShuffleBuilder.createFreeze(Res);
18805 return Res;
18806}
18807
18808Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
18809 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
18810 (void)vectorizeTree(VectorizableTree[EIdx].get());
18811 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
18812 Builder, *this);
18813}
18814
18815/// \returns \p I after propagating metadata from \p VL only for instructions in
18816/// \p VL.
18817 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
18818 SmallVector<Value *> Insts;
18819 for (Value *V : VL)
18820 if (isa<Instruction>(V))
18821 Insts.push_back(V);
18822 return llvm::propagateMetadata(Inst, Insts);
18823}
18824
18826 if (DebugLoc DL = PN.getDebugLoc())
18827 return DL;
18828 return DebugLoc::getUnknown();
18829}
18830
18831Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
18832 IRBuilderBase::InsertPointGuard Guard(Builder);
18833
18834 Value *V = E->Scalars.front();
18835 Type *ScalarTy = V->getType();
18836 if (!isa<CmpInst>(V))
18837 ScalarTy = getValueType(V);
18838 auto It = MinBWs.find(E);
18839 if (It != MinBWs.end()) {
18840 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
18841 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
18842 if (VecTy)
18843 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
18844 }
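// Editorial note: when E has an entry in MinBWs the node is emitted in the
// demoted integer type; e.g. (illustrative) an entry analyzed down to 16 bits
// produces an <N x i16> value here, and extensions back to the original width
// are inserted later wherever wider users require it.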
18845 if (E->VectorizedValue)
18846 return E->VectorizedValue;
18847 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
18848 if (E->isGather()) {
18849 // Set insert point for non-reduction initial nodes.
18850 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
18851 setInsertPointAfterBundle(E);
18852 Value *Vec = createBuildVector(E, ScalarTy);
18853 E->VectorizedValue = Vec;
18854 return Vec;
18855 }
18856 if (E->State == TreeEntry::SplitVectorize) {
18857 assert(E->CombinedEntriesWithIndices.size() == 2 &&
18858 "Expected exactly 2 combined entries.");
18859 setInsertPointAfterBundle(E);
18860 TreeEntry &OpTE1 =
18861 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
18862 assert(OpTE1.isSame(
18863 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
18864 "Expected same first part of scalars.");
18865 Value *Op1 = vectorizeTree(&OpTE1);
18866 TreeEntry &OpTE2 =
18867 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
18868 assert(
18869 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
18870 "Expected same second part of scalars.");
18871 Value *Op2 = vectorizeTree(&OpTE2);
18872 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
18873 bool IsSigned = false;
18874 auto It = MinBWs.find(OpE);
18875 if (It != MinBWs.end())
18876 IsSigned = It->second.second;
18877 else
18878 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
18879 if (isa<PoisonValue>(R))
18880 return false;
18881 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18882 });
18883 return IsSigned;
18884 };
18885 if (cast<VectorType>(Op1->getType())->getElementType() !=
18886 ScalarTy->getScalarType()) {
18887 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18888 Op1 = Builder.CreateIntCast(
18889 Op1,
18890 getWidenedType(
18891 ScalarTy,
18892 cast<FixedVectorType>(Op1->getType())->getNumElements()),
18893 GetOperandSignedness(&OpTE1));
18894 }
18895 if (cast<VectorType>(Op2->getType())->getElementType() !=
18896 ScalarTy->getScalarType()) {
18897 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
18898 Op2 = Builder.CreateIntCast(
18899 Op2,
18900 getWidenedType(
18901 ScalarTy,
18902 cast<FixedVectorType>(Op2->getType())->getNumElements()),
18903 GetOperandSignedness(&OpTE2));
18904 }
18905 if (E->ReorderIndices.empty()) {
18906 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
18907 std::iota(
18908 Mask.begin(),
18909 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
18910 0);
18911 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18912 if (ScalarTyNumElements != 1) {
18913 assert(SLPReVec && "Only supported by REVEC.");
18914 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
18915 }
18916 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
18917 Vec = createInsertVector(Builder, Vec, Op2,
18918 E->CombinedEntriesWithIndices.back().second *
18919 ScalarTyNumElements);
18920 E->VectorizedValue = Vec;
18921 return Vec;
18922 }
18923 unsigned CommonVF =
18924 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
18925 if (getNumElements(Op1->getType()) != CommonVF) {
18926 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18927 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
18928 0);
18929 Op1 = Builder.CreateShuffleVector(Op1, Mask);
18930 }
18931 if (getNumElements(Op2->getType()) != CommonVF) {
18932 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
18933 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
18934 0);
18935 Op2 = Builder.CreateShuffleVector(Op2, Mask);
18936 }
18937 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
18938 E->VectorizedValue = Vec;
18939 return Vec;
18940 }
18941
18942 bool IsReverseOrder =
18943 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
18944 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
18945 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
18946 if (E->getOpcode() == Instruction::Store &&
18947 E->State == TreeEntry::Vectorize) {
18948 ArrayRef<int> Mask =
18949 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
18950 E->ReorderIndices.size());
18951 ShuffleBuilder.add(V, Mask);
18952 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
18953 E->State == TreeEntry::CompressVectorize) {
18954 ShuffleBuilder.addOrdered(V, {});
18955 } else {
18956 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
18957 }
18958 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18959 E->CombinedEntriesWithIndices.size());
18960 transform(
18961 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
18962 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18963 });
18964 assert(
18965 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
18966 "Expected either combined subnodes or reordering");
18967 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
18968 };
18969
18970 assert(!E->isGather() && "Unhandled state");
18971 unsigned ShuffleOrOp =
18972 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
18973 Instruction *VL0 = E->getMainOp();
18974 auto GetOperandSignedness = [&](unsigned Idx) {
18975 const TreeEntry *OpE = getOperandEntry(E, Idx);
18976 bool IsSigned = false;
18977 auto It = MinBWs.find(OpE);
18978 if (It != MinBWs.end())
18979 IsSigned = It->second.second;
18980 else
18981 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19982 if (isa<PoisonValue>(R))
18983 return false;
18984 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18985 });
18986 return IsSigned;
18987 };
18988 switch (ShuffleOrOp) {
18989 case Instruction::PHI: {
18990 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
18991 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
18992 "PHI reordering is free.");
18993 auto *PH = cast<PHINode>(VL0);
18994 Builder.SetInsertPoint(PH->getParent(),
18995 PH->getParent()->getFirstNonPHIIt());
18997 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
18998 Value *V = NewPhi;
18999
19000 // Adjust insertion point once all PHI's have been generated.
19001 Builder.SetInsertPoint(PH->getParent(),
19002 PH->getParent()->getFirstInsertionPt());
19004
19005 V = FinalShuffle(V, E);
19006
19007 E->VectorizedValue = V;
19008 // If phi node is fully emitted - exit.
19009 if (NewPhi->getNumIncomingValues() != 0)
19010 return NewPhi;
19011
19012 // PHINodes may have multiple entries from the same block. We want to
19013 // visit every block once.
19014 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19015
19016 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19017 BasicBlock *IBB = PH->getIncomingBlock(I);
19018
19019 // Stop emission if all incoming values are generated.
19020 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19021 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19022 return NewPhi;
19023 }
19024
19025 if (!VisitedBBs.insert(IBB).second) {
19026 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19027 NewPhi->addIncoming(VecOp, IBB);
19028 TreeEntry *OpTE = getOperandEntry(E, I);
19029 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19030 OpTE->VectorizedValue = VecOp;
19031 continue;
19032 }
19033
19034 Builder.SetInsertPoint(IBB->getTerminator());
19036 Value *Vec = vectorizeOperand(E, I);
19037 if (VecTy != Vec->getType()) {
19038 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19039 MinBWs.contains(getOperandEntry(E, I))) &&
19040 "Expected item in MinBWs.");
19041 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19042 }
19043 NewPhi->addIncoming(Vec, IBB);
19044 }
19045
19046 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19047 "Invalid number of incoming values");
19048 assert(E->VectorizedValue && "Expected vectorized value.");
19049 return E->VectorizedValue;
19050 }
19051
19052 case Instruction::ExtractElement: {
19053 Value *V = E->getSingleOperand(0);
19054 setInsertPointAfterBundle(E);
19055 V = FinalShuffle(V, E);
19056 E->VectorizedValue = V;
19057 return V;
19058 }
19059 case Instruction::ExtractValue: {
19060 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19061 Builder.SetInsertPoint(LI);
19062 Value *Ptr = LI->getPointerOperand();
19063 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19064 Value *NewV = ::propagateMetadata(V, E->Scalars);
19065 NewV = FinalShuffle(NewV, E);
19066 E->VectorizedValue = NewV;
19067 return NewV;
19068 }
19069 case Instruction::InsertElement: {
19070 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19071 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19072 Value *V = vectorizeOperand(E, 1);
19073 ArrayRef<Value *> Op = E->getOperand(1);
19074 Type *ScalarTy = Op.front()->getType();
19075 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19076 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19077 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19078 assert(Res.first > 0 && "Expected item in MinBWs.");
19079 V = Builder.CreateIntCast(
19080 V,
19081 getWidenedType(
19082 ScalarTy,
19083 cast<FixedVectorType>(V->getType())->getNumElements()),
19084 Res.second);
19085 }
19086
19087 // Create InsertVector shuffle if necessary
19088 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19089 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19090 }));
19091 const unsigned NumElts =
19092 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19093 const unsigned NumScalars = E->Scalars.size();
19094
19095 unsigned Offset = *getElementIndex(VL0);
19096 assert(Offset < NumElts && "Failed to find vector index offset");
19097
19098 // Create shuffle to resize vector
19099 SmallVector<int> Mask;
19100 if (!E->ReorderIndices.empty()) {
19101 inversePermutation(E->ReorderIndices, Mask);
19102 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19103 } else {
19104 Mask.assign(NumElts, PoisonMaskElem);
19105 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19106 }
19107 // Create InsertVector shuffle if necessary
19108 bool IsIdentity = true;
19109 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19110 Mask.swap(PrevMask);
19111 for (unsigned I = 0; I < NumScalars; ++I) {
19112 Value *Scalar = E->Scalars[PrevMask[I]];
19113 unsigned InsertIdx = *getElementIndex(Scalar);
19114 IsIdentity &= InsertIdx - Offset == I;
19115 Mask[InsertIdx - Offset] = I;
19116 }
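// Editorial example: four scalars inserted into lanes 2..5 of an 8-wide
// destination give Offset == 2 and Mask == <0, 1, 2, 3, -1, -1, -1, -1>;
// IsIdentity stays true, but because NumElts != NumScalars a widening shuffle
// is still emitted below.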
19117 if (!IsIdentity || NumElts != NumScalars) {
19118 Value *V2 = nullptr;
19119 bool IsVNonPoisonous =
19121 SmallVector<int> InsertMask(Mask);
19122 if (NumElts != NumScalars && Offset == 0) {
19123 // Follow all insert element instructions from the current buildvector
19124 // sequence.
19125 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19126 do {
19127 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19128 if (!InsertIdx)
19129 break;
19130 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19131 InsertMask[*InsertIdx] = *InsertIdx;
19132 if (!Ins->hasOneUse())
19133 break;
19134 Ins = dyn_cast_or_null<InsertElementInst>(
19135 Ins->getUniqueUndroppableUser());
19136 } while (Ins);
19137 SmallBitVector UseMask =
19138 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19139 SmallBitVector IsFirstPoison =
19140 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19141 SmallBitVector IsFirstUndef =
19142 isUndefVector(FirstInsert->getOperand(0), UseMask);
19143 if (!IsFirstPoison.all()) {
19144 unsigned Idx = 0;
19145 for (unsigned I = 0; I < NumElts; I++) {
19146 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19147 IsFirstUndef.test(I)) {
19148 if (IsVNonPoisonous) {
19149 InsertMask[I] = I < NumScalars ? I : 0;
19150 continue;
19151 }
19152 if (!V2)
19153 V2 = UndefValue::get(V->getType());
19154 if (Idx >= NumScalars)
19155 Idx = NumScalars - 1;
19156 InsertMask[I] = NumScalars + Idx;
19157 ++Idx;
19158 } else if (InsertMask[I] != PoisonMaskElem &&
19159 Mask[I] == PoisonMaskElem) {
19160 InsertMask[I] = PoisonMaskElem;
19161 }
19162 }
19163 } else {
19164 InsertMask = Mask;
19165 }
19166 }
19167 if (!V2)
19168 V2 = PoisonValue::get(V->getType());
19169 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19170 if (auto *I = dyn_cast<Instruction>(V)) {
19171 GatherShuffleExtractSeq.insert(I);
19172 CSEBlocks.insert(I->getParent());
19173 }
19174 }
19175
19176 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19177 for (unsigned I = 0; I < NumElts; I++) {
19178 if (Mask[I] != PoisonMaskElem)
19179 InsertMask[Offset + I] = I;
19180 }
19181 SmallBitVector UseMask =
19182 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19183 SmallBitVector IsFirstUndef =
19184 isUndefVector(FirstInsert->getOperand(0), UseMask);
19185 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19186 NumElts != NumScalars) {
19187 if (IsFirstUndef.all()) {
19188 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19189 SmallBitVector IsFirstPoison =
19190 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19191 if (!IsFirstPoison.all()) {
19192 for (unsigned I = 0; I < NumElts; I++) {
19193 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19194 InsertMask[I] = I + NumElts;
19195 }
19196 }
19197 V = Builder.CreateShuffleVector(
19198 V,
19199 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19200 : FirstInsert->getOperand(0),
19201 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19202 if (auto *I = dyn_cast<Instruction>(V)) {
19203 GatherShuffleExtractSeq.insert(I);
19204 CSEBlocks.insert(I->getParent());
19205 }
19206 }
19207 } else {
19208 SmallBitVector IsFirstPoison =
19209 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19210 for (unsigned I = 0; I < NumElts; I++) {
19211 if (InsertMask[I] == PoisonMaskElem)
19212 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19213 else
19214 InsertMask[I] += NumElts;
19215 }
19216 V = Builder.CreateShuffleVector(
19217 FirstInsert->getOperand(0), V, InsertMask,
19218 cast<Instruction>(E->Scalars.back())->getName());
19219 if (auto *I = dyn_cast<Instruction>(V)) {
19220 GatherShuffleExtractSeq.insert(I);
19221 CSEBlocks.insert(I->getParent());
19222 }
19223 }
19224 }
19225
19226 ++NumVectorInstructions;
19227 E->VectorizedValue = V;
19228 return V;
19229 }
19230 case Instruction::ZExt:
19231 case Instruction::SExt:
19232 case Instruction::FPToUI:
19233 case Instruction::FPToSI:
19234 case Instruction::FPExt:
19235 case Instruction::PtrToInt:
19236 case Instruction::IntToPtr:
19237 case Instruction::SIToFP:
19238 case Instruction::UIToFP:
19239 case Instruction::Trunc:
19240 case Instruction::FPTrunc:
19241 case Instruction::BitCast: {
19242 setInsertPointAfterBundle(E);
19243
19244 Value *InVec = vectorizeOperand(E, 0);
19245
19246 auto *CI = cast<CastInst>(VL0);
19247 Instruction::CastOps VecOpcode = CI->getOpcode();
19248 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19249 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19250 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19251 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19252 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19253 // Check if the values are candidates to demote.
19254 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19255 if (SrcIt != MinBWs.end())
19256 SrcBWSz = SrcIt->second.first;
19257 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19258 if (BWSz == SrcBWSz) {
19259 VecOpcode = Instruction::BitCast;
19260 } else if (BWSz < SrcBWSz) {
19261 VecOpcode = Instruction::Trunc;
19262 } else if (It != MinBWs.end()) {
19263 assert(BWSz > SrcBWSz && "Invalid cast!");
19264 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19265 } else if (SrcIt != MinBWs.end()) {
19266 assert(BWSz > SrcBWSz && "Invalid cast!");
19267 VecOpcode =
19268 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19269 }
19270 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19271 !SrcIt->second.second) {
19272 VecOpcode = Instruction::UIToFP;
19273 }
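// Editorial example: for a sext from i8 to i32 where both this node and its
// operand were demoted to i16, no cast is emitted at all and the operand is
// reused; if only the operand was demoted, a sext or zext from <N x i16> is
// emitted instead, depending on the operand's recorded signedness.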
19274 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19275 ? InVec
19276 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19277 V = FinalShuffle(V, E);
19278
19279 E->VectorizedValue = V;
19280 ++NumVectorInstructions;
19281 return V;
19282 }
19283 case Instruction::FCmp:
19284 case Instruction::ICmp: {
19285 setInsertPointAfterBundle(E);
19286
19287 Value *L = vectorizeOperand(E, 0);
19288 Value *R = vectorizeOperand(E, 1);
19289 if (L->getType() != R->getType()) {
19290 assert((getOperandEntry(E, 0)->isGather() ||
19291 getOperandEntry(E, 1)->isGather() ||
19292 MinBWs.contains(getOperandEntry(E, 0)) ||
19293 MinBWs.contains(getOperandEntry(E, 1))) &&
19294 "Expected item in MinBWs.");
19295 if (cast<VectorType>(L->getType())
19296 ->getElementType()
19297 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19298 ->getElementType()
19299 ->getIntegerBitWidth()) {
19300 Type *CastTy = R->getType();
19301 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19302 } else {
19303 Type *CastTy = L->getType();
19304 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19305 }
19306 }
19307
19308 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19309 Value *V = Builder.CreateCmp(P0, L, R);
19310 propagateIRFlags(V, E->Scalars, VL0);
19311 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19312 ICmp->setSameSign(/*B=*/false);
19313 // Do not cast for cmps.
19314 VecTy = cast<FixedVectorType>(V->getType());
19315 V = FinalShuffle(V, E);
19316
19317 E->VectorizedValue = V;
19318 ++NumVectorInstructions;
19319 return V;
19320 }
19321 case Instruction::Select: {
19322 setInsertPointAfterBundle(E);
19323
19324 Value *Cond = vectorizeOperand(E, 0);
19325 Value *True = vectorizeOperand(E, 1);
19326 Value *False = vectorizeOperand(E, 2);
19327 if (True->getType() != VecTy || False->getType() != VecTy) {
19328 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19329 getOperandEntry(E, 2)->isGather() ||
19330 MinBWs.contains(getOperandEntry(E, 1)) ||
19331 MinBWs.contains(getOperandEntry(E, 2))) &&
19332 "Expected item in MinBWs.");
19333 if (True->getType() != VecTy)
19334 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19335 if (False->getType() != VecTy)
19336 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19337 }
19338
19339 unsigned CondNumElements = getNumElements(Cond->getType());
19340 unsigned TrueNumElements = getNumElements(True->getType());
19341 assert(TrueNumElements >= CondNumElements &&
19342 TrueNumElements % CondNumElements == 0 &&
19343 "Cannot vectorize Instruction::Select");
19344 assert(TrueNumElements == getNumElements(False->getType()) &&
19345 "Cannot vectorize Instruction::Select");
19346 if (CondNumElements != TrueNumElements) {
19347 // When the return type is i1 but the source is a fixed vector type, we
19348 // need to duplicate the condition value.
19349 Cond = Builder.CreateShuffleVector(
19350 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19351 CondNumElements));
19352 }
19353 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19354 "Cannot vectorize Instruction::Select");
19355 Value *V = Builder.CreateSelect(Cond, True, False);
19356 V = FinalShuffle(V, E);
19357
19358 E->VectorizedValue = V;
19359 ++NumVectorInstructions;
19360 return V;
19361 }
19362 case Instruction::FNeg: {
19363 setInsertPointAfterBundle(E);
19364
19365 Value *Op = vectorizeOperand(E, 0);
19366
19367 Value *V = Builder.CreateUnOp(
19368 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19369 propagateIRFlags(V, E->Scalars, VL0);
19370 if (auto *I = dyn_cast<Instruction>(V))
19371 V = ::propagateMetadata(I, E->Scalars);
19372
19373 V = FinalShuffle(V, E);
19374
19375 E->VectorizedValue = V;
19376 ++NumVectorInstructions;
19377
19378 return V;
19379 }
19380 case Instruction::Freeze: {
19381 setInsertPointAfterBundle(E);
19382
19383 Value *Op = vectorizeOperand(E, 0);
19384
19385 if (Op->getType() != VecTy) {
19386 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19387 MinBWs.contains(getOperandEntry(E, 0))) &&
19388 "Expected item in MinBWs.");
19389 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19390 }
19391 Value *V = Builder.CreateFreeze(Op);
19392 V = FinalShuffle(V, E);
19393
19394 E->VectorizedValue = V;
19395 ++NumVectorInstructions;
19396
19397 return V;
19398 }
19399 case Instruction::Add:
19400 case Instruction::FAdd:
19401 case Instruction::Sub:
19402 case Instruction::FSub:
19403 case Instruction::Mul:
19404 case Instruction::FMul:
19405 case Instruction::UDiv:
19406 case Instruction::SDiv:
19407 case Instruction::FDiv:
19408 case Instruction::URem:
19409 case Instruction::SRem:
19410 case Instruction::FRem:
19411 case Instruction::Shl:
19412 case Instruction::LShr:
19413 case Instruction::AShr:
19414 case Instruction::And:
19415 case Instruction::Or:
19416 case Instruction::Xor: {
19417 setInsertPointAfterBundle(E);
19418
19419 Value *LHS = vectorizeOperand(E, 0);
19420 Value *RHS = vectorizeOperand(E, 1);
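// Note: for an 'and' whose result is later truncated to It->second.first bits,
// a constant operand with at least that many trailing ones cannot change the
// surviving low bits, so the 'and' is dropped and the other operand is reused
// directly. E.g. with a minimized width of 8,
// 'and <4 x i32> %x, splat (i32 255)' is replaced by %x.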
19421 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19422 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19423 ArrayRef<Value *> Ops = E->getOperand(I);
19424 if (all_of(Ops, [&](Value *Op) {
19425 auto *CI = dyn_cast<ConstantInt>(Op);
19426 return CI && CI->getValue().countr_one() >= It->second.first;
19427 })) {
19428 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
19429 E->VectorizedValue = V;
19430 ++NumVectorInstructions;
19431 return V;
19432 }
19433 }
19434 }
19435 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19436 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19437 getOperandEntry(E, 1)->isGather() ||
19438 MinBWs.contains(getOperandEntry(E, 0)) ||
19439 MinBWs.contains(getOperandEntry(E, 1))) &&
19440 "Expected item in MinBWs.");
19441 if (LHS->getType() != VecTy)
19442 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19443 if (RHS->getType() != VecTy)
19444 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19445 }
19446
19447 Value *V = Builder.CreateBinOp(
19448 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19449 RHS);
19450 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19451 if (auto *I = dyn_cast<Instruction>(V)) {
19452 V = ::propagateMetadata(I, E->Scalars);
19453 // Drop nuw flags for abs(sub(commutative), true).
19454 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19455 any_of(E->Scalars, [](Value *V) {
19456 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19457 }))
19458 I->setHasNoUnsignedWrap(/*b=*/false);
19459 }
19460
19461 V = FinalShuffle(V, E);
19462
19463 E->VectorizedValue = V;
19464 ++NumVectorInstructions;
19465
19466 return V;
19467 }
19468 case Instruction::Load: {
19469 // Loads are inserted at the head of the tree because we don't want to
19470 // sink them all the way down past store instructions.
19471 setInsertPointAfterBundle(E);
19472
19473 LoadInst *LI = cast<LoadInst>(VL0);
19474 Instruction *NewLI;
19475 Value *PO = LI->getPointerOperand();
19476 if (E->State == TreeEntry::Vectorize) {
19477 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19478 } else if (E->State == TreeEntry::CompressVectorize) {
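// Note (illustration): a "compressed" load reads one wider, contiguous
// (possibly masked) vector and then shuffles the needed lanes into place.
// E.g. for scalar loads of a[0], a[2] and a[3], LoadVecTy covers a[0..3],
// the mask enables lanes {0,2,3} and the final shufflevector uses
// CompressMask = <0,2,3>.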
19479 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19480 CompressEntryToData.at(E);
19481 Align CommonAlignment = LI->getAlign();
19482 if (IsMasked) {
19483 unsigned VF = getNumElements(LoadVecTy);
19484 SmallVector<Constant *> MaskValues(
19485 VF / getNumElements(LI->getType()),
19486 ConstantInt::getFalse(VecTy->getContext()));
19487 for (int I : CompressMask)
19488 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19489 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19490 assert(SLPReVec && "Only supported by REVEC.");
19491 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19492 }
19493 Constant *MaskValue = ConstantVector::get(MaskValues);
19494 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19495 MaskValue);
19496 } else {
19497 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19498 }
19499 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19500 // TODO: include this cost into CommonCost.
19501 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19502 assert(SLPReVec && "FixedVectorType is not expected.");
19503 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19504 CompressMask);
19505 }
19506 NewLI =
19507 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19508 } else if (E->State == TreeEntry::StridedVectorize) {
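// Note: strided loads are emitted as llvm.experimental.vp.strided.load with a
// byte stride. When the pointer difference between the first and last element
// is known at compile time the stride is a constant, e.g. four i32 loads of
// a[0], a[3], a[6], a[9] give an element stride of 3 and a byte stride of 12;
// otherwise the stride is computed at runtime. The stride is negated for
// reverse-ordered accesses.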
19509 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19510 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19511 PO = IsReverseOrder ? PtrN : Ptr0;
19512 std::optional<int64_t> Diff = getPointersDiff(
19513 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
19514 Type *StrideTy = DL->getIndexType(PO->getType());
19515 Value *StrideVal;
19516 if (Diff) {
19517 int64_t Stride =
19518 *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
19519 StrideVal =
19520 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
19521 DL->getTypeAllocSize(ScalarTy));
19522 } else {
19523 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
19524 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
19525 return cast<LoadInst>(V)->getPointerOperand();
19526 });
19527 OrdersType Order;
19528 std::optional<Value *> Stride =
19529 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
19530 &*Builder.GetInsertPoint());
19531 Value *NewStride =
19532 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
19533 StrideVal = Builder.CreateMul(
19534 NewStride,
19535 ConstantInt::get(
19536 StrideTy,
19537 (IsReverseOrder ? -1 : 1) *
19538 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
19539 }
19540 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19541 auto *Inst = Builder.CreateIntrinsic(
19542 Intrinsic::experimental_vp_strided_load,
19543 {VecTy, PO->getType(), StrideTy},
19544 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
19545 Builder.getInt32(E->Scalars.size())});
19546 Inst->addParamAttr(
19547 /*ArgNo=*/0,
19548 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19549 NewLI = Inst;
19550 } else {
19551 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
19552 Value *VecPtr = vectorizeOperand(E, 0);
19553 if (isa<FixedVectorType>(ScalarTy)) {
19554 assert(SLPReVec && "FixedVectorType is not expected.");
19555 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
19556 // to expand VecPtr if ScalarTy is a vector type.
19557 unsigned ScalarTyNumElements =
19558 cast<FixedVectorType>(ScalarTy)->getNumElements();
19559 unsigned VecTyNumElements =
19560 cast<FixedVectorType>(VecTy)->getNumElements();
19561 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19562 "Cannot expand getelementptr.");
19563 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19564 SmallVector<Constant *> Indices(VecTyNumElements);
19565 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19566 return Builder.getInt64(I % ScalarTyNumElements);
19567 });
19568 VecPtr = Builder.CreateGEP(
19569 VecTy->getElementType(),
19570 Builder.CreateShuffleVector(
19571 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19572 ConstantVector::get(Indices));
19573 }
19574 // Use the minimum alignment of the gathered loads.
19575 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19576 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19577 }
19578 Value *V = E->State == TreeEntry::CompressVectorize
19579 ? NewLI
19580 : ::propagateMetadata(NewLI, E->Scalars);
19581
19582 V = FinalShuffle(V, E);
19583 E->VectorizedValue = V;
19584 ++NumVectorInstructions;
19585 return V;
19586 }
19587 case Instruction::Store: {
19588 auto *SI = cast<StoreInst>(VL0);
19589
19590 setInsertPointAfterBundle(E);
19591
19592 Value *VecValue = vectorizeOperand(E, 0);
19593 if (VecValue->getType() != VecTy)
19594 VecValue =
19595 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19596 VecValue = FinalShuffle(VecValue, E);
19597
19598 Value *Ptr = SI->getPointerOperand();
19599 Instruction *ST;
19600 if (E->State == TreeEntry::Vectorize) {
19601 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19602 } else {
19603 assert(E->State == TreeEntry::StridedVectorize &&
19604 "Expected either strided or consecutive stores.");
19605 if (!E->ReorderIndices.empty()) {
19606 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19607 Ptr = SI->getPointerOperand();
19608 }
19609 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19610 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19611 auto *Inst = Builder.CreateIntrinsic(
19612 Intrinsic::experimental_vp_strided_store,
19613 {VecTy, Ptr->getType(), StrideTy},
19614 {VecValue, Ptr,
19615 ConstantInt::get(
19616 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19617 Builder.getAllOnesMask(VecTy->getElementCount()),
19618 Builder.getInt32(E->Scalars.size())});
19619 Inst->addParamAttr(
19620 /*ArgNo=*/1,
19621 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19622 ST = Inst;
19623 }
19624
19625 Value *V = ::propagateMetadata(ST, E->Scalars);
19626
19627 E->VectorizedValue = V;
19628 ++NumVectorInstructions;
19629 return V;
19630 }
19631 case Instruction::GetElementPtr: {
19632 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19633 setInsertPointAfterBundle(E);
19634
19635 Value *Op0 = vectorizeOperand(E, 0);
19636
19637 SmallVector<Value *> OpVecs;
19638 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19639 Value *OpVec = vectorizeOperand(E, J);
19640 OpVecs.push_back(OpVec);
19641 }
19642
19643 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19644 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19645 SmallVector<Value *> GEPs;
19646 for (Value *V : E->Scalars) {
19647 if (isa<GetElementPtrInst>(V))
19648 GEPs.push_back(V);
19649 }
19650 V = ::propagateMetadata(I, GEPs);
19651 }
19652
19653 V = FinalShuffle(V, E);
19654
19655 E->VectorizedValue = V;
19656 ++NumVectorInstructions;
19657
19658 return V;
19659 }
19660 case Instruction::Call: {
19661 CallInst *CI = cast<CallInst>(VL0);
19662 setInsertPointAfterBundle(E);
19663
19664 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19665
19666 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19667 CI, ID, VecTy->getNumElements(),
19668 It != MinBWs.end() ? It->second.first : 0, TTI);
19669 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19670 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19671 VecCallCosts.first <= VecCallCosts.second;
19672
19673 Value *ScalarArg = nullptr;
19674 SmallVector<Value *> OpVecs;
19675 SmallVector<Type *, 2> TysForDecl;
19676 // Add return type if intrinsic is overloaded on it.
19677 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19678 TysForDecl.push_back(VecTy);
19679 auto *CEI = cast<CallInst>(VL0);
19680 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19681 // Some intrinsics have scalar arguments. This argument should not be
19682 // vectorized.
19683 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19684 ScalarArg = CEI->getArgOperand(I);
19685 // If we decided to reduce the bitwidth of the abs intrinsic, its second argument
19686 // must be set to false (do not return poison if the value is the signed minimum).
19687 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19688 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19689 ScalarArg = Builder.getFalse();
19690 OpVecs.push_back(ScalarArg);
19691 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19692 TysForDecl.push_back(ScalarArg->getType());
19693 continue;
19694 }
19695
19696 Value *OpVec = vectorizeOperand(E, I);
19697 ScalarArg = CEI->getArgOperand(I);
19698 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19699 ScalarArg->getType()->getScalarType() &&
19700 It == MinBWs.end()) {
19701 auto *CastTy =
19702 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19703 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19704 } else if (It != MinBWs.end()) {
19705 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19706 }
19707 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19708 OpVecs.push_back(OpVec);
19709 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19710 TysForDecl.push_back(OpVec->getType());
19711 }
19712
19713 Function *CF;
19714 if (!UseIntrinsic) {
19715 VFShape Shape =
19716 VFShape::get(CI->getFunctionType(),
19717 ElementCount::getFixed(static_cast<unsigned>(VecTy->getNumElements())),
19718 false /*HasGlobalPred*/);
19719 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19720 } else {
19721 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19722 }
19723
19724 SmallVector<OperandBundleDef, 1> OpBundles;
19725 CI->getOperandBundlesAsDefs(OpBundles);
19726 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19727
19728 propagateIRFlags(V, E->Scalars, VL0);
19729 V = FinalShuffle(V, E);
19730
19731 E->VectorizedValue = V;
19732 ++NumVectorInstructions;
19733 return V;
19734 }
19735 case Instruction::ShuffleVector: {
19736 Value *V;
19737 if (SLPReVec && !E->isAltShuffle()) {
19738 setInsertPointAfterBundle(E);
19739 Value *Src = vectorizeOperand(E, 0);
19740 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19741 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19742 SmallVector<int> NewMask(ThisMask.size());
19743 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19744 return SVSrc->getShuffleMask()[Mask];
19745 });
19746 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19747 SVSrc->getOperand(1), NewMask);
19748 } else {
19749 V = Builder.CreateShuffleVector(Src, ThisMask);
19750 }
19751 propagateIRFlags(V, E->Scalars, VL0);
19752 if (auto *I = dyn_cast<Instruction>(V))
19753 V = ::propagateMetadata(I, E->Scalars);
19754 V = FinalShuffle(V, E);
19755 } else {
19756 assert(E->isAltShuffle() &&
19757 ((Instruction::isBinaryOp(E->getOpcode()) &&
19758 Instruction::isBinaryOp(E->getAltOpcode())) ||
19759 (Instruction::isCast(E->getOpcode()) &&
19760 Instruction::isCast(E->getAltOpcode())) ||
19761 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19762 "Invalid Shuffle Vector Operand");
19763
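// Note (illustration): for an alternate-opcode node both opcodes are applied
// to the whole vector and the per-lane results are blended with a shuffle.
// E.g. scalars {a0+b0, a1-b1, a2+b2, a3-b3} become
//   %v0 = add <4 x i32> %a, %b
//   %v1 = sub <4 x i32> %a, %b
//   %r  = shufflevector %v0, %v1, <0, 5, 2, 7>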
19764 Value *LHS = nullptr, *RHS = nullptr;
19765 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19766 setInsertPointAfterBundle(E);
19767 LHS = vectorizeOperand(E, 0);
19768 RHS = vectorizeOperand(E, 1);
19769 } else {
19770 setInsertPointAfterBundle(E);
19771 LHS = vectorizeOperand(E, 0);
19772 }
19773 if (LHS && RHS &&
19774 ((Instruction::isBinaryOp(E->getOpcode()) &&
19775 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19776 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19777 assert((It != MinBWs.end() ||
19778 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19779 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19780 MinBWs.contains(getOperandEntry(E, 0)) ||
19781 MinBWs.contains(getOperandEntry(E, 1))) &&
19782 "Expected item in MinBWs.");
19783 Type *CastTy = VecTy;
19784 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
19785 if (cast<VectorType>(LHS->getType())
19786 ->getElementType()
19787 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
19788 ->getElementType()
19789 ->getIntegerBitWidth())
19790 CastTy = RHS->getType();
19791 else
19792 CastTy = LHS->getType();
19793 }
19794 if (LHS->getType() != CastTy)
19795 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
19796 if (RHS->getType() != CastTy)
19797 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
19798 }
19799
19800 Value *V0, *V1;
19801 if (Instruction::isBinaryOp(E->getOpcode())) {
19802 V0 = Builder.CreateBinOp(
19803 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
19804 V1 = Builder.CreateBinOp(
19805 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
19806 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
19807 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
19808 auto *AltCI = cast<CmpInst>(E->getAltOp());
19809 CmpInst::Predicate AltPred = AltCI->getPredicate();
19810 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
19811 } else {
19812 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
19813 unsigned SrcBWSz = DL->getTypeSizeInBits(
19814 cast<VectorType>(LHS->getType())->getElementType());
19815 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
19816 if (BWSz <= SrcBWSz) {
19817 if (BWSz < SrcBWSz)
19818 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
19819 assert(LHS->getType() == VecTy &&
19820 "Expected same type as operand.");
19821 if (auto *I = dyn_cast<Instruction>(LHS))
19822 LHS = ::propagateMetadata(I, E->Scalars);
19823 LHS = FinalShuffle(LHS, E);
19824 E->VectorizedValue = LHS;
19825 ++NumVectorInstructions;
19826 return LHS;
19827 }
19828 }
19829 V0 = Builder.CreateCast(
19830 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
19831 V1 = Builder.CreateCast(
19832 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
19833 }
19834 // Add V0 and V1 to later analysis to try to find and remove matching
19835 // instruction, if any.
19836 for (Value *V : {V0, V1}) {
19837 if (auto *I = dyn_cast<Instruction>(V)) {
19838 GatherShuffleExtractSeq.insert(I);
19839 CSEBlocks.insert(I->getParent());
19840 }
19841 }
19842
19843 // Create shuffle to take alternate operations from the vector.
19844 // Also, gather up main and alt scalar ops to propagate IR flags to
19845 // each vector operation.
19846 ValueList OpScalars, AltScalars;
19847 SmallVector<int> Mask;
19848 E->buildAltOpShuffleMask(
19849 [E, this](Instruction *I) {
19850 assert(E->getMatchingMainOpOrAltOp(I) &&
19851 "Unexpected main/alternate opcode");
19852 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
19853 *TLI);
19854 },
19855 Mask, &OpScalars, &AltScalars);
19856
19857 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
19858 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
19859 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
19860 // Drop nuw flags for abs(sub(commutative), true).
19861 if (auto *I = dyn_cast<Instruction>(Vec);
19862 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
19863 any_of(E->Scalars, [](Value *V) {
19864 if (isa<PoisonValue>(V))
19865 return false;
19866 auto *IV = cast<Instruction>(V);
19867 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
19868 }))
19869 I->setHasNoUnsignedWrap(/*b=*/false);
19870 };
19871 DropNuwFlag(V0, E->getOpcode());
19872 DropNuwFlag(V1, E->getAltOpcode());
19873
19874 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
19875 assert(SLPReVec && "FixedVectorType is not expected.");
19876 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
19877 }
19878 V = Builder.CreateShuffleVector(V0, V1, Mask);
19879 if (auto *I = dyn_cast<Instruction>(V)) {
19880 V = ::propagateMetadata(I, E->Scalars);
19881 GatherShuffleExtractSeq.insert(I);
19882 CSEBlocks.insert(I->getParent());
19883 }
19884 }
19885
19886 E->VectorizedValue = V;
19887 ++NumVectorInstructions;
19888
19889 return V;
19890 }
19891 default:
19892 llvm_unreachable("unknown inst");
19893 }
19894 return nullptr;
19895}
19896
19897 Value *BoUpSLP::vectorizeTree() {
19898 ExtraValueToDebugLocsMap ExternallyUsedValues;
19899 return vectorizeTree(ExternallyUsedValues);
19900}
19901
19902 Value *BoUpSLP::vectorizeTree(
19903 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
19904 Instruction *ReductionRoot,
19905 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
19906 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
19907 // need to rebuild it.
19908 EntryToLastInstruction.clear();
19909 // All blocks must be scheduled before any instructions are inserted.
19910 for (auto &BSIter : BlocksSchedules)
19911 scheduleBlock(*this, BSIter.second.get());
19912 // Cache last instructions for the nodes to avoid side effects, which may
19913 // appear during vectorization, like extra uses, etc.
19914 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19915 if (TE->isGather())
19916 continue;
19917 (void)getLastInstructionInBundle(TE.get());
19918 }
19919
19920 if (ReductionRoot)
19921 Builder.SetInsertPoint(ReductionRoot->getParent(),
19922 ReductionRoot->getIterator());
19923 else
19924 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
19925
19926 // Vectorize gather operands of the nodes with the external uses only.
19927 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
19928 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19929 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
19930 TE->UserTreeIndex.UserTE->hasState() &&
19931 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
19932 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
19933 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
19934 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
19935 all_of(TE->UserTreeIndex.UserTE->Scalars,
19936 [](Value *V) { return isUsedOutsideBlock(V); })) {
19937 Instruction &LastInst =
19938 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
19939 GatherEntries.emplace_back(TE.get(), &LastInst);
19940 }
19941 }
19942 for (auto &Entry : GatherEntries) {
19943 IRBuilderBase::InsertPointGuard Guard(Builder);
19944 Builder.SetInsertPoint(Entry.second);
19945 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
19946 (void)vectorizeTree(Entry.first);
19947 }
19948 // Emit gathered loads first to emit better code for the users of those
19949 // gathered loads.
19950 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19951 if (GatheredLoadsEntriesFirst.has_value() &&
19952 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
19953 (!TE->isGather() || TE->UserTreeIndex)) {
19954 assert((TE->UserTreeIndex ||
19955 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
19956 "Expected gathered load node.");
19957 (void)vectorizeTree(TE.get());
19958 }
19959 }
19960 (void)vectorizeTree(VectorizableTree[0].get());
19961 // Run through the list of postponed gathers and emit them, replacing the temp
19962 // emitted allocas with actual vector instructions.
19963 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
19964 SmallDenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
19965 for (const TreeEntry *E : PostponedNodes) {
19966 auto *TE = const_cast<TreeEntry *>(E);
19967 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
19968 TE->VectorizedValue = nullptr;
19969 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
19970 // If the user is a PHI node, its vector code has to be inserted right before
19971 // the block terminator. Since the node was delayed, there were some unresolved
19972 // dependencies at the moment when the stub instruction was emitted. If any of
19973 // these dependencies turns out to be an operand of another PHI coming from this
19974 // same block, the position of the stub instruction becomes invalid. This is
19975 // because the source vector that is supposed to feed this gather node was
19976 // inserted at the end of the block [after the stub instruction]. So we need
19977 // to adjust the insertion point again, to the end of the block.
19978 if (isa<PHINode>(UserI)) {
19979 // Insert before all users.
19980 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
19981 for (User *U : PrevVec->users()) {
19982 if (U == UserI)
19983 continue;
19984 auto *UI = dyn_cast<Instruction>(U);
19985 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
19986 continue;
19987 if (UI->comesBefore(InsertPt))
19988 InsertPt = UI;
19989 }
19990 Builder.SetInsertPoint(InsertPt);
19991 } else {
19992 Builder.SetInsertPoint(PrevVec);
19993 }
19994 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
19995 Value *Vec = vectorizeTree(TE);
19996 if (auto *VecI = dyn_cast<Instruction>(Vec);
19997 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
19998 Builder.GetInsertPoint()->comesBefore(VecI))
19999 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20000 Builder.GetInsertPoint());
20001 if (Vec->getType() != PrevVec->getType()) {
20002 assert(Vec->getType()->isIntOrIntVectorTy() &&
20003 PrevVec->getType()->isIntOrIntVectorTy() &&
20004 "Expected integer vector types only.");
20005 std::optional<bool> IsSigned;
20006 for (Value *V : TE->Scalars) {
20007 if (isVectorized(V)) {
20008 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20009 auto It = MinBWs.find(MNTE);
20010 if (It != MinBWs.end()) {
20011 IsSigned = IsSigned.value_or(false) || It->second.second;
20012 if (*IsSigned)
20013 break;
20014 }
20015 }
20016 if (IsSigned.value_or(false))
20017 break;
20018 // Scan through gather nodes.
20019 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20020 auto It = MinBWs.find(BVE);
20021 if (It != MinBWs.end()) {
20022 IsSigned = IsSigned.value_or(false) || It->second.second;
20023 if (*IsSigned)
20024 break;
20025 }
20026 }
20027 if (IsSigned.value_or(false))
20028 break;
20029 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20030 IsSigned =
20031 IsSigned.value_or(false) ||
20032 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20033 continue;
20034 }
20035 if (IsSigned.value_or(false))
20036 break;
20037 }
20038 }
20039 if (IsSigned.value_or(false)) {
20040 // Final attempt - check user node.
20041 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20042 if (It != MinBWs.end())
20043 IsSigned = It->second.second;
20044 }
20045 assert(IsSigned &&
20046 "Expected user node or perfect diamond match in MinBWs.");
20047 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20048 }
20049 PrevVec->replaceAllUsesWith(Vec);
20050 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20051 // Replace the stub vector node, if it was used before for one of the
20052 // buildvector nodes already.
20053 auto It = PostponedValues.find(PrevVec);
20054 if (It != PostponedValues.end()) {
20055 for (TreeEntry *VTE : It->getSecond())
20056 VTE->VectorizedValue = Vec;
20057 }
20058 eraseInstruction(PrevVec);
20059 }
20060
20061 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20062 << " values .\n");
20063
20064 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20065 // Maps vector instruction to original insertelement instruction
20066 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20067 // Maps extract Scalar to the corresponding extractelement instruction in the
20068 // basic block. Only one extractelement per block should be emitted.
20069 SmallDenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20070 ScalarToEEs;
20071 SmallDenseSet<Value *, 4> UsedInserts;
20072 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20073 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20074 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20075 // Extract all of the elements with the external uses.
20076 for (const auto &ExternalUse : ExternalUses) {
20077 Value *Scalar = ExternalUse.Scalar;
20078 llvm::User *User = ExternalUse.User;
20079
20080 // Skip users that we already RAUW. This happens when one instruction
20081 // has multiple uses of the same value.
20082 if (User && !is_contained(Scalar->users(), User))
20083 continue;
20084 const TreeEntry *E = &ExternalUse.E;
20085 assert(E && "Invalid scalar");
20086 assert(!E->isGather() && "Extracting from a gather list");
20087 // Non-instruction pointers are not deleted, just skip them.
20088 if (E->getOpcode() == Instruction::GetElementPtr &&
20089 !isa<GetElementPtrInst>(Scalar))
20090 continue;
20091
20092 Value *Vec = E->VectorizedValue;
20093 assert(Vec && "Can't find vectorizable value");
20094
20095 Value *Lane = Builder.getInt32(ExternalUse.Lane);
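// ExtractAndExtendIfNeeded extracts the requested lane from the vectorized
// value, reusing an already emitted extractelement in the same block when
// possible, and sign-/zero-extends the result if the tree was vectorized at a
// narrower integer width than the original scalar.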
20096 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20097 if (Scalar->getType() != Vec->getType()) {
20098 Value *Ex = nullptr;
20099 Value *ExV = nullptr;
20100 auto *Inst = dyn_cast<Instruction>(Scalar);
20101 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20102 auto It = ScalarToEEs.find(Scalar);
20103 if (It != ScalarToEEs.end()) {
20104 // No need to emit many extracts, just move the only one in the
20105 // current block.
20106 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20107 : Builder.GetInsertBlock());
20108 if (EEIt != It->second.end()) {
20109 Value *PrevV = EEIt->second.first;
20110 if (auto *I = dyn_cast<Instruction>(PrevV);
20111 I && !ReplaceInst &&
20112 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20113 Builder.GetInsertPoint()->comesBefore(I)) {
20114 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20115 Builder.GetInsertPoint());
20116 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20117 CI->moveAfter(I);
20118 }
20119 Ex = PrevV;
20120 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20121 }
20122 }
20123 if (!Ex) {
20124 // "Reuse" the existing extract to improve final codegen.
20125 if (ReplaceInst) {
20126 // Leave the instruction as is, if that gives cheaper extracts and all
20127 // operands are scalar.
20128 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20129 IgnoredExtracts.insert(EE);
20130 Ex = EE;
20131 } else {
20132 auto *CloneInst = Inst->clone();
20133 CloneInst->insertBefore(Inst->getIterator());
20134 if (Inst->hasName())
20135 CloneInst->takeName(Inst);
20136 Ex = CloneInst;
20137 }
20138 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20139 ES && isa<Instruction>(Vec)) {
20140 Value *V = ES->getVectorOperand();
20141 auto *IVec = cast<Instruction>(Vec);
20142 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20143 V = ETEs.front()->VectorizedValue;
20144 if (auto *IV = dyn_cast<Instruction>(V);
20145 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20146 IV->comesBefore(IVec))
20147 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20148 else
20149 Ex = Builder.CreateExtractElement(Vec, Lane);
20150 } else if (auto *VecTy =
20151 dyn_cast<FixedVectorType>(Scalar->getType())) {
20152 assert(SLPReVec && "FixedVectorType is not expected.");
20153 unsigned VecTyNumElements = VecTy->getNumElements();
20154 // When REVEC is enabled, we need to extract a vector.
20155 // Note: The element size of Scalar may be different from the
20156 // element size of Vec.
20157 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20158 ExternalUse.Lane * VecTyNumElements);
20159 } else {
20160 Ex = Builder.CreateExtractElement(Vec, Lane);
20161 }
20162 // If necessary, sign-extend or zero-extend ScalarRoot
20163 // to the larger type.
20164 ExV = Ex;
20165 if (Scalar->getType() != Ex->getType())
20166 ExV = Builder.CreateIntCast(
20167 Ex, Scalar->getType(),
20168 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20169 auto *I = dyn_cast<Instruction>(Ex);
20170 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20171 : &F->getEntryBlock(),
20172 std::make_pair(Ex, ExV));
20173 }
20174 // The then-branch of the previous if may produce constants, since
20175 // operand 0 might be a constant.
20176 if (auto *ExI = dyn_cast<Instruction>(Ex);
20177 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20178 GatherShuffleExtractSeq.insert(ExI);
20179 CSEBlocks.insert(ExI->getParent());
20180 }
20181 return ExV;
20182 }
20183 assert(isa<FixedVectorType>(Scalar->getType()) &&
20184 isa<InsertElementInst>(Scalar) &&
20185 "In-tree scalar of vector type is not insertelement?");
20186 auto *IE = cast<InsertElementInst>(Scalar);
20187 VectorToInsertElement.try_emplace(Vec, IE);
20188 return Vec;
20189 };
20190 // If User == nullptr, the Scalar remains as scalar in vectorized
20191 // instructions or is used as extra arg. Generate ExtractElement instruction
20192 // and update the record for this scalar in ExternallyUsedValues.
20193 if (!User) {
20194 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20195 continue;
20196 assert(
20197 (ExternallyUsedValues.count(Scalar) ||
20198 ExternalUsesWithNonUsers.count(Scalar) ||
20199 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20200 any_of(
20201 Scalar->users(),
20202 [&, TTI = TTI](llvm::User *U) {
20203 if (ExternalUsesAsOriginalScalar.contains(U))
20204 return true;
20205 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20206 return !UseEntries.empty() &&
20207 (E->State == TreeEntry::Vectorize ||
20208 E->State == TreeEntry::StridedVectorize ||
20209 E->State == TreeEntry::CompressVectorize) &&
20210 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20211 return (UseEntry->State == TreeEntry::Vectorize ||
20212 UseEntry->State ==
20213 TreeEntry::StridedVectorize ||
20214 UseEntry->State ==
20215 TreeEntry::CompressVectorize) &&
20216 doesInTreeUserNeedToExtract(
20217 Scalar, getRootEntryInstruction(*UseEntry),
20218 TLI, TTI);
20219 });
20220 })) &&
20221 "Scalar with nullptr User must be registered in "
20222 "ExternallyUsedValues map or remain as scalar in vectorized "
20223 "instructions");
20224 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20225 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20226 if (PHI->getParent()->isLandingPad())
20227 Builder.SetInsertPoint(
20228 PHI->getParent(),
20229 std::next(
20230 PHI->getParent()->getLandingPadInst()->getIterator()));
20231 else
20232 Builder.SetInsertPoint(PHI->getParent(),
20233 PHI->getParent()->getFirstNonPHIIt());
20234 } else {
20235 Builder.SetInsertPoint(VecI->getParent(),
20236 std::next(VecI->getIterator()));
20237 }
20238 } else {
20239 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20240 }
20241 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20242 // Required to update internally referenced instructions.
20243 if (Scalar != NewInst) {
20244 assert((!isa<ExtractElementInst>(Scalar) ||
20245 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20246 "Extractelements should not be replaced.");
20247 Scalar->replaceAllUsesWith(NewInst);
20248 }
20249 continue;
20250 }
20251
20252 if (auto *VU = dyn_cast<InsertElementInst>(User);
20253 VU && VU->getOperand(1) == Scalar) {
20254 // Skip if the scalar is another vector op or Vec is not an instruction.
20255 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20256 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20257 if (!UsedInserts.insert(VU).second)
20258 continue;
20259 // Need to use original vector, if the root is truncated.
20260 auto BWIt = MinBWs.find(E);
20261 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20262 auto *ScalarTy = FTy->getElementType();
20263 auto Key = std::make_pair(Vec, ScalarTy);
20264 auto VecIt = VectorCasts.find(Key);
20265 if (VecIt == VectorCasts.end()) {
20266 IRBuilderBase::InsertPointGuard Guard(Builder);
20267 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20268 if (IVec->getParent()->isLandingPad())
20269 Builder.SetInsertPoint(IVec->getParent(),
20270 std::next(IVec->getParent()
20271 ->getLandingPadInst()
20272 ->getIterator()));
20273 else
20274 Builder.SetInsertPoint(
20275 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20276 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20277 Builder.SetInsertPoint(IVec->getNextNode());
20278 }
20279 Vec = Builder.CreateIntCast(
20280 Vec,
20281 getWidenedType(
20282 ScalarTy,
20283 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20284 BWIt->second.second);
20285 VectorCasts.try_emplace(Key, Vec);
20286 } else {
20287 Vec = VecIt->second;
20288 }
20289 }
20290
20291 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20292 if (InsertIdx) {
20293 auto *It = find_if(
20294 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20295 // Checks if 2 insertelements are from the same buildvector.
20296 InsertElementInst *VecInsert = Data.InsertElements.front();
20297 return areTwoInsertFromSameBuildVector(
20298 VU, VecInsert,
20299 [](InsertElementInst *II) { return II->getOperand(0); });
20300 });
20301 unsigned Idx = *InsertIdx;
20302 if (It == ShuffledInserts.end()) {
20303 (void)ShuffledInserts.emplace_back();
20304 It = std::next(ShuffledInserts.begin(),
20305 ShuffledInserts.size() - 1);
20306 }
20307 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20308 if (Mask.empty())
20309 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20310 Mask[Idx] = ExternalUse.Lane;
20311 It->InsertElements.push_back(cast<InsertElementInst>(User));
20312 continue;
20313 }
20314 }
20315 }
20316 }
20317
20318 // Generate extracts for out-of-tree users.
20319 // Find the insertion point for the extractelement lane.
20320 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20321 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20322 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20323 if (PH->getIncomingValue(I) == Scalar) {
20324 Instruction *IncomingTerminator =
20325 PH->getIncomingBlock(I)->getTerminator();
20326 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20327 Builder.SetInsertPoint(VecI->getParent(),
20328 std::next(VecI->getIterator()));
20329 } else {
20330 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20331 }
20332 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20333 PH->setOperand(I, NewInst);
20334 }
20335 }
20336 } else {
20337 Builder.SetInsertPoint(cast<Instruction>(User));
20338 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20339 User->replaceUsesOfWith(Scalar, NewInst);
20340 }
20341 } else {
20342 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20343 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20344 User->replaceUsesOfWith(Scalar, NewInst);
20345 }
20346
20347 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20348 }
20349
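// CreateShuffle interprets Mask as indexing into the concatenation of V1 and
// V2 (indices >= VF select from V2), splits it into two per-source masks and
// emits the shuffle through ShuffleInstructionBuilder.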
20350 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20351 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20352 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20353 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20354 for (int I = 0, E = Mask.size(); I < E; ++I) {
20355 if (Mask[I] < VF)
20356 CombinedMask1[I] = Mask[I];
20357 else
20358 CombinedMask2[I] = Mask[I] - VF;
20359 }
20360 ShuffleInstructionBuilder ShuffleBuilder(
20361 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20362 ShuffleBuilder.add(V1, CombinedMask1);
20363 if (V2)
20364 ShuffleBuilder.add(V2, CombinedMask2);
20365 return ShuffleBuilder.finalize({}, {}, {});
20366 };
20367
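// ResizeToVF brings Vec to the width of Mask when they differ: if the mask
// refers to lanes beyond its own width, the mask is applied directly with a
// combining shuffle and {Vec, true} is returned; otherwise (unless a single
// mask is requested) Vec is resized with an identity-style mask and
// {Vec, false} is returned.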
20368 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20369 bool ForSingleMask) {
20370 unsigned VF = Mask.size();
20371 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20372 if (VF != VecVF) {
20373 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20374 Vec = CreateShuffle(Vec, nullptr, Mask);
20375 return std::make_pair(Vec, true);
20376 }
20377 if (!ForSingleMask) {
20378 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20379 for (unsigned I = 0; I < VF; ++I) {
20380 if (Mask[I] != PoisonMaskElem)
20381 ResizeMask[Mask[I]] = Mask[I];
20382 }
20383 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20384 }
20385 }
20386
20387 return std::make_pair(Vec, false);
20388 };
20389 // Perform shuffling of the vectorize tree entries for better handling of
20390 // external extracts.
20391 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20392 // Find the first and the last instruction in the list of insertelements.
20393 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20394 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20395 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20396 Builder.SetInsertPoint(LastInsert);
20397 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20398 Value *NewInst = performExtractsShuffleAction<Value>(
20399 MutableArrayRef(Vector.data(), Vector.size()),
20400 FirstInsert->getOperand(0),
20401 [](Value *Vec) {
20402 return cast<VectorType>(Vec->getType())
20403 ->getElementCount()
20404 .getKnownMinValue();
20405 },
20406 ResizeToVF,
20407 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20408 ArrayRef<Value *> Vals) {
20409 assert((Vals.size() == 1 || Vals.size() == 2) &&
20410 "Expected exactly 1 or 2 input values.");
20411 if (Vals.size() == 1) {
20412 // Do not create shuffle if the mask is a simple identity
20413 // non-resizing mask.
20414 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20415 ->getNumElements() ||
20416 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20417 return CreateShuffle(Vals.front(), nullptr, Mask);
20418 return Vals.front();
20419 }
20420 return CreateShuffle(Vals.front() ? Vals.front()
20421 : FirstInsert->getOperand(0),
20422 Vals.back(), Mask);
20423 });
20424 auto It = ShuffledInserts[I].InsertElements.rbegin();
20425 // Rebuild buildvector chain.
20426 InsertElementInst *II = nullptr;
20427 if (It != ShuffledInserts[I].InsertElements.rend())
20428 II = *It;
20429 SmallVector<Instruction *> Inserts;
20430 while (It != ShuffledInserts[I].InsertElements.rend()) {
20431 assert(II && "Must be an insertelement instruction.");
20432 if (*It == II)
20433 ++It;
20434 else
20435 Inserts.push_back(cast<Instruction>(II));
20436 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20437 }
20438 for (Instruction *II : reverse(Inserts)) {
20439 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20440 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20441 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20442 II->moveAfter(NewI);
20443 NewInst = II;
20444 }
20445 LastInsert->replaceAllUsesWith(NewInst);
20446 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20447 IE->replaceUsesOfWith(IE->getOperand(0),
20448 PoisonValue::get(IE->getOperand(0)->getType()));
20449 IE->replaceUsesOfWith(IE->getOperand(1),
20450 PoisonValue::get(IE->getOperand(1)->getType()));
20451 eraseInstruction(IE);
20452 }
20453 CSEBlocks.insert(LastInsert->getParent());
20454 }
20455
20456 SmallVector<Instruction *> RemovedInsts;
20457 // For each vectorized value:
20458 for (auto &TEPtr : VectorizableTree) {
20459 TreeEntry *Entry = TEPtr.get();
20460
20461 // No need to handle users of gathered values.
20462 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20463 continue;
20464
20465 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20466
20467 // For each lane:
20468 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20469 Value *Scalar = Entry->Scalars[Lane];
20470
20471 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20472 !isa<GetElementPtrInst>(Scalar))
20473 continue;
20474 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20475 EE && IgnoredExtracts.contains(EE))
20476 continue;
20477 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20478 continue;
20479#ifndef NDEBUG
20480 Type *Ty = Scalar->getType();
20481 if (!Ty->isVoidTy()) {
20482 for (User *U : Scalar->users()) {
20483 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20484
20485 // It is legal to delete users in the ignorelist.
20486 assert((isVectorized(U) ||
20487 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20488 (isa_and_nonnull<Instruction>(U) &&
20489 isDeleted(cast<Instruction>(U)))) &&
20490 "Deleting out-of-tree value");
20491 }
20492 }
20493#endif
20494 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20495 auto *I = cast<Instruction>(Scalar);
20496 RemovedInsts.push_back(I);
20497 }
20498 }
20499
20500 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20501 // new vector instruction.
20502 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20503 V->mergeDIAssignID(RemovedInsts);
20504
20505 // Clear up reduction references, if any.
20506 if (UserIgnoreList) {
20507 for (Instruction *I : RemovedInsts) {
20508 const TreeEntry *IE = getTreeEntries(I).front();
20509 if (IE->Idx != 0 &&
20510 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20511 (ValueToGatherNodes.lookup(I).contains(
20512 VectorizableTree.front().get()) ||
20513 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20514 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20515 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20516 IE->UserTreeIndex &&
20517 is_contained(VectorizableTree.front()->Scalars, I)) &&
20518 !(GatheredLoadsEntriesFirst.has_value() &&
20519 IE->Idx >= *GatheredLoadsEntriesFirst &&
20520 VectorizableTree.front()->isGather() &&
20521 is_contained(VectorizableTree.front()->Scalars, I)))
20522 continue;
20523 SmallVector<SelectInst *> LogicalOpSelects;
20524 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20525 // Do not replace condition of the logical op in form select <cond>.
20526 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20527 (match(U.getUser(), m_LogicalAnd()) ||
20528 match(U.getUser(), m_LogicalOr())) &&
20529 U.getOperandNo() == 0;
20530 if (IsPoisoningLogicalOp) {
20531 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20532 return false;
20533 }
20534 return UserIgnoreList->contains(U.getUser());
20535 });
20536 // Replace conditions of the poisoning logical ops with the non-poison
20537 // constant value.
20538 for (SelectInst *SI : LogicalOpSelects)
20539 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20540 }
20541 }
20542 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20543 // cache correctness.
20544 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
20545 // - instructions are not deleted until later.
20546 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20547
20548 Builder.ClearInsertionPoint();
20549 InstrElementSize.clear();
20550
20551 const TreeEntry &RootTE = *VectorizableTree.front();
20552 Value *Vec = RootTE.VectorizedValue;
20553 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20554 It != MinBWs.end() &&
20555 ReductionBitWidth != It->second.first) {
20556 IRBuilder<>::InsertPointGuard Guard(Builder);
20557 Builder.SetInsertPoint(ReductionRoot->getParent(),
20558 ReductionRoot->getIterator());
20559 Vec = Builder.CreateIntCast(
20560 Vec,
20561 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20562 cast<VectorType>(Vec->getType())->getElementCount()),
20563 It->second.second);
20564 }
20565 return Vec;
20566}
20567
20569 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20570 << " gather sequences instructions.\n");
20571 // LICM InsertElementInst sequences.
20572 for (Instruction *I : GatherShuffleExtractSeq) {
20573 if (isDeleted(I))
20574 continue;
20575
20576 // Check if this block is inside a loop.
20577 Loop *L = LI->getLoopFor(I->getParent());
20578 if (!L)
20579 continue;
20580
20581 // Check if it has a preheader.
20582 BasicBlock *PreHeader = L->getLoopPreheader();
20583 if (!PreHeader)
20584 continue;
20585
20586 // If the vector or the element that we insert into it are
20587 // instructions that are defined in this basic block then we can't
20588 // hoist this instruction.
20589 if (any_of(I->operands(), [L](Value *V) {
20590 auto *OpI = dyn_cast<Instruction>(V);
20591 return OpI && L->contains(OpI);
20592 }))
20593 continue;
20594
20595 // We can hoist this instruction. Move it to the pre-header.
20596 I->moveBefore(PreHeader->getTerminator()->getIterator());
20597 CSEBlocks.insert(PreHeader);
20598 }
20599
20600 // Make a list of all reachable blocks in our CSE queue.
20601 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20602 CSEWorkList.reserve(CSEBlocks.size());
20603 for (BasicBlock *BB : CSEBlocks)
20604 if (DomTreeNode *N = DT->getNode(BB)) {
20605 assert(DT->isReachableFromEntry(N));
20606 CSEWorkList.push_back(N);
20607 }
20608
20609 // Sort blocks by domination. This ensures we visit a block after all blocks
20610 // dominating it are visited.
20611 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20612 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20613 "Different nodes should have different DFS numbers");
20614 return A->getDFSNumIn() < B->getDFSNumIn();
20615 });
20616
20617 // Less defined shuffles can be replaced by the more defined copies.
20618 // Between two shuffles one is less defined if it has the same vector operands
20619 // and its mask indices are the same as in the first one or are undefs. E.g.
20620 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20621 // poison, <0, 0, 0, 0>.
20622 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20623 Instruction *I2,
20624 SmallVectorImpl<int> &NewMask) {
20625 if (I1->getType() != I2->getType())
20626 return false;
20627 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20628 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20629 if (!SI1 || !SI2)
20630 return I1->isIdenticalTo(I2);
20631 if (SI1->isIdenticalTo(SI2))
20632 return true;
20633 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20634 if (SI1->getOperand(I) != SI2->getOperand(I))
20635 return false;
20636 // Check if the second instruction is more defined than the first one.
20637 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20638 ArrayRef<int> SM1 = SI1->getShuffleMask();
20639 // Count trailing undefs in the mask to check the final number of used
20640 // registers.
20641 unsigned LastUndefsCnt = 0;
20642 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20643 if (SM1[I] == PoisonMaskElem)
20644 ++LastUndefsCnt;
20645 else
20646 LastUndefsCnt = 0;
20647 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20648 NewMask[I] != SM1[I])
20649 return false;
20650 if (NewMask[I] == PoisonMaskElem)
20651 NewMask[I] = SM1[I];
20652 }
20653 // Check if the last undefs actually change the final number of used vector
20654 // registers.
20655 return SM1.size() - LastUndefsCnt > 1 &&
20656 ::getNumberOfParts(*TTI, SI1->getType()) ==
20657 ::getNumberOfParts(
20658 *TTI, getWidenedType(SI1->getType()->getElementType(),
20659 SM1.size() - LastUndefsCnt));
20660 };
20661 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20662 // instructions. TODO: We can further optimize this scan if we split the
20663 // instructions into different buckets based on the insert lane.
20664 SmallVector<Instruction *, 16> Visited;
20665 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20666 assert(*I &&
20667 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20668 "Worklist not sorted properly!");
20669 BasicBlock *BB = (*I)->getBlock();
20670 // For all instructions in blocks containing gather sequences:
20671 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20672 if (isDeleted(&In))
20673 continue;
20674 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20675 !GatherShuffleExtractSeq.contains(&In))
20676 continue;
20677
20678 // Check if we can replace this instruction with any of the
20679 // visited instructions.
20680 bool Replaced = false;
20681 for (Instruction *&V : Visited) {
20682 SmallVector<int> NewMask;
20683 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20684 DT->dominates(V->getParent(), In.getParent())) {
20685 In.replaceAllUsesWith(V);
20686 eraseInstruction(&In);
20687 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20688 if (!NewMask.empty())
20689 SI->setShuffleMask(NewMask);
20690 Replaced = true;
20691 break;
20692 }
20693 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20694 GatherShuffleExtractSeq.contains(V) &&
20695 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20696 DT->dominates(In.getParent(), V->getParent())) {
20697 In.moveAfter(V);
20698 V->replaceAllUsesWith(&In);
20699 eraseInstruction(V);
20700 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20701 if (!NewMask.empty())
20702 SI->setShuffleMask(NewMask);
20703 V = &In;
20704 Replaced = true;
20705 break;
20706 }
20707 }
20708 if (!Replaced) {
20709 assert(!is_contained(Visited, &In));
20710 Visited.push_back(&In);
20711 }
20712 }
20713 }
20714 CSEBlocks.clear();
20715 GatherShuffleExtractSeq.clear();
20716}
20717
20718BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20719 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20720 auto &BundlePtr =
20721 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20722 for (Value *V : VL) {
20723 if (S.isNonSchedulable(V))
20724 continue;
20725 auto *I = cast<Instruction>(V);
20726 if (S.isCopyableElement(V)) {
20727 // Add a copyable element model.
20728 ScheduleCopyableData &SD =
20729 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20730 // Group the instructions to a bundle.
20731 BundlePtr->add(&SD);
20732 continue;
20733 }
20734 ScheduleData *BundleMember = getScheduleData(V);
20735 assert(BundleMember && "no ScheduleData for bundle member "
20736 "(maybe not in same basic block)");
20737 // Group the instructions to a bundle.
20738 BundlePtr->add(BundleMember);
20739 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20740 BundlePtr.get());
20741 }
20742 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20743 return *BundlePtr;
20744}
20745
20746 // Groups the instructions into a bundle (which is then a single scheduling
20747 // entity) and schedules instructions until the bundle gets ready.
20748std::optional<BoUpSLP::ScheduleBundle *>
20749BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20750 const InstructionsState &S,
20751 const EdgeInfo &EI) {
20752 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20753 // instructions.
20754 bool HasCopyables = S.areInstructionsWithCopyableElements();
20755 if (isa<PHINode>(S.getMainOp()) ||
20756 isVectorLikeInstWithConstOps(S.getMainOp()) ||
20757 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
20758 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
20759 return nullptr;
20760
20761 // Initialize the instruction bundle.
20762 Instruction *OldScheduleEnd = ScheduleEnd;
20763 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
20764
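// TryScheduleBundleImpl recalculates dependencies for the (extended) scheduling
// region and the new bundle, resets the schedule if required, and then keeps
// scheduling ready instructions until the bundle itself becomes ready (or the
// ready list is exhausted), which proves there are no cyclic dependencies.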
20765 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
20766 // Clear deps or recalculate the region, if the memory instruction is a
20767 // copyable element. It may have memory deps, which must be recalculated.
20768 SmallVector<ScheduleData *> ControlDependentMembers;
20769 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20770 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
20771 for (ScheduleEntity *SE : Bundle.getBundle()) {
20772 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
20773 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20774 BundleMember && BundleMember->hasValidDependencies()) {
20775 BundleMember->clearDirectDependencies();
20776 if (RegionHasStackSave ||
20777 !isGuaranteedToTransferExecutionToSuccessor(
20778 BundleMember->getInst()))
20779 ControlDependentMembers.push_back(BundleMember);
20780 }
20781 continue;
20782 }
20783 auto *SD = cast<ScheduleData>(SE);
20784 for (const Use &U : SD->getInst()->operands()) {
20785 unsigned &NumOps =
20786 UserOpToNumOps
20787 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
20788 .first->getSecond();
20789 ++NumOps;
20790 if (auto *Op = dyn_cast<Instruction>(U.get());
20791 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
20792 *SLP, NumOps)) {
20793 if (ScheduleData *OpSD = getScheduleData(Op)) {
20794 OpSD->clearDirectDependencies();
20795 if (RegionHasStackSave ||
20796 !isGuaranteedToTransferExecutionToSuccessor(Op))
20797 ControlDependentMembers.push_back(OpSD);
20798 }
20799 }
20800 }
20801 }
20802 };
20803 // The scheduling region got new instructions at the lower end (or it is a
20804 // new region for the first bundle). This makes it necessary to
20805 // recalculate all dependencies.
20806 // It is seldom that this needs to be done a second time after adding the
20807 // initial bundle to the region.
20808 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
20809 for_each(ScheduleDataMap, [&](auto &P) {
20810 if (BB != P.first->getParent())
20811 return;
20812 ScheduleData *SD = P.second;
20813 if (isInSchedulingRegion(*SD))
20814 SD->clearDependencies();
20815 });
20816 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
20817 for_each(P.second, [&](ScheduleCopyableData *SD) {
20818 if (isInSchedulingRegion(*SD))
20819 SD->clearDependencies();
20820 });
20821 });
20822 ReSchedule = true;
20823 }
20824 // Check if the bundle data already has dependencies for copyable elements.
20825 // In that case we need to reset the dependencies and recalculate them.
20826 if (Bundle && !Bundle.getBundle().empty()) {
20827 if (S.areInstructionsWithCopyableElements() ||
20828 !ScheduleCopyableDataMap.empty())
20829 CheckIfNeedToClearDeps(Bundle);
20830 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
20831 << BB->getName() << "\n");
20832 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
20833 ControlDependentMembers);
20834 } else if (!ControlDependentMembers.empty()) {
20835 ScheduleBundle Invalid = ScheduleBundle::invalid();
20836 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
20837 ControlDependentMembers);
20838 }
20839
20840 if (ReSchedule) {
20841 resetSchedule();
20842 initialFillReadyList(ReadyInsts);
20843 }
20844
20845 // Now try to schedule the new bundle or (if no bundle) just calculate
20846 // dependencies. As soon as the bundle is "ready" it means that there are no
20847 // cyclic dependencies and we can schedule it. Note that it is important that
20848 // we don't actually "schedule" the bundle yet.
20849 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
20850 !ReadyInsts.empty()) {
20851 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
20852 assert(Picked->isReady() && "must be ready to schedule");
20853 schedule(*SLP, S, EI, Picked, ReadyInsts);
20854 if (Picked == &Bundle)
20855 break;
20856 }
20857 };
20858
20859 // Make sure that the scheduling region contains all
20860 // instructions of the bundle.
20861 for (Value *V : VL) {
20862 if (S.isNonSchedulable(V))
20863 continue;
20864 if (!extendSchedulingRegion(V, S)) {
20865 // If the scheduling region got new instructions at the lower end (or it
20866 // is a new region for the first bundle), all dependencies must be
20867 // recalculated.
20868 // Otherwise the compiler may crash trying to calculate dependencies
20869 // incorrectly and emit instructions in the wrong order during the actual
20870 // scheduling.
20871 ScheduleBundle Invalid = ScheduleBundle::invalid();
20872 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
20873 return std::nullopt;
20874 }
20875 }
20876
20877 bool ReSchedule = false;
20878 for (Value *V : VL) {
20879 if (S.isNonSchedulable(V))
20880 continue;
20881 SmallVector<ScheduleCopyableData *> CopyableData =
20882 getScheduleCopyableData(cast<Instruction>(V));
20883 if (!CopyableData.empty()) {
20884 for (ScheduleCopyableData *SD : CopyableData)
20885 ReadyInsts.remove(SD);
20886 }
20887 ScheduleData *BundleMember = getScheduleData(V);
20888 assert((BundleMember || S.isCopyableElement(V)) &&
20889 "no ScheduleData for bundle member (maybe not in same basic block)");
20890 if (!BundleMember)
20891 continue;
20892
20893 // Make sure we don't leave the pieces of the bundle in the ready list when
20894 // the whole bundle might not be ready.
20895 ReadyInsts.remove(BundleMember);
20896 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
20897 !Bundles.empty()) {
20898 for (ScheduleBundle *B : Bundles)
20899 ReadyInsts.remove(B);
20900 }
20901
20902 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
20903 continue;
20904 // A bundle member was scheduled as a single instruction before and now
20905 // needs to be scheduled as part of the bundle. We just get rid of the
20906 // existing schedule.
20907 // Alternatively, a bundle member had its deps calculated before it became
20908 // a copyable element - we need to reschedule.
20909 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
20910 << " was already scheduled\n");
20911 ReSchedule = true;
20912 }
20913
20914 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
20915 TryScheduleBundleImpl(ReSchedule, Bundle);
20916 if (!Bundle.isReady()) {
20917 for (ScheduleEntity *BD : Bundle.getBundle()) {
20918 // Copyable data scheduling is just removed.
20919 if (isa<ScheduleCopyableData>(BD))
20920 continue;
20921 if (BD->isReady()) {
20922 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
20923 if (Bundles.empty()) {
20924 ReadyInsts.insert(BD);
20925 continue;
20926 }
20927 for (ScheduleBundle *B : Bundles)
20928 if (B->isReady())
20929 ReadyInsts.insert(B);
20930 }
20931 }
20932 ScheduledBundlesList.pop_back();
20933 SmallVector<ScheduleData *> ControlDependentMembers;
20935 for (Value *V : VL) {
20936 if (S.isNonSchedulable(V))
20937 continue;
20938 auto *I = cast<Instruction>(V);
20939 if (S.isCopyableElement(I)) {
20940 // Remove the copyable data from the scheduling region and restore
20941 // previous mappings.
20942 auto KV = std::make_pair(EI, I);
20943 assert(ScheduleCopyableDataMap.contains(KV) &&
20944 "no ScheduleCopyableData for copyable element");
20945 ScheduleCopyableData *SD =
20946 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
20947 ScheduleCopyableDataMapByUsers[I].remove(SD);
20948 if (EI.UserTE) {
20949 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
20950 const auto *It = find(Op, I);
20951 assert(It != Op.end() && "Lane not set");
20952 SmallPtrSet<Instruction *, 4> Visited;
20953 do {
20954 int Lane = std::distance(Op.begin(), It);
20955 assert(Lane >= 0 && "Lane not set");
20956 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
20957 !EI.UserTE->ReorderIndices.empty())
20958 Lane = EI.UserTE->ReorderIndices[Lane];
20959 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
20960 "Couldn't find extract lane");
20961 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
20962 if (!Visited.insert(In).second) {
20963 It = find(make_range(std::next(It), Op.end()), I);
20964 break;
20965 }
20966 ScheduleCopyableDataMapByInstUser
20967 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
20968 .pop_back();
20969 It = find(make_range(std::next(It), Op.end()), I);
20970 } while (It != Op.end());
20971 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
20972 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
20973 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
20974 }
20975 if (ScheduleCopyableDataMapByUsers[I].empty())
20976 ScheduleCopyableDataMapByUsers.erase(I);
20977 ScheduleCopyableDataMap.erase(KV);
20978 // Need to recalculate dependencies for the actual schedule data.
20979 if (ScheduleData *OpSD = getScheduleData(I)) {
20980 OpSD->clearDirectDependencies();
20981 if (RegionHasStackSave ||
20982 !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
20983 ControlDependentMembers.push_back(OpSD);
20984 }
20985 continue;
20986 }
20987 ScheduledBundles.find(I)->getSecond().pop_back();
20988 }
20989 if (!ControlDependentMembers.empty()) {
20990 ScheduleBundle Invalid = ScheduleBundle::invalid();
20991 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
20992 ControlDependentMembers);
20993 }
20994 return std::nullopt;
20995 }
20996 return &Bundle;
20997}
20998
20999BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21000 // Allocate a new ScheduleData for the instruction.
21001 if (ChunkPos >= ChunkSize) {
21002 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21003 ChunkPos = 0;
21004 }
21005 return &(ScheduleDataChunks.back()[ChunkPos++]);
21006}
21007
21008bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21009 Value *V, const InstructionsState &S) {
21010 Instruction *I = dyn_cast<Instruction>(V);
21011 assert(I && "bundle member must be an instruction");
21012 if (getScheduleData(I))
21013 return true;
21014 if (!ScheduleStart) {
21015 // It's the first instruction in the new region.
21016 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21017 ScheduleStart = I;
21018 ScheduleEnd = I->getNextNode();
21019 assert(ScheduleEnd && "tried to vectorize a terminator?");
21020 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21021 return true;
21022 }
21023 // Search up and down at the same time, because we don't know if the new
21024 // instruction is above or below the existing scheduling region.
21025 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are not
21026 // counted against the budget. Otherwise debug info could affect codegen.
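// For example, with the current region [ScheduleStart, ScheduleEnd): if the
// upward scan reaches I (or the downward scan falls off the end of the block),
// I lies above the region and ScheduleStart is moved up to I; otherwise the
// downward scan reaches I and ScheduleEnd is moved down past I. The walk gives
// up once ScheduleRegionSizeLimit is exceeded.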
21027 BasicBlock::reverse_iterator UpIter =
21028 ++ScheduleStart->getIterator().getReverse();
21029 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21030 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21031 BasicBlock::iterator LowerEnd = BB->end();
21032 auto IsAssumeLikeIntr = [](const Instruction &I) {
21033 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21034 return II->isAssumeLikeIntrinsic();
21035 return false;
21036 };
21037 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21038 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21039 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21040 &*DownIter != I) {
21041 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21042 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21043 return false;
21044 }
21045
21046 ++UpIter;
21047 ++DownIter;
21048
21049 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21050 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21051 }
21052 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21053 assert(I->getParent() == ScheduleStart->getParent() &&
21054 "Instruction is in wrong basic block.");
21055 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21056 ScheduleStart = I;
21057 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21058 << "\n");
21059 return true;
21060 }
21061 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21062 "Expected to reach top of the basic block or instruction down the "
21063 "lower end.");
21064 assert(I->getParent() == ScheduleEnd->getParent() &&
21065 "Instruction is in wrong basic block.");
21066 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21067 nullptr);
21068 ScheduleEnd = I->getNextNode();
21069 assert(ScheduleEnd && "tried to vectorize a terminator?");
21070 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21071 return true;
21072}
21073
21074void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21075 Instruction *ToI,
21076 ScheduleData *PrevLoadStore,
21077 ScheduleData *NextLoadStore) {
21078 ScheduleData *CurrentLoadStore = PrevLoadStore;
21079 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21080 // No need to allocate data for non-schedulable instructions.
21081 if (isa<PHINode>(I))
21082 continue;
21083 ScheduleData *SD = ScheduleDataMap.lookup(I);
21084 if (!SD) {
21085 SD = allocateScheduleDataChunks();
21086 ScheduleDataMap[I] = SD;
21087 }
21088 assert(!isInSchedulingRegion(*SD) &&
21089 "new ScheduleData already in scheduling region");
21090 SD->init(SchedulingRegionID, I);
21091
21092 if (I->mayReadOrWriteMemory() &&
21093 (!isa<IntrinsicInst>(I) ||
21094 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21095 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21096 Intrinsic::pseudoprobe))) {
21097 // Update the linked list of memory accessing instructions.
21098 if (CurrentLoadStore) {
21099 CurrentLoadStore->setNextLoadStore(SD);
21100 } else {
21101 FirstLoadStoreInRegion = SD;
21102 }
21103 CurrentLoadStore = SD;
21104 }
21105
21106 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21107 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21108 RegionHasStackSave = true;
21109 }
21110 if (NextLoadStore) {
21111 if (CurrentLoadStore)
21112 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21113 } else {
21114 LastLoadStoreInRegion = CurrentLoadStore;
21115 }
21116}
21117
21118void BoUpSLP::BlockScheduling::calculateDependencies(
21119 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21120 ArrayRef<ScheduleData *> ControlDeps) {
21121 SmallVector<ScheduleEntity *> WorkList;
21122 auto ProcessNode = [&](ScheduleEntity *SE) {
21123 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21124 if (CD->hasValidDependencies())
21125 return;
21126 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21127 CD->initDependencies();
21128 CD->resetUnscheduledDeps();
21129 const EdgeInfo &EI = CD->getEdgeInfo();
21130 if (EI.UserTE) {
21131 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21132 const auto *It = find(Op, CD->getInst());
21133 assert(It != Op.end() && "Lane not set");
21134 SmallPtrSet<Instruction *, 4> Visited;
21135 do {
21136 int Lane = std::distance(Op.begin(), It);
21137 assert(Lane >= 0 && "Lane not set");
21138 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21139 !EI.UserTE->ReorderIndices.empty())
21140 Lane = EI.UserTE->ReorderIndices[Lane];
21141 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21142 "Couldn't find extract lane");
21143 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21144 if (EI.UserTE->isCopyableElement(In)) {
21145 // We may not have related copyable scheduling data, if the
21146 // instruction is non-schedulable.
21147 if (ScheduleCopyableData *UseSD =
21148 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21149 CD->incDependencies();
21150 if (!UseSD->isScheduled())
21151 CD->incrementUnscheduledDeps(1);
21152 if (!UseSD->hasValidDependencies() ||
21153 (InsertInReadyList && UseSD->isReady()))
21154 WorkList.push_back(UseSD);
21155 }
21156 } else if (Visited.insert(In).second) {
21157 if (ScheduleData *UseSD = getScheduleData(In)) {
21158 CD->incDependencies();
21159 if (!UseSD->isScheduled())
21160 CD->incrementUnscheduledDeps(1);
21161 if (!UseSD->hasValidDependencies() ||
21162 (InsertInReadyList && UseSD->isReady()))
21163 WorkList.push_back(UseSD);
21164 }
21165 }
21166 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21167 } while (It != Op.end());
21168 if (CD->isReady() && CD->getDependencies() == 0 &&
21169 (EI.UserTE->hasState() &&
21170 (EI.UserTE->getMainOp()->getParent() !=
21171 CD->getInst()->getParent() ||
21172 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21173 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21174 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21175 auto *IU = dyn_cast<Instruction>(U);
21176 if (!IU)
21177 return true;
21178 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21179 })))))) {
21180 // If there are no uses in the block, mark the node as having a pseudo-use,
21181 // which cannot be scheduled.
21182 // This prevents incorrect def-use tracking between an external user and the
21183 // actual instruction.
21184 CD->incDependencies();
21185 CD->incrementUnscheduledDeps(1);
21186 }
21187 }
21188 return;
21189 }
21190 auto *BundleMember = cast<ScheduleData>(SE);
21191 if (BundleMember->hasValidDependencies())
21192 return;
21193 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21194 BundleMember->initDependencies();
21195 BundleMember->resetUnscheduledDeps();
21196 // Handle def-use chain dependencies.
21197 SmallDenseMap<User *, unsigned> UserToNumOps;
21198 for (User *U : BundleMember->getInst()->users()) {
21199 if (isa<PHINode>(U))
21200 continue;
21201 if (ScheduleData *UseSD = getScheduleData(U)) {
21202 // The operand is a copyable element - skip.
21203 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21204 ++NumOps;
21205 if (areAllOperandsReplacedByCopyableData(
21206 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21207 continue;
21208 BundleMember->incDependencies();
21209 if (!UseSD->isScheduled())
21210 BundleMember->incrementUnscheduledDeps(1);
21211 if (!UseSD->hasValidDependencies() ||
21212 (InsertInReadyList && UseSD->isReady()))
21213 WorkList.push_back(UseSD);
21214 }
21215 }
21216 for (ScheduleCopyableData *UseSD :
21217 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21218 BundleMember->incDependencies();
21219 if (!UseSD->isScheduled())
21220 BundleMember->incrementUnscheduledDeps(1);
21221 if (!UseSD->hasValidDependencies() ||
21222 (InsertInReadyList && UseSD->isReady()))
21223 WorkList.push_back(UseSD);
21224 }
21225
21226 SmallPtrSet<Instruction *, 4> Visited;
21227 auto MakeControlDependent = [&](Instruction *I) {
21228 // Do not mark control dependent twice.
21229 if (!Visited.insert(I).second)
21230 return;
21231 auto *DepDest = getScheduleData(I);
21232 assert(DepDest && "must be in schedule window");
21233 DepDest->addControlDependency(BundleMember);
21234 BundleMember->incDependencies();
21235 if (!DepDest->isScheduled())
21236 BundleMember->incrementUnscheduledDeps(1);
21237 if (!DepDest->hasValidDependencies() ||
21238 (InsertInReadyList && DepDest->isReady()))
21239 WorkList.push_back(DepDest);
21240 };
21241
21242 // Any instruction which isn't safe to speculate at the beginning of the
21243 // block is control dependent on any early exit or non-willreturn call
21244 // which precedes it.
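// For example, a load that follows a call which may throw or never return must
// not be hoisted above that call, so it is made control dependent on it below.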
21245 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21246 for (Instruction *I = BundleMember->getInst()->getNextNode();
21247 I != ScheduleEnd; I = I->getNextNode()) {
21248 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21249 continue;
21250
21251 // Add the dependency
21252 MakeControlDependent(I);
21253
21254 if (!isGuaranteedToTransferExecutionToSuccessor(I))
21255 // Everything past here must be control dependent on I.
21256 break;
21257 }
21258 }
21259
21260 if (RegionHasStackSave) {
21261 // If we have an inalloca alloca instruction, it needs to be scheduled
21262 // after any preceding stacksave. We also need to prevent any alloca
21263 // from reordering above a preceding stackrestore.
21264 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21265 match(BundleMember->getInst(),
21266 m_Intrinsic<Intrinsic::stackrestore>())) {
21267 for (Instruction *I = BundleMember->getInst()->getNextNode();
21268 I != ScheduleEnd; I = I->getNextNode()) {
21269 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21270 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21271 // Any allocas past here must be control dependent on I, and I
21272 // must be memory dependent on BundleMember->Inst.
21273 break;
21274
21275 if (!isa<AllocaInst>(I))
21276 continue;
21277
21278 // Add the dependency
21279 MakeControlDependent(I);
21280 }
21281 }
21282
21283 // In addition to the cases handled just above, we need to prevent
21284 // allocas and loads/stores from moving below a stacksave or a
21285 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21286 // believed to be merely conservative. Moving loads/stores below a
21287 // stackrestore, however, can lead to incorrect code.
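// For example, a load or store of an alloca created after a stacksave must not
// sink below the matching stackrestore, since the restore may deallocate that
// stack memory.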
21288 if (isa<AllocaInst>(BundleMember->getInst()) ||
21289 BundleMember->getInst()->mayReadOrWriteMemory()) {
21290 for (Instruction *I = BundleMember->getInst()->getNextNode();
21291 I != ScheduleEnd; I = I->getNextNode()) {
21292 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21293 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21294 continue;
21295
21296 // Add the dependency
21297 MakeControlDependent(I);
21298 break;
21299 }
21300 }
21301 }
21302
21303 // Handle the memory dependencies (if any).
21304 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21305 if (!NextLoadStore)
21306 return;
21307 Instruction *SrcInst = BundleMember->getInst();
21308 assert(SrcInst->mayReadOrWriteMemory() &&
21309 "NextLoadStore list for non memory effecting bundle?");
21310 MemoryLocation SrcLoc = getLocation(SrcInst);
21311 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21312 unsigned NumAliased = 0;
21313 unsigned DistToSrc = 1;
21314 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21315
21316 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21317 DepDest = DepDest->getNextLoadStore()) {
21318 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21319
21320 // We have two limits to reduce the complexity:
21321 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21322 // SLP->isAliased (which is the expensive part in this loop).
21323 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21324 // the whole loop (even if the loop is fast, it's quadratic).
21325 // It's important for the loop break condition (see below) to
21326 // check this limit even between two read-only instructions.
21327 if (DistToSrc >= MaxMemDepDistance ||
21328 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21329 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21330 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21331
21332 // We increment the counter only if the locations are aliased
21333 // (instead of counting all alias checks). This gives a better
21334 // balance between reduced runtime and accurate dependencies.
21335 NumAliased++;
21336
21337 DepDest->addMemoryDependency(BundleMember);
21338 BundleMember->incDependencies();
21339 if (!DepDest->isScheduled())
21340 BundleMember->incrementUnscheduledDeps(1);
21341 if (!DepDest->hasValidDependencies() ||
21342 (InsertInReadyList && DepDest->isReady()))
21343 WorkList.push_back(DepDest);
21344 }
21345
21346 // Example, explaining the loop break condition: Let's assume our
21347 // starting instruction is i0 and MaxMemDepDistance = 3.
21348 //
21349 // +--------v--v--v
21350 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21351 // +--------^--^--^
21352 //
21353 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21354 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21355 // Previously we already added dependencies from i3 to i6,i7,i8
21356 // (because of MaxMemDepDistance). As we added a dependency from
21357 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21358 // and we can abort this loop at i6.
21359 if (DistToSrc >= 2 * MaxMemDepDistance)
21360 break;
21361 DistToSrc++;
21362 }
21363 };
21364
21365 assert((Bundle || !ControlDeps.empty()) &&
21366 "expected at least one instruction to schedule");
21367 if (Bundle)
21368 WorkList.push_back(Bundle.getBundle().front());
21369 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21370 SmallPtrSet<ScheduleBundle *, 16> Visited;
21371 while (!WorkList.empty()) {
21372 ScheduleEntity *SD = WorkList.pop_back_val();
21373 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21374 ArrayRef<ScheduleBundle *> Bundles;
21375 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21376 CopyableBundle.push_back(&CD->getBundle());
21377 Bundles = CopyableBundle;
21378 } else {
21379 Bundles = getScheduleBundles(SD->getInst());
21380 }
21381 if (Bundles.empty()) {
21382 if (!SD->hasValidDependencies())
21383 ProcessNode(SD);
21384 if (InsertInReadyList && SD->isReady()) {
21385 ReadyInsts.insert(SD);
21386 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21387 }
21388 continue;
21389 }
21390 for (ScheduleBundle *Bundle : Bundles) {
21391 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21392 continue;
21393 assert(isInSchedulingRegion(*Bundle) &&
21394 "ScheduleData not in scheduling region");
21395 for_each(Bundle->getBundle(), ProcessNode);
21396 }
21397 if (InsertInReadyList && SD->isReady()) {
21398 for (ScheduleBundle *Bundle : Bundles) {
21399 assert(isInSchedulingRegion(*Bundle) &&
21400 "ScheduleData not in scheduling region");
21401 if (!Bundle->isReady())
21402 continue;
21403 ReadyInsts.insert(Bundle);
21404 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21405 << "\n");
21406 }
21407 }
21408 }
21409}
21410
21411void BoUpSLP::BlockScheduling::resetSchedule() {
21412 assert(ScheduleStart &&
21413 "tried to reset schedule on block which has not been scheduled");
21414 for_each(ScheduleDataMap, [&](auto &P) {
21415 if (BB != P.first->getParent())
21416 return;
21417 ScheduleData *SD = P.second;
21418 if (isInSchedulingRegion(*SD)) {
21419 SD->setScheduled(/*Scheduled=*/false);
21420 SD->resetUnscheduledDeps();
21421 }
21422 });
21423 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21424 for_each(P.second, [&](ScheduleCopyableData *SD) {
21425 if (isInSchedulingRegion(*SD)) {
21426 SD->setScheduled(/*Scheduled=*/false);
21427 SD->resetUnscheduledDeps();
21428 }
21429 });
21430 });
21431 for_each(ScheduledBundles, [&](auto &P) {
21432 for_each(P.second, [&](ScheduleBundle *Bundle) {
21433 if (isInSchedulingRegion(*Bundle))
21434 Bundle->setScheduled(/*Scheduled=*/false);
21435 });
21436 });
21437 // Reset schedule data for copyable elements.
21438 for (auto &P : ScheduleCopyableDataMap) {
21439 if (isInSchedulingRegion(*P.second)) {
21440 P.second->setScheduled(/*Scheduled=*/false);
21441 P.second->resetUnscheduledDeps();
21442 }
21443 }
21444 ReadyInsts.clear();
21445}
21446
21447void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21448 if (!BS->ScheduleStart)
21449 return;
21450
21451 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21452
21453 // A key point - if we got here, pre-scheduling was able to find a valid
21454 // scheduling of the sub-graph of the scheduling window which consists
21455 // of all vector bundles and their transitive users. As such, we do not
21456 // need to reschedule anything *outside of* that subgraph.
21457
21458 BS->resetSchedule();
21459
21460 // For the real scheduling we use a more sophisticated ready-list: it is
21461 // sorted by the original instruction location. This lets the final schedule
21462 // be as close as possible to the original instruction order.
21463 // WARNING: If changing this order causes a correctness issue, that means
21464 // there is some missing dependence edge in the schedule data graph.
21465 struct ScheduleDataCompare {
21466 bool operator()(const ScheduleEntity *SD1,
21467 const ScheduleEntity *SD2) const {
21468 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21469 }
21470 };
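// With this comparator, ReadyInsts.begin() yields the ready entity with the
// highest scheduling priority, i.e. the one located lowest in the original
// instruction order, which matches the bottom-up placement before
// LastScheduledInst in the loop below.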
21471 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21472
21473 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21474 // and fill the ready-list with initial instructions.
21475 int Idx = 0;
21476 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21477 I = I->getNextNode()) {
21478 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21479 if (!Bundles.empty()) {
21480 for (ScheduleBundle *Bundle : Bundles) {
21481 Bundle->setSchedulingPriority(Idx++);
21482 if (!Bundle->hasValidDependencies())
21483 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21484 }
21485 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21486 for (ScheduleCopyableData *SD : reverse(SDs)) {
21487 ScheduleBundle &Bundle = SD->getBundle();
21488 Bundle.setSchedulingPriority(Idx++);
21489 if (!Bundle.hasValidDependencies())
21490 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21491 }
21492 continue;
21493 }
21495 BS->getScheduleCopyableDataUsers(I);
21496 if (ScheduleData *SD = BS->getScheduleData(I)) {
21497 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21498 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21499 SDTEs.front()->doesNotNeedToSchedule() ||
21501 "scheduler and vectorizer bundle mismatch");
21502 SD->setSchedulingPriority(Idx++);
21503 if (!SD->hasValidDependencies() &&
21504 (!CopyableData.empty() ||
21505 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21506 assert(TE->isGather() && "expected gather node");
21507 return TE->hasState() && TE->hasCopyableElements() &&
21508 TE->isCopyableElement(I);
21509 }))) {
21510 // Need to calculate deps for these nodes to correctly handle copyable
21511 // dependencies, even if they were cancelled.
21512 // If the copyable bundle was cancelled, the deps were cleared and need to
21513 // be recalculated.
21514 ScheduleBundle Bundle;
21515 Bundle.add(SD);
21516 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21517 }
21518 }
21519 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21520 ScheduleBundle &Bundle = SD->getBundle();
21521 Bundle.setSchedulingPriority(Idx++);
21522 if (!Bundle.hasValidDependencies())
21523 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21524 }
21525 }
21526 BS->initialFillReadyList(ReadyInsts);
21527
21528 Instruction *LastScheduledInst = BS->ScheduleEnd;
21529
21530 // Do the "real" scheduling.
21531 SmallPtrSet<Instruction *, 16> Scheduled;
21532 while (!ReadyInsts.empty()) {
21533 auto *Picked = *ReadyInsts.begin();
21534 ReadyInsts.erase(ReadyInsts.begin());
21535
21536 // Move the scheduled instruction(s) to their dedicated places, if not
21537 // there yet.
21538 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21539 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21540 Instruction *PickedInst = BundleMember->getInst();
21541 // If a copyable must be scheduled as part of something else, skip it.
21542 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21543 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21544 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21545 continue;
21546 if (PickedInst->getNextNode() != LastScheduledInst)
21547 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21548 LastScheduledInst = PickedInst;
21549 }
21550 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21551 LastScheduledInst);
21552 } else {
21553 auto *SD = cast<ScheduleData>(Picked);
21554 Instruction *PickedInst = SD->getInst();
21555 if (PickedInst->getNextNode() != LastScheduledInst)
21556 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21557 LastScheduledInst = PickedInst;
21558 }
21559 auto Invalid = InstructionsState::invalid();
21560 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21561 }
21562
21563 // Check that we didn't break any of our invariants.
21564#ifdef EXPENSIVE_CHECKS
21565 BS->verify();
21566#endif
21567
21568#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21569 // Check that all schedulable entities got scheduled
21570 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21571 I = I->getNextNode()) {
21572 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21573 assert(all_of(Bundles,
21574 [](const ScheduleBundle *Bundle) {
21575 return Bundle->isScheduled();
21576 }) &&
21577 "must be scheduled at this point");
21578 }
21579#endif
21580
21581 // Avoid duplicate scheduling of the block.
21582 BS->ScheduleStart = nullptr;
21583}
21584
21585 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21586 // If V is a store, just return the width of the stored value (or value
21587 // truncated just before storing) without traversing the expression tree.
21588 // This is the common case.
21589 if (auto *Store = dyn_cast<StoreInst>(V))
21590 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21591
21592 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21593 return getVectorElementSize(IEI->getOperand(1));
21594
21595 auto E = InstrElementSize.find(V);
21596 if (E != InstrElementSize.end())
21597 return E->second;
21598
21599 // If V is not a store, we can traverse the expression tree to find loads
21600 // that feed it. The type of the loaded value may indicate a more suitable
21601 // width than V's type. We want to base the vector element size on the width
21602 // of memory operations where possible.
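// For example, if V is an i32 add fed (through a sext) by an i16 load, the
// returned element size is 16 rather than 32, since the memory width is the
// better basis for the vector element size.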
21603 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21604 SmallPtrSet<Instruction *, 16> Visited;
21605 if (auto *I = dyn_cast<Instruction>(V)) {
21606 Worklist.emplace_back(I, I->getParent(), 0);
21607 Visited.insert(I);
21608 }
21609
21610 // Traverse the expression tree in bottom-up order looking for loads. If we
21611 // encounter an instruction we don't yet handle, we give up.
21612 auto Width = 0u;
21613 Value *FirstNonBool = nullptr;
21614 while (!Worklist.empty()) {
21615 auto [I, Parent, Level] = Worklist.pop_back_val();
21616
21617 // We should only be looking at scalar instructions here. If the current
21618 // instruction has a vector type, skip.
21619 auto *Ty = I->getType();
21620 if (isa<VectorType>(Ty))
21621 continue;
21622 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21623 FirstNonBool = I;
21624 if (Level > RecursionMaxDepth)
21625 continue;
21626
21627 // If the current instruction is a load (or extractelement/extractvalue),
21628 // update Width to reflect the width of the loaded or extracted value.
21629 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
21630 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21631
21632 // Otherwise, we need to visit the operands of the instruction. We only
21633 // handle the interesting cases from buildTree here. If an operand is an
21634 // instruction we haven't yet visited and from the same basic block as the
21635 // user or the use is a PHI node, we add it to the worklist.
21636 if (isa<CastInst, PHINode, GetElementPtrInst, CmpInst, SelectInst,
21637 BinaryOperator, UnaryOperator>(I)) {
21638 for (Use &U : I->operands()) {
21639 if (auto *J = dyn_cast<Instruction>(U.get()))
21640 if (Visited.insert(J).second &&
21641 (isa<PHINode>(I) || J->getParent() == Parent)) {
21642 Worklist.emplace_back(J, J->getParent(), Level + 1);
21643 continue;
21644 }
21645 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21646 FirstNonBool = U.get();
21647 }
21648 } else {
21649 break;
21650 }
21651 }
21652
21653 // If we didn't encounter a memory access in the expression tree, or if we
21654 // gave up for some reason, just return the width of V. Otherwise, return the
21655 // maximum width we found.
21656 if (!Width) {
21657 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21658 V = FirstNonBool;
21659 Width = DL->getTypeSizeInBits(V->getType());
21660 }
21661
21662 for (Instruction *I : Visited)
21663 InstrElementSize[I] = Width;
21664
21665 return Width;
21666}
21667
21668bool BoUpSLP::collectValuesToDemote(
21669 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21670 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21671 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21672 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21673 // We can always demote constants.
21674 if (all_of(E.Scalars, IsaPred<Constant>))
21675 return true;
21676
21677 unsigned OrigBitWidth =
21678 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21679 if (OrigBitWidth == BitWidth) {
21680 MaxDepthLevel = 1;
21681 return true;
21682 }
21683
21684 // Check if the node was analyzed already and must keep its original bitwidth.
21685 if (NodesToKeepBWs.contains(E.Idx))
21686 return false;
21687
21688 // If the value is not a vectorized instruction in the expression and not used
21689 // by the insertelement instruction and not used in multiple vector nodes, it
21690 // cannot be demoted.
21691 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21692 if (isa<PoisonValue>(R))
21693 return false;
21694 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21695 });
21696 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21697 if (isa<PoisonValue>(V))
21698 return true;
21699 if (getTreeEntries(V).size() > 1)
21700 return false;
21701 // For the last shuffle of sext/zext with many uses, we need to check the
21702 // extra bit for unsigned values, otherwise we may have incorrect casting for
21703 // reused scalars.
21704 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21705 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21706 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21707 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21708 return true;
21709 }
21710 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
21711 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
21712 if (IsSignedNode)
21713 ++BitWidth1;
21714 if (auto *I = dyn_cast<Instruction>(V)) {
21715 APInt Mask = DB->getDemandedBits(I);
21716 unsigned BitWidth2 =
21717 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
21718 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
21719 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
21720 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21721 break;
21722 BitWidth2 *= 2;
21723 }
21724 BitWidth1 = std::min(BitWidth1, BitWidth2);
21725 }
21726 BitWidth = std::max(BitWidth, BitWidth1);
21727 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
21728 };
21729 auto FinalAnalysis = [&, TTI = TTI]() {
21730 if (!IsProfitableToDemote)
21731 return false;
21732 bool Res = all_of(
21733 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
21734 // Demote gathers.
21735 if (Res && E.isGather()) {
21736 if (E.hasState()) {
21737 if (const TreeEntry *SameTE =
21738 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
21739 SameTE)
21740 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
21741 ToDemote, Visited, NodesToKeepBWs,
21742 MaxDepthLevel, IsProfitableToDemote,
21743 IsTruncRoot)) {
21744 ToDemote.push_back(E.Idx);
21745 return true;
21746 }
21747 }
21748 // Check possible extractelement instructions bases and final vector
21749 // length.
21750 SmallPtrSet<Value *, 4> UniqueBases;
21751 for (Value *V : E.Scalars) {
21752 auto *EE = dyn_cast<ExtractElementInst>(V);
21753 if (!EE)
21754 continue;
21755 UniqueBases.insert(EE->getVectorOperand());
21756 }
21757 const unsigned VF = E.Scalars.size();
21758 Type *OrigScalarTy = E.Scalars.front()->getType();
21759 if (UniqueBases.size() <= 2 ||
21760 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
21761 ::getNumberOfParts(
21762 *TTI,
21763 getWidenedType(
21764 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
21765 VF))) {
21766 ToDemote.push_back(E.Idx);
21767 return true;
21768 }
21769 }
21770 return Res;
21771 };
21772 if (E.isGather() || !Visited.insert(&E).second ||
21773 any_of(E.Scalars, [&](Value *V) {
21774 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
21775 return isa<InsertElementInst>(U) && !isVectorized(U);
21776 });
21777 }))
21778 return FinalAnalysis();
21779
21780 if (any_of(E.Scalars, [&](Value *V) {
21781 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
21782 return isVectorized(U) ||
21783 (E.Idx == 0 && UserIgnoreList &&
21784 UserIgnoreList->contains(U)) ||
21785 (!isa<CmpInst>(U) && U->getType()->isSized() &&
21786 !U->getType()->isScalableTy() &&
21787 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
21788 }) && !IsPotentiallyTruncated(V, BitWidth);
21789 }))
21790 return false;
21791
21792 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
21793 bool &NeedToExit) {
21794 NeedToExit = false;
21795 unsigned InitLevel = MaxDepthLevel;
21796 for (const TreeEntry *Op : Operands) {
21797 unsigned Level = InitLevel;
21798 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
21799 ToDemote, Visited, NodesToKeepBWs, Level,
21800 IsProfitableToDemote, IsTruncRoot)) {
21801 if (!IsProfitableToDemote)
21802 return false;
21803 NeedToExit = true;
21804 if (!FinalAnalysis())
21805 return false;
21806 continue;
21807 }
21808 MaxDepthLevel = std::max(MaxDepthLevel, Level);
21809 }
21810 return true;
21811 };
21812 auto AttemptCheckBitwidth =
21813 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
21814 // Try all bitwidth < OrigBitWidth.
21815 NeedToExit = false;
21816 unsigned BestFailBitwidth = 0;
21817 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
21818 if (Checker(BitWidth, OrigBitWidth))
21819 return true;
21820 if (BestFailBitwidth == 0 && FinalAnalysis())
21821 BestFailBitwidth = BitWidth;
21822 }
21823 if (BitWidth >= OrigBitWidth) {
21824 if (BestFailBitwidth == 0) {
21825 BitWidth = OrigBitWidth;
21826 return false;
21827 }
21828 MaxDepthLevel = 1;
21829 BitWidth = BestFailBitwidth;
21830 NeedToExit = true;
21831 return true;
21832 }
21833 return false;
21834 };
21835 auto TryProcessInstruction =
21836 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
21837 function_ref<bool(unsigned, unsigned)> Checker = {}) {
21838 if (Operands.empty()) {
21839 if (!IsTruncRoot)
21840 MaxDepthLevel = 1;
21841 for (Value *V : E.Scalars)
21842 (void)IsPotentiallyTruncated(V, BitWidth);
21843 } else {
21844 // Several vectorized uses? Check if we can truncate it; otherwise,
21845 // exit.
21846 if (any_of(E.Scalars, [&](Value *V) {
21847 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
21848 }))
21849 return false;
21850 bool NeedToExit = false;
21851 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
21852 return false;
21853 if (NeedToExit)
21854 return true;
21855 if (!ProcessOperands(Operands, NeedToExit))
21856 return false;
21857 if (NeedToExit)
21858 return true;
21859 }
21860
21861 ++MaxDepthLevel;
21862 // Record the entry that we can demote.
21863 ToDemote.push_back(E.Idx);
21864 return IsProfitableToDemote;
21865 };
21866
21867 if (E.State == TreeEntry::SplitVectorize)
21868 return TryProcessInstruction(
21869 BitWidth,
21870 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
21871 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
21872
21873 switch (E.getOpcode()) {
21874
21875 // We can always demote truncations and extensions. Since truncations can
21876 // seed additional demotion, we save the truncated value.
21877 case Instruction::Trunc:
21878 if (IsProfitableToDemoteRoot)
21879 IsProfitableToDemote = true;
21880 return TryProcessInstruction(BitWidth);
21881 case Instruction::ZExt:
21882 case Instruction::SExt:
21883 IsProfitableToDemote = true;
21884 return TryProcessInstruction(BitWidth);
21885
21886 // We can demote certain binary operations if we can demote both of their
21887 // operands.
21888 case Instruction::Add:
21889 case Instruction::Sub:
21890 case Instruction::Mul:
21891 case Instruction::And:
21892 case Instruction::Or:
21893 case Instruction::Xor: {
21894 return TryProcessInstruction(
21895 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
21896 }
21897 case Instruction::Freeze:
21898 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
21899 case Instruction::Shl: {
21900 // If we are truncating the result of this SHL, and if it's a shift of an
21901 // in-range amount, we can always perform a SHL in a smaller type.
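// For example, (trunc (shl i32 %x, 5) to i16) can instead be computed as an
// i16 shl of the truncated operand, because the shift amount 5 is known to be
// less than 16.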
21902 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
21903 return all_of(E.Scalars, [&](Value *V) {
21904 if (isa<PoisonValue>(V))
21905 return true;
21906 auto *I = cast<Instruction>(V);
21907 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21908 return AmtKnownBits.getMaxValue().ult(BitWidth);
21909 });
21910 };
21911 return TryProcessInstruction(
21912 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
21913 }
21914 case Instruction::LShr: {
21915 // If this is a truncate of a logical shr, we can truncate it to a smaller
21916 // lshr iff we know that the bits we would otherwise be shifting in are
21917 // already zeros.
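// For example, an i32 lshr can be narrowed to i16 when the shift amount is
// known to be less than 16 and the upper 16 bits of the shifted operand are
// known to be zero, so the narrow shift produces the same low 16 bits.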
21918 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21919 return all_of(E.Scalars, [&](Value *V) {
21920 if (isa<PoisonValue>(V))
21921 return true;
21922 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21923 if (E.isCopyableElement(V))
21924 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
21925 auto *I = cast<Instruction>(V);
21926 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21927 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
21928 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
21929 SimplifyQuery(*DL));
21930 });
21931 };
21932 return TryProcessInstruction(
21933 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
21934 LShrChecker);
21935 }
21936 case Instruction::AShr: {
21937 // If this is a truncate of an arithmetic shr, we can truncate it to a
21938 // smaller ashr iff we know that all the bits between the sign bit of the
21939 // original type and the sign bit of the truncated type are copies of the sign bit.
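// For example, an i32 ashr can be narrowed to i16 when the shift amount is
// known to be less than 16 and the operand has more than 32 - 16 = 16 sign
// bits, so the i16 sign bit equals the original sign bit.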
21940 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21941 return all_of(E.Scalars, [&](Value *V) {
21942 if (isa<PoisonValue>(V))
21943 return true;
21944 auto *I = cast<Instruction>(V);
21945 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21946 unsigned ShiftedBits = OrigBitWidth - BitWidth;
21947 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
21948 ShiftedBits <
21949 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
21950 });
21951 };
21952 return TryProcessInstruction(
21953 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
21954 AShrChecker);
21955 }
21956 case Instruction::UDiv:
21957 case Instruction::URem: {
21958 // UDiv and URem can be truncated if all the truncated bits are zero.
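// For example, a udiv of two i32 values can be performed in i16 when the upper
// 16 bits of both operands are known to be zero.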
21959 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21960 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
21961 return all_of(E.Scalars, [&](Value *V) {
21962 auto *I = cast<Instruction>(V);
21963 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21964 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
21965 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
21966 });
21967 };
21968 return TryProcessInstruction(
21969 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
21970 }
21971
21972 // We can demote selects if we can demote their true and false values.
21973 case Instruction::Select: {
21974 return TryProcessInstruction(
21975 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
21976 }
21977
21978 // We can demote phis if we can demote all their incoming operands.
21979 case Instruction::PHI: {
21980 const unsigned NumOps = E.getNumOperands();
21981 SmallVector<const TreeEntry *> Ops(NumOps);
21982 transform(seq<unsigned>(0, NumOps), Ops.begin(),
21983 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
21984
21985 return TryProcessInstruction(BitWidth, Ops);
21986 }
21987
21988 case Instruction::Call: {
21989 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
21990 if (!IC)
21991 break;
21992 Intrinsic::ID ID = IC->getIntrinsicID();
21993 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
21994 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
21995 break;
21996 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
21997 function_ref<bool(unsigned, unsigned)> CallChecker;
21998 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
21999 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22000 return all_of(E.Scalars, [&](Value *V) {
22001 auto *I = cast<Instruction>(V);
22002 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22003 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22004 return MaskedValueIsZero(I->getOperand(0), Mask,
22005 SimplifyQuery(*DL)) &&
22006 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22007 }
22008 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22009 "Expected min/max intrinsics only.");
22010 unsigned SignBits = OrigBitWidth - BitWidth;
22011 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22012 unsigned Op0SignBits =
22013 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22014 unsigned Op1SignBits =
22015 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22016 return SignBits <= Op0SignBits &&
22017 ((SignBits != Op0SignBits &&
22018 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22019 MaskedValueIsZero(I->getOperand(0), Mask,
22020 SimplifyQuery(*DL))) &&
22021 SignBits <= Op1SignBits &&
22022 ((SignBits != Op1SignBits &&
22023 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22024 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22025 });
22026 };
22027 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22028 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22029 return all_of(E.Scalars, [&](Value *V) {
22030 auto *I = cast<Instruction>(V);
22031 unsigned SignBits = OrigBitWidth - BitWidth;
22032 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22033 unsigned Op0SignBits =
22034 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22035 return SignBits <= Op0SignBits &&
22036 ((SignBits != Op0SignBits &&
22037 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22038 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22039 });
22040 };
22041 if (ID != Intrinsic::abs) {
22042 Operands.push_back(getOperandEntry(&E, 1));
22043 CallChecker = CompChecker;
22044 } else {
22045 CallChecker = AbsChecker;
22046 }
22047 InstructionCost BestCost =
22048 std::numeric_limits<InstructionCost::CostType>::max();
22049 unsigned BestBitWidth = BitWidth;
22050 unsigned VF = E.Scalars.size();
22051 // Choose the best bitwidth based on cost estimations.
22052 auto Checker = [&](unsigned BitWidth, unsigned) {
22053 unsigned MinBW = PowerOf2Ceil(BitWidth);
22054 SmallVector<Type *> ArgTys =
22055 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22056 auto VecCallCosts = getVectorCallCosts(
22057 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22058 TTI, TLI, ArgTys);
22059 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22060 if (Cost < BestCost) {
22061 BestCost = Cost;
22062 BestBitWidth = BitWidth;
22063 }
22064 return false;
22065 };
22066 [[maybe_unused]] bool NeedToExit;
22067 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22068 BitWidth = BestBitWidth;
22069 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22070 }
22071
22072 // Otherwise, conservatively give up.
22073 default:
22074 break;
22075 }
22076 MaxDepthLevel = 1;
22077 return FinalAnalysis();
22078}
22079
22080static RecurKind getRdxKind(Value *V);
22081
22082 void BoUpSLP::computeMinimumValueSizes() {
22083 // We only attempt to truncate integer expressions.
22084 bool IsStoreOrInsertElt =
22085 VectorizableTree.front()->hasState() &&
22086 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22087 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22088 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22089 ExtraBitWidthNodes.size() <= 1 &&
22090 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22091 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22092 return;
22093
22094 unsigned NodeIdx = 0;
22095 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22096 NodeIdx = 1;
22097
22098 // Ensure the roots of the vectorizable tree don't form a cycle.
22099 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22100 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22101 "Unexpected tree is graph.");
22102
22103 // If the first value node for a store/insertelement is a sext/zext/trunc,
22104 // skip it and resize to the final type.
22105 bool IsTruncRoot = false;
22106 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22107 SmallVector<unsigned> RootDemotes;
22108 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22109 if (NodeIdx != 0 &&
22110 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22111 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22112 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22113 IsTruncRoot = true;
22114 RootDemotes.push_back(NodeIdx);
22115 IsProfitableToDemoteRoot = true;
22116 ++NodeIdx;
22117 }
22118
22119 // The reduction was analyzed already and found not profitable - exit.
22120 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22121 return;
22122
22123 SmallVector<unsigned> ToDemote;
22124 auto ComputeMaxBitWidth =
22125 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22126 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22127 ToDemote.clear();
22128 // If the root is a trunc and the next node is a gather/buildvector, keep the
22129 // trunc in scalars, which is free in most cases.
22130 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22131 !NodesToKeepBWs.contains(E.Idx) &&
22132 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22133 all_of(E.Scalars, [&](Value *V) {
22134 return V->hasOneUse() || isa<Constant>(V) ||
22135 (!V->hasNUsesOrMore(UsesLimit) &&
22136 none_of(V->users(), [&](User *U) {
22137 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22138 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22139 if (TEs.empty() || is_contained(TEs, UserTE))
22140 return false;
22141 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22142 SelectInst>(U) ||
22143 isa<SIToFPInst, UIToFPInst>(U) ||
22144 (UserTE->hasState() &&
22145 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22146 SelectInst>(UserTE->getMainOp()) ||
22147 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22148 return true;
22149 unsigned UserTESz = DL->getTypeSizeInBits(
22150 UserTE->Scalars.front()->getType());
22151 if (all_of(TEs, [&](const TreeEntry *TE) {
22152 auto It = MinBWs.find(TE);
22153 return It != MinBWs.end() &&
22154 It->second.first > UserTESz;
22155 }))
22156 return true;
22157 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22158 }));
22159 })) {
22160 ToDemote.push_back(E.Idx);
22161 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22162 auto It = MinBWs.find(UserTE);
22163 if (It != MinBWs.end())
22164 return It->second.first;
22165 unsigned MaxBitWidth =
22166 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22167 MaxBitWidth = bit_ceil(MaxBitWidth);
22168 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22169 MaxBitWidth = 8;
22170 return MaxBitWidth;
22171 }
22172
22173 if (!E.hasState())
22174 return 0u;
22175
22176 unsigned VF = E.getVectorFactor();
22177 Type *ScalarTy = E.Scalars.front()->getType();
22178 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22179 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22180 if (!TreeRootIT)
22181 return 0u;
22182
22183 if (any_of(E.Scalars,
22184 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22185 return 0u;
22186
22187 unsigned NumParts = ::getNumberOfParts(
22188 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22189
22190 // The maximum bit width required to represent all the values that can be
22191 // demoted without loss of precision. It would be safe to truncate the roots
22192 // of the expression to this width.
22193 unsigned MaxBitWidth = 1u;
22194
22195 // True if the roots can be zero-extended back to their original type,
22196 // rather than sign-extended. We know that if the leading bits are not
22197 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22198 // True.
22199 // Determine if the sign bit of all the roots is known to be zero. If not,
22200 // IsKnownPositive is set to False.
22201 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22202 if (isa<PoisonValue>(R))
22203 return true;
22204 KnownBits Known = computeKnownBits(R, *DL);
22205 return Known.isNonNegative();
22206 });
22207
22208 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22209 E.UserTreeIndex.UserTE->hasState() &&
22210 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22211 MaxBitWidth =
22212 std::min(DL->getTypeSizeInBits(
22213 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22214 DL->getTypeSizeInBits(ScalarTy));
22215
22216 // We first check if all the bits of the roots are demanded. If they're not,
22217 // we can truncate the roots to this narrower type.
22218 for (Value *Root : E.Scalars) {
22219 if (isa<PoisonValue>(Root))
22220 continue;
22221 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22222 TypeSize NumTypeBits =
22223 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22224 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22225 // If we can't prove that the sign bit is zero, we must add one to the
22226 // maximum bit width to account for the unknown sign bit. This preserves
22227 // the existing sign bit so we can safely sign-extend the root back to the
22228 // original type. Otherwise, if we know the sign bit is zero, we will
22229 // zero-extend the root instead.
22230 //
22231 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22232 // one to the maximum bit width will yield a larger-than-necessary
22233 // type. In general, we need to add an extra bit only if we can't
22234 // prove that the upper bit of the original type is equal to the
22235 // upper bit of the proposed smaller type. If these two bits are
22236 // the same (either zero or one) we know that sign-extending from
22237 // the smaller type will result in the same value. Here, since we
22238 // can't yet prove this, we are just making the proposed smaller
22239 // type larger to ensure correctness.
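// For example, an i32 root with 25 known sign bits needs 32 - 25 = 7 bits; if
// the sign bit is not known to be zero, one more bit is added, giving 8.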
22240 if (!IsKnownPositive)
22241 ++BitWidth1;
22242
22243 auto *I = dyn_cast<Instruction>(Root);
22244 if (!I) {
22245 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22246 continue;
22247 }
22248 APInt Mask = DB->getDemandedBits(I);
22249 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22250 MaxBitWidth =
22251 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22252 }
22253
22254 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22255 MaxBitWidth = 8;
22256
22257 // If the original type is large but the reduced type does not improve
22258 // register usage - ignore it.
22259 if (NumParts > 1 &&
22260 NumParts ==
22261 ::getNumberOfParts(
22262 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22263 bit_ceil(MaxBitWidth)),
22264 VF)))
22265 return 0u;
22266
22267 unsigned Opcode = E.getOpcode();
22268 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22269 Opcode == Instruction::SExt ||
22270 Opcode == Instruction::ZExt || NumParts > 1;
22271 // Conservatively determine if we can actually truncate the roots of the
22272 // expression. Collect the values that can be demoted in ToDemote and
22273 // additional roots that require investigating in Roots.
22274 DenseSet<const TreeEntry *> Visited;
22275 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22276 bool NeedToDemote = IsProfitableToDemote;
22277
22278 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22279 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22280 NeedToDemote, IsTruncRoot) ||
22281 (MaxDepthLevel <= Limit &&
22282 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22283 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22284 DL->getTypeSizeInBits(TreeRootIT) /
22285 DL->getTypeSizeInBits(
22286 E.getMainOp()->getOperand(0)->getType()) >
22287 2)))))
22288 return 0u;
22289 // Round MaxBitWidth up to the next power-of-two.
22290 MaxBitWidth = bit_ceil(MaxBitWidth);
22291
22292 return MaxBitWidth;
22293 };
22294
22295 // If we can truncate the root, we must collect additional values that might
22296 // be demoted as a result. That is, those seeded by truncations we will
22297 // modify.
22298 // Add reduction ops sizes, if any.
22299 if (UserIgnoreList &&
22300 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22301 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22302 // x i1> to iN)).
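// For example (illustrative IR, not taken from a specific test):
//   %e = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
// can instead be computed as
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %c to i32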
22303 if (all_of(*UserIgnoreList,
22304 [](Value *V) {
22305 return isa<PoisonValue>(V) ||
22306 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22307 }) &&
22308 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22309 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22310 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22311 Builder.getInt1Ty()) {
22312 ReductionBitWidth = 1;
22313 } else {
22314 for (Value *V : *UserIgnoreList) {
22315 if (isa<PoisonValue>(V))
22316 continue;
22317 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22318 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22319 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22320 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22321 ++BitWidth1;
22322 unsigned BitWidth2 = BitWidth1;
22323 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
22324 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22325 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22326 }
22327 ReductionBitWidth =
22328 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22329 }
22330 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22331 ReductionBitWidth = 8;
22332
22333 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22334 }
22335 }
22336 bool IsTopRoot = NodeIdx == 0;
22337 while (NodeIdx < VectorizableTree.size() &&
22338 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22339 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22340 RootDemotes.push_back(NodeIdx);
22341 ++NodeIdx;
22342 IsTruncRoot = true;
22343 }
22344 bool IsSignedCmp = false;
22345 if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
22346 return match(V, m_SMin(m_Value(), m_Value())) ||
22347 match(V, m_SMax(m_Value(), m_Value()));
22348 }))
22349 IsSignedCmp = true;
22350 while (NodeIdx < VectorizableTree.size()) {
22351 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22352 unsigned Limit = 2;
22353 if (IsTopRoot &&
22354 ReductionBitWidth ==
22355 DL->getTypeSizeInBits(
22356 VectorizableTree.front()->Scalars.front()->getType()))
22357 Limit = 3;
22358 unsigned MaxBitWidth = ComputeMaxBitWidth(
22359 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22360 IsTruncRoot, IsSignedCmp);
22361 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22362 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22363 ReductionBitWidth = bit_ceil(MaxBitWidth);
22364 else if (MaxBitWidth == 0)
22365 ReductionBitWidth = 0;
22366 }
22367
22368 for (unsigned Idx : RootDemotes) {
22369 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22370 uint32_t OrigBitWidth =
22371 DL->getTypeSizeInBits(V->getType()->getScalarType());
22372 if (OrigBitWidth > MaxBitWidth) {
22373 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22374 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22375 }
22376 return false;
22377 }))
22378 ToDemote.push_back(Idx);
22379 }
22380 RootDemotes.clear();
22381 IsTopRoot = false;
22382 IsProfitableToDemoteRoot = true;
22383
22384 if (ExtraBitWidthNodes.empty()) {
22385 NodeIdx = VectorizableTree.size();
22386 } else {
22387 unsigned NewIdx = 0;
22388 do {
22389 NewIdx = *ExtraBitWidthNodes.begin();
22390 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22391 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22392 NodeIdx = NewIdx;
22393 IsTruncRoot =
22394 NodeIdx < VectorizableTree.size() &&
22395 VectorizableTree[NodeIdx]->UserTreeIndex &&
22396 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22397 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22398 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22399 Instruction::Trunc &&
22400 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22401 IsSignedCmp =
22402 NodeIdx < VectorizableTree.size() &&
22403 VectorizableTree[NodeIdx]->UserTreeIndex &&
22404 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22405 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22406 Instruction::ICmp &&
22407 any_of(
22408 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22409 [&](Value *V) {
22410 auto *IC = dyn_cast<ICmpInst>(V);
22411 return IC && (IC->isSigned() ||
22412 !isKnownNonNegative(IC->getOperand(0),
22413 SimplifyQuery(*DL)) ||
22414 !isKnownNonNegative(IC->getOperand(1),
22415 SimplifyQuery(*DL)));
22416 });
22417 }
22418
22419 // If the maximum bit width we compute is less than the width of the roots'
22420 // type, we can proceed with the narrowing. Otherwise, do nothing.
22421 if (MaxBitWidth == 0 ||
22422 MaxBitWidth >=
22423 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22424 ->getBitWidth()) {
22425 if (UserIgnoreList)
22426 AnalyzedMinBWVals.insert_range(TreeRoot);
22427 NodesToKeepBWs.insert_range(ToDemote);
22428 continue;
22429 }
22430
22431 // Finally, map the values we can demote to the maximum bit width we
22432 // computed.
22433 for (unsigned Idx : ToDemote) {
22434 TreeEntry *TE = VectorizableTree[Idx].get();
22435 if (MinBWs.contains(TE))
22436 continue;
22437 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22438 if (isa<PoisonValue>(R))
22439 return false;
22440 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22441 });
22442 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22443 }
22444 }
22445}
22446
22447PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22448 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22449 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22450 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22451 auto *AA = &AM.getResult<AAManager>(F);
22452 auto *LI = &AM.getResult<LoopAnalysis>(F);
22453 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22454 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22455 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22456 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22457
22458 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22459 if (!Changed)
22460 return PreservedAnalyses::all();
22461
22462 PreservedAnalyses PA;
22463 PA.preserveSet<CFGAnalyses>();
22464 return PA;
22465}
22466
22467bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22468 TargetTransformInfo *TTI_,
22469 TargetLibraryInfo *TLI_, AAResults *AA_,
22470 LoopInfo *LI_, DominatorTree *DT_,
22471 AssumptionCache *AC_, DemandedBits *DB_,
22472 OptimizationRemarkEmitter *ORE_) {
22473 if (!RunSLPVectorization)
22474 return false;
22475 SE = SE_;
22476 TTI = TTI_;
22477 TLI = TLI_;
22478 AA = AA_;
22479 LI = LI_;
22480 DT = DT_;
22481 AC = AC_;
22482 DB = DB_;
22483 DL = &F.getDataLayout();
22484
22485 Stores.clear();
22486 GEPs.clear();
22487 bool Changed = false;
22488
22489 // If the target claims to have no vector registers don't attempt
22490 // vectorization.
22491 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22492 LLVM_DEBUG(
22493 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22494 return false;
22495 }
22496
22497 // Don't vectorize when the attribute NoImplicitFloat is used.
22498 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22499 return false;
22500
22501 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22502
22503 // Use the bottom up slp vectorizer to construct chains that start with
22504 // store instructions.
22505 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22506
22507 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22508 // delete instructions.
22509
22510 // Update DFS numbers now so that we can use them for ordering.
22511 DT->updateDFSNumbers();
22512
22513 // Scan the blocks in the function in post order.
22514 for (auto *BB : post_order(&F.getEntryBlock())) {
22515 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
22516 continue;
22517
22518 // Start new block - clear the list of reduction roots.
22519 R.clearReductionData();
22520 collectSeedInstructions(BB);
22521
22522 // Vectorize trees that end at stores.
22523 if (!Stores.empty()) {
22524 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22525 << " underlying objects.\n");
22526 Changed |= vectorizeStoreChains(R);
22527 }
22528
22529 // Vectorize trees that end at reductions.
22530 Changed |= vectorizeChainsInBlock(BB, R);
22531
22532 // Vectorize the index computations of getelementptr instructions. This
22533 // is primarily intended to catch gather-like idioms ending at
22534 // non-consecutive loads.
22535 if (!GEPs.empty()) {
22536 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22537 << " underlying objects.\n");
22538 Changed |= vectorizeGEPIndices(BB, R);
22539 }
22540 }
22541
22542 if (Changed) {
22543 R.optimizeGatherSequence();
22544 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22545 }
22546 return Changed;
22547}
22548
22549std::optional<bool>
22550SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22551 unsigned Idx, unsigned MinVF,
22552 unsigned &Size) {
22553 Size = 0;
22554 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22555 << "\n");
22556 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22557 unsigned VF = Chain.size();
22558
22559 if (!has_single_bit(Sz) ||
22560 !hasFullVectorsOrPowerOf2(
22561 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22562 VF) ||
22563 VF < 2 || VF < MinVF) {
22564 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22565 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22566 // all vector lanes are used.
22567 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22568 return false;
22569 }
22570
22571 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22572 << "\n");
22573
22574 SetVector<Value *> ValOps;
22575 for (Value *V : Chain)
22576 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22577 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
22578 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22579 InstructionsState S = Analysis.buildInstructionsState(
22580 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22581 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22582 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22583 bool IsAllowedSize =
22584 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22585 ValOps.size()) ||
22586 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22587 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22588 (!S.getMainOp()->isSafeToRemove() ||
22589 any_of(ValOps.getArrayRef(),
22590 [&](Value *V) {
22591 return !isa<ExtractElementInst>(V) &&
22592 (V->getNumUses() > Chain.size() ||
22593 any_of(V->users(), [&](User *U) {
22594 return !Stores.contains(U);
22595 }));
22596 }))) ||
22597 (ValOps.size() > Chain.size() / 2 && !S)) {
22598 Size = (!IsAllowedSize && S) ? 1 : 2;
22599 return false;
22600 }
22601 }
22602 if (R.isLoadCombineCandidate(Chain))
22603 return true;
22604 R.buildTree(Chain);
22605 // Check if the tree is tiny and the store itself or its value is not vectorized.
22606 if (R.isTreeTinyAndNotFullyVectorizable()) {
22607 if (R.isGathered(Chain.front()) ||
22608 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22609 return std::nullopt;
22610 Size = R.getCanonicalGraphSize();
22611 return false;
22612 }
22613 if (R.isProfitableToReorder()) {
22614 R.reorderTopToBottom();
22615 R.reorderBottomToTop();
22616 }
22617 R.transformNodes();
22618 R.buildExternalUses();
22619
22620 R.computeMinimumValueSizes();
22621
22622 Size = R.getCanonicalGraphSize();
22623 if (S && S.getOpcode() == Instruction::Load)
22624 Size = 2; // cut off masked gather small trees
22625 InstructionCost Cost = R.getTreeCost();
22626
22627 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22628 if (Cost < -SLPCostThreshold) {
22629 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22630
22631 using namespace ore;
22632
22633 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22634 cast<StoreInst>(Chain[0]))
22635 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22636 << " and with tree size "
22637 << NV("TreeSize", R.getTreeSize()));
22638
22639 R.vectorizeTree();
22640 return true;
22641 }
22642
22643 return false;
22644}
22645
22646/// Checks if the quadratic mean deviation is less than 90% of the mean size.
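/// For example (illustrative): sizes {3, 3, 3} have zero deviation and pass the
/// check, while sizes {2, 16} deviate far from their mean and fail it; entries
/// equal to 1 are ignored entirely.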
22647static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22648 bool First) {
22649 unsigned Num = 0;
22650 uint64_t Sum = std::accumulate(
22651 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22652 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22653 unsigned Size = First ? Val.first : Val.second;
22654 if (Size == 1)
22655 return V;
22656 ++Num;
22657 return V + Size;
22658 });
22659 if (Num == 0)
22660 return true;
22661 uint64_t Mean = Sum / Num;
22662 if (Mean == 0)
22663 return true;
22664 uint64_t Dev = std::accumulate(
22665 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22666 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22667 unsigned P = First ? Val.first : Val.second;
22668 if (P == 1)
22669 return V;
22670 return V + (P - Mean) * (P - Mean);
22671 }) /
22672 Num;
22673 return Dev * 81 / (Mean * Mean) == 0;
22674}
22675
22676namespace {
22677
22678/// A group of stores that we'll try to bundle together using vector ops.
22679/// They are ordered using the signed distance of their address operand to the
22680/// address of this group's BaseInstr.
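/// For example (illustrative): if the BaseInstr stores to %p+2, a store to %p
/// is recorded with distance -2 and a store to %p+3 with distance +1.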
22681class RelatedStoreInsts {
22682public:
22683 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
22684 : AllStores(AllStores) {
22685 reset(BaseInstrIdx);
22686 }
22687
22688 void reset(unsigned NewBaseInstr) {
22689 assert(NewBaseInstr < AllStores.size() &&
22690 "Instruction index out of bounds");
22691 BaseInstrIdx = NewBaseInstr;
22692 Instrs.clear();
22693 insertOrLookup(NewBaseInstr, 0);
22694 }
22695
22696 /// Tries to insert \p InstrIdx as the store with a pointer distance of
22697 /// \p PtrDist.
22698 /// Does nothing if there is already a store with that \p PtrDist.
22699 /// \returns The previously associated Instruction index, or std::nullopt
22700 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
22701 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
22702 return Inserted ? std::nullopt : std::make_optional(It->second);
22703 }
22704
22705 using DistToInstMap = std::map<int64_t, unsigned>;
22706 const DistToInstMap &getStores() const { return Instrs; }
22707
22708 /// If \p SI is related to this group of stores, return the distance of its
22709 /// pointer operand to the one of the group's BaseInstr.
22710 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
22711 ScalarEvolution &SE) const {
22712 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
22713 return getPointersDiff(
22714 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
22715 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
22716 /*StrictCheck=*/true);
22717 }
22718
22719 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
22720 /// Stores whose index is less than \p MinSafeIdx will be dropped.
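/// For example (illustrative): when rebasing onto a store whose old distance
/// was DistFromCurBase == 2, a surviving store previously recorded at distance
/// 3 is re-inserted at distance 3 - 2 == 1.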
22721 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
22722 int64_t DistFromCurBase) {
22723 DistToInstMap PrevSet = std::move(Instrs);
22724 reset(NewBaseInstIdx);
22725
22726 // Re-insert stores that come after MinSafeIdx to try and vectorize them
22727 // again. Their distance will be "rebased" to use NewBaseInstIdx as
22728 // reference.
22729 for (auto [Dist, InstIdx] : PrevSet) {
22730 if (InstIdx >= MinSafeIdx)
22731 insertOrLookup(InstIdx, Dist - DistFromCurBase);
22732 }
22733 }
22734
22735 /// Remove all stores that have been vectorized from this group.
22736 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
22737 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
22738 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
22739 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
22740 });
22741
22742 // Get a forward iterator pointing after the last vectorized store and erase
22743 // all stores before it so we don't try to vectorize them again.
22744 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
22745 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
22746 }
22747
22748private:
22749 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
22750 unsigned BaseInstrIdx;
22751
22752 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
22753 DistToInstMap Instrs;
22754
22755 /// Reference to all the stores in the BB being analyzed.
22756 ArrayRef<StoreInst *> AllStores;
22757};
22758
22759} // end anonymous namespace
22760
22761bool SLPVectorizerPass::vectorizeStores(
22762 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
22763 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
22764 &Visited) {
22765 // We may run into multiple chains that merge into a single chain. We mark the
22766 // stores that we vectorized so that we don't visit the same store twice.
22767 BoUpSLP::ValueSet VectorizedStores;
22768 bool Changed = false;
22769
22770 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
22771 int64_t PrevDist = -1;
22772 BoUpSLP::ValueList Operands;
22773 // Collect the chain into a list.
22774 for (auto [Idx, Data] : enumerate(StoreSeq)) {
22775 auto &[Dist, InstIdx] = Data;
22776 if (Operands.empty() || Dist - PrevDist == 1) {
22777 Operands.push_back(Stores[InstIdx]);
22778 PrevDist = Dist;
22779 if (Idx != StoreSeq.size() - 1)
22780 continue;
22781 }
22782 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
22783 Operands.clear();
22784 Operands.push_back(Stores[InstIdx]);
22785 PrevDist = Dist;
22786 });
22787
22788 if (Operands.size() <= 1 ||
22789 !Visited
22790 .insert({Operands.front(),
22791 cast<StoreInst>(Operands.front())->getValueOperand(),
22792 Operands.back(),
22793 cast<StoreInst>(Operands.back())->getValueOperand(),
22794 Operands.size()})
22795 .second)
22796 continue;
22797
22798 unsigned MaxVecRegSize = R.getMaxVecRegSize();
22799 unsigned EltSize = R.getVectorElementSize(Operands[0]);
22800 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
22801
22802 unsigned MaxVF =
22803 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
22804 auto *Store = cast<StoreInst>(Operands[0]);
22805 Type *StoreTy = Store->getValueOperand()->getType();
22806 Type *ValueTy = StoreTy;
22807 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
22808 ValueTy = Trunc->getSrcTy();
22809 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
22810 // getStoreMinimumVF only support scalar type as arguments. As a result,
22811 // we need to use the element type of StoreTy and ValueTy to retrieve the
22812 // VF and then transform it back.
22813 // Remember: VF is defined as the number we want to vectorize, not the
22814 // number of elements in the final vector.
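// Illustrative example (numbers hypothetical): with REVEC, if StoreTy is
// <4 x i32> and getStoreMinimumVF() returns 8 for the scalar i32 type, then
// MinVF becomes 8 / getNumElements(StoreTy) = 2, i.e. two <4 x i32> stores.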
22815 Type *StoreScalarTy = StoreTy->getScalarType();
22816 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
22817 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
22818 ValueTy->getScalarType()));
22819 MinVF /= getNumElements(StoreTy);
22820 MinVF = std::max<unsigned>(2, MinVF);
22821
22822 if (MaxVF < MinVF) {
22823 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22824 << ") < "
22825 << "MinVF (" << MinVF << ")\n");
22826 continue;
22827 }
22828
22829 unsigned NonPowerOf2VF = 0;
22830 if (VectorizeNonPowerOf2) {
22831 // First try vectorizing with a non-power-of-2 VF. At the moment, only
22832 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
22833 // lanes are used.
22834 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
22835 if (has_single_bit(CandVF + 1)) {
22836 NonPowerOf2VF = CandVF;
22837 assert(NonPowerOf2VF != MaxVF &&
22838 "Non-power-of-2 VF should not be equal to MaxVF");
22839 }
22840 }
22841
22842 // MaxRegVF represents the number of instructions (scalar, or vector in
22843 // case of revec) that can be vectorized to naturally fit in a vector
22844 // register.
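// E.g. (illustrative) with a 128-bit vector register and 32-bit elements,
// MaxElts is 4, so MaxRegVF will not exceed 4 scalar stores per register.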
22845 unsigned MaxRegVF = MaxVF;
22846
22847 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
22848 if (MaxVF < MinVF) {
22849 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
22850 << ") < "
22851 << "MinVF (" << MinVF << ")\n");
22852 continue;
22853 }
22854
22855 SmallVector<unsigned> CandidateVFs;
22856 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
22857 VF = divideCeil(VF, 2))
22858 CandidateVFs.push_back(VF);
22859
22860 unsigned End = Operands.size();
22861 unsigned Repeat = 0;
22862 constexpr unsigned MaxAttempts = 4;
22863 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
22864 for (std::pair<unsigned, unsigned> &P : RangeSizes)
22865 P.first = P.second = 1;
22866 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
22867 auto IsNotVectorized = [](bool First,
22868 const std::pair<unsigned, unsigned> &P) {
22869 return First ? P.first > 0 : P.second > 0;
22870 };
22871 auto IsVectorized = [](bool First,
22872 const std::pair<unsigned, unsigned> &P) {
22873 return First ? P.first == 0 : P.second == 0;
22874 };
22875 auto VFIsProfitable = [](bool First, unsigned Size,
22876 const std::pair<unsigned, unsigned> &P) {
22877 return First ? Size >= P.first : Size >= P.second;
22878 };
22879 auto FirstSizeSame = [](unsigned Size,
22880 const std::pair<unsigned, unsigned> &P) {
22881 return Size == P.first;
22882 };
22883 while (true) {
22884 ++Repeat;
22885 bool RepeatChanged = false;
22886 bool AnyProfitableGraph = false;
22887 for (unsigned VF : CandidateVFs) {
22888 AnyProfitableGraph = false;
22889 unsigned FirstUnvecStore =
22890 std::distance(RangeSizes.begin(),
22891 find_if(RangeSizes, std::bind(IsNotVectorized,
22892 VF >= MaxRegVF, _1)));
22893
22894 // Form slices of size VF starting from FirstUnvecStore and try to
22895 // vectorize them.
22896 while (FirstUnvecStore < End) {
22897 unsigned FirstVecStore = std::distance(
22898 RangeSizes.begin(),
22899 find_if(RangeSizes.drop_front(FirstUnvecStore),
22900 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
22901 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
22902 for (unsigned SliceStartIdx = FirstUnvecStore;
22903 SliceStartIdx + VF <= MaxSliceEnd;) {
22904 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
22905 VF >= MaxRegVF)) {
22906 ++SliceStartIdx;
22907 continue;
22908 }
22909 ArrayRef<Value *> Slice =
22910 ArrayRef(Operands).slice(SliceStartIdx, VF);
22911 assert(all_of(Slice,
22912 [&](Value *V) {
22913 return cast<StoreInst>(V)
22914 ->getValueOperand()
22915 ->getType() ==
22916 cast<StoreInst>(Slice.front())
22917 ->getValueOperand()
22918 ->getType();
22919 }) &&
22920 "Expected all operands of same type.");
22921 if (!NonSchedulable.empty()) {
22922 auto [NonSchedSizeMax, NonSchedSizeMin] =
22923 NonSchedulable.lookup(Slice.front());
22924 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
22925 // VF is too ambitious. Try to vectorize another slice before
22926 // trying a smaller VF.
22927 SliceStartIdx += NonSchedSizeMax;
22928 continue;
22929 }
22930 }
22931 unsigned TreeSize;
22932 std::optional<bool> Res =
22933 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
22934 if (!Res) {
22935 // Update the range of non schedulable VFs for slices starting
22936 // at SliceStartIdx.
22937 NonSchedulable
22938 .try_emplace(Slice.front(), std::make_pair(VF, VF))
22939 .first->getSecond()
22940 .second = VF;
22941 } else if (*Res) {
22942 // Mark the vectorized stores so that we don't vectorize them
22943 // again.
22944 VectorizedStores.insert_range(Slice);
22947 AnyProfitableGraph = RepeatChanged = Changed = true;
22948 // If we vectorized initial block, no need to try to vectorize
22949 // it again.
22950 for (std::pair<unsigned, unsigned> &P :
22951 RangeSizes.slice(SliceStartIdx, VF))
22952 P.first = P.second = 0;
22953 if (SliceStartIdx < FirstUnvecStore + MinVF) {
22954 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
22955 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
22956 P.first = P.second = 0;
22957 FirstUnvecStore = SliceStartIdx + VF;
22958 }
22959 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
22960 for (std::pair<unsigned, unsigned> &P :
22961 RangeSizes.slice(SliceStartIdx + VF,
22962 MaxSliceEnd - (SliceStartIdx + VF)))
22963 P.first = P.second = 0;
22964 if (MaxSliceEnd == End)
22965 End = SliceStartIdx;
22966 MaxSliceEnd = SliceStartIdx;
22967 }
22968 SliceStartIdx += VF;
22969 continue;
22970 }
22971 if (VF > 2 && Res &&
22972 !all_of(RangeSizes.slice(SliceStartIdx, VF),
22973 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
22974 _1))) {
22975 SliceStartIdx += VF;
22976 continue;
22977 }
22978 // For very big VFs, check that we are not rebuilding the same trees, just
22979 // with a larger number of elements.
22980 if (VF > MaxRegVF && TreeSize > 1 &&
22981 all_of(RangeSizes.slice(SliceStartIdx, VF),
22982 std::bind(FirstSizeSame, TreeSize, _1))) {
22983 SliceStartIdx += VF;
22984 while (SliceStartIdx != MaxSliceEnd &&
22985 RangeSizes[SliceStartIdx].first == TreeSize)
22986 ++SliceStartIdx;
22987 continue;
22988 }
22989 if (TreeSize > 1) {
22990 for (std::pair<unsigned, unsigned> &P :
22991 RangeSizes.slice(SliceStartIdx, VF)) {
22992 if (VF >= MaxRegVF)
22993 P.second = std::max(P.second, TreeSize);
22994 else
22995 P.first = std::max(P.first, TreeSize);
22996 }
22997 }
22998 ++SliceStartIdx;
22999 AnyProfitableGraph = true;
23000 }
23001 if (FirstUnvecStore >= End)
23002 break;
23003 if (MaxSliceEnd - FirstUnvecStore < VF &&
23004 MaxSliceEnd - FirstUnvecStore >= MinVF)
23005 AnyProfitableGraph = true;
23006 FirstUnvecStore = std::distance(
23007 RangeSizes.begin(),
23008 find_if(RangeSizes.drop_front(MaxSliceEnd),
23009 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23010 }
23011 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23012 break;
23013 }
23014 // All values vectorized - exit.
23015 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23016 return P.first == 0 && P.second == 0;
23017 }))
23018 break;
23019 // Check if tried all attempts or no need for the last attempts at all.
23020 if (Repeat >= MaxAttempts ||
23021 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23022 break;
23023 constexpr unsigned StoresLimit = 64;
23024 const unsigned MaxTotalNum = std::min<unsigned>(
23025 Operands.size(),
23026 static_cast<unsigned>(
23027 End -
23028 std::distance(
23029 RangeSizes.begin(),
23030 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23031 1));
23032 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23033 unsigned Limit =
23034 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23035 CandidateVFs.clear();
23036 if (bit_floor(Limit) == VF)
23037 CandidateVFs.push_back(Limit);
23038 if (VF > MaxTotalNum || VF >= StoresLimit)
23039 break;
23040 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23041 if (P.first != 0)
23042 P.first = std::max(P.second, P.first);
23043 }
23044 // Last attempt to vectorize max number of elements, if all previous
23045 // attempts were unsuccessful because of the cost issues.
23046 CandidateVFs.push_back(VF);
23047 }
23048 }
23049 };
23050
23051 /// Groups of stores to vectorize
23052 SmallVector<RelatedStoreInsts> SortedStores;
23053
23054 // Inserts the specified store SI with the given index Idx to the set of the
23055 // stores. If a store with the same distance is already present - stop
23056 // insertion and try to vectorize the stores collected so far. If some stores
23057 // from this sequence were not vectorized - try to vectorize them together with
23058 // the new store later. But this logic is applied only to the stores that come
23059 // before the previous store with the same distance.
23060 // Example:
23061 // 1. store x, %p
23062 // 2. store y, %p+1
23063 // 3. store z, %p+2
23064 // 4. store a, %p
23065 // 5. store b, %p+3
23066 // - Scan this from the last to first store. The very first bunch of stores is
23067 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23068 // vector).
23069 // - The next store in the list - #1 - has the same distance from store #5 as
23070 // the store #4.
23071 // - Try to vectorize sequence of stores 4,2,3,5.
23072 // - If all these stores are vectorized - just drop them.
23073 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23074 // - Start new stores sequence.
23075 // The new bunch of stores is {1, {1, 0}}.
23076 // - Add the stores from previous sequence, that were not vectorized.
23077 // Here we consider the stores in reverse order relative to how they appear in
23078 // the IR (Stores is already reversed, see the vectorizeStoreChains() function).
23079 // Store #3 can be added -> comes after store #4 with the same distance as
23080 // store #1.
23081 // Store #5 cannot be added - comes before store #4.
23082 // This logic helps to improve compile time: we assume that the stores after
23083 // the previous store with the same distance most likely have memory
23084 // dependencies, so there is no need to waste compile time trying to vectorize them.
23085 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23086 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23087 std::optional<int64_t> PtrDist;
23088 auto *RelatedStores = find_if(
23089 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23090 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23091 return PtrDist.has_value();
23092 });
23093
23094 // We did not find a comparable store, start a new group.
23095 if (RelatedStores == SortedStores.end()) {
23096 SortedStores.emplace_back(Idx, Stores);
23097 return;
23098 }
23099
23100 // If there is already a store in the group with the same PtrDiff, try to
23101 // vectorize the existing instructions before adding the current store.
23102 // Otherwise, insert this store and keep collecting.
23103 if (std::optional<unsigned> PrevInst =
23104 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23105 TryToVectorize(RelatedStores->getStores());
23106 RelatedStores->clearVectorizedStores(VectorizedStores);
23107 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23108 /*NewBaseInstIdx=*/Idx,
23109 /*DistFromCurBase=*/*PtrDist);
23110 }
23111 };
23112 Type *PrevValTy = nullptr;
23113 for (auto [I, SI] : enumerate(Stores)) {
23114 if (R.isDeleted(SI))
23115 continue;
23116 if (!PrevValTy)
23117 PrevValTy = SI->getValueOperand()->getType();
23118 // Check that we do not try to vectorize stores of different types.
23119 if (PrevValTy != SI->getValueOperand()->getType()) {
23120 for (RelatedStoreInsts &StoreSeq : SortedStores)
23121 TryToVectorize(StoreSeq.getStores());
23122 SortedStores.clear();
23123 PrevValTy = SI->getValueOperand()->getType();
23124 }
23125 FillStoresSet(I, SI);
23126 }
23127
23128 // Final vectorization attempt.
23129 for (RelatedStoreInsts &StoreSeq : SortedStores)
23130 TryToVectorize(StoreSeq.getStores());
23131
23132 return Changed;
23133}
23134
23135void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23136 // Initialize the collections. We will make a single pass over the block.
23137 Stores.clear();
23138 GEPs.clear();
23139
23140 // Visit the store and getelementptr instructions in BB and organize them in
23141 // Stores and GEPs according to the underlying objects of their pointer
23142 // operands.
23143 for (Instruction &I : *BB) {
23144 // Ignore store instructions that are volatile or have a pointer operand
23145 // that doesn't point to a scalar type.
23146 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23147 if (!SI->isSimple())
23148 continue;
23149 if (!isValidElementType(SI->getValueOperand()->getType()))
23150 continue;
23151 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23152 }
23153
23154 // Ignore getelementptr instructions that have more than one index, a
23155 // constant index, or a pointer operand that doesn't point to a scalar
23156 // type.
23157 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23158 if (GEP->getNumIndices() != 1)
23159 continue;
23160 Value *Idx = GEP->idx_begin()->get();
23161 if (isa<Constant>(Idx))
23162 continue;
23163 if (!isValidElementType(Idx->getType()))
23164 continue;
23165 if (GEP->getType()->isVectorTy())
23166 continue;
23167 GEPs[GEP->getPointerOperand()].push_back(GEP);
23168 }
23169 }
23170}
23171
23172bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23173 bool MaxVFOnly) {
23174 if (VL.size() < 2)
23175 return false;
23176
23177 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23178 << VL.size() << ".\n");
23179
23180 // Check that all of the parts are instructions of the same type,
23181 // we permit an alternate opcode via InstructionsState.
23182 InstructionsState S = getSameOpcode(VL, *TLI);
23183 if (!S)
23184 return false;
23185
23186 Instruction *I0 = S.getMainOp();
23187 // Make sure invalid types (including vector type) are rejected before
23188 // determining vectorization factor for scalar instructions.
23189 for (Value *V : VL) {
23190 Type *Ty = V->getType();
23191 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
23192 // NOTE: the following will give user internal llvm type name, which may
23193 // not be useful.
23194 R.getORE()->emit([&]() {
23195 std::string TypeStr;
23196 raw_string_ostream OS(TypeStr);
23197 Ty->print(OS);
23198 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23199 << "Cannot SLP vectorize list: type "
23200 << TypeStr + " is unsupported by vectorizer";
23201 });
23202 return false;
23203 }
23204 }
23205
23206 Type *ScalarTy = getValueType(VL[0]);
23207 unsigned Sz = R.getVectorElementSize(I0);
23208 unsigned MinVF = R.getMinVF(Sz);
23209 unsigned MaxVF = std::max<unsigned>(
23210 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23211 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23212 if (MaxVF < 2) {
23213 R.getORE()->emit([&]() {
23214 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23215 << "Cannot SLP vectorize list: vectorization factor "
23216 << "less than 2 is not supported";
23217 });
23218 return false;
23219 }
23220
23221 bool Changed = false;
23222 bool CandidateFound = false;
23223 InstructionCost MinCost = SLPCostThreshold.getValue();
23224
23225 unsigned NextInst = 0, MaxInst = VL.size();
23226 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23227 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23228 // No actual vectorization should happen, if number of parts is the same as
23229 // provided vectorization factor (i.e. the scalar type is used for vector
23230 // code during codegen).
23231 auto *VecTy = getWidenedType(ScalarTy, VF);
23232 if (TTI->getNumberOfParts(VecTy) == VF)
23233 continue;
23234 for (unsigned I = NextInst; I < MaxInst; ++I) {
23235 unsigned ActualVF = std::min(MaxInst - I, VF);
23236
23237 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23238 continue;
23239
23240 if (MaxVFOnly && ActualVF < MaxVF)
23241 break;
23242 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23243 break;
23244
23245 SmallVector<Value *> Ops(ActualVF, nullptr);
23246 unsigned Idx = 0;
23247 for (Value *V : VL.drop_front(I)) {
23248 // Check that a previous iteration of this loop did not delete the
23249 // Value.
23250 if (auto *Inst = dyn_cast<Instruction>(V);
23251 !Inst || !R.isDeleted(Inst)) {
23252 Ops[Idx] = V;
23253 ++Idx;
23254 if (Idx == ActualVF)
23255 break;
23256 }
23257 }
23258 // Not enough vectorizable instructions - exit.
23259 if (Idx != ActualVF)
23260 break;
23261
23262 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23263 << "\n");
23264
23265 R.buildTree(Ops);
23266 if (R.isTreeTinyAndNotFullyVectorizable())
23267 continue;
23268 if (R.isProfitableToReorder()) {
23269 R.reorderTopToBottom();
23270 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23271 }
23272 R.transformNodes();
23273 R.buildExternalUses();
23274
23275 R.computeMinimumValueSizes();
23276 InstructionCost Cost = R.getTreeCost();
23277 CandidateFound = true;
23278 MinCost = std::min(MinCost, Cost);
23279
23280 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23281 << " for VF=" << ActualVF << "\n");
23282 if (Cost < -SLPCostThreshold) {
23283 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23284 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23285 cast<Instruction>(Ops[0]))
23286 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23287 << " and with tree size "
23288 << ore::NV("TreeSize", R.getTreeSize()));
23289
23290 R.vectorizeTree();
23291 // Move to the next bundle.
23292 I += VF - 1;
23293 NextInst = I + 1;
23294 Changed = true;
23295 }
23296 }
23297 }
23298
23299 if (!Changed && CandidateFound) {
23300 R.getORE()->emit([&]() {
23301 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23302 << "List vectorization was possible but not beneficial with cost "
23303 << ore::NV("Cost", MinCost) << " >= "
23304 << ore::NV("Threshold", -SLPCostThreshold);
23305 });
23306 } else if (!Changed) {
23307 R.getORE()->emit([&]() {
23308 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23309 << "Cannot SLP vectorize list: vectorization was impossible"
23310 << " with available vectorization factors";
23311 });
23312 }
23313 return Changed;
23314}
23315
23316namespace {
23317
23318/// Model horizontal reductions.
23319///
23320/// A horizontal reduction is a tree of reduction instructions that has values
23321/// that can be put into a vector as its leaves. For example:
23322///
23323/// mul mul mul mul
23324/// \ / \ /
23325/// + +
23326/// \ /
23327/// +
23328/// This tree has "mul" as its leaf values and "+" as its reduction
23329/// instructions. A reduction can feed into a store or a binary operation
23330/// feeding a phi.
23331/// ...
23332/// \ /
23333/// +
23334/// |
23335/// phi +=
23336///
23337/// Or:
23338/// ...
23339/// \ /
23340/// +
23341/// |
23342/// *p =
23343///
23344class HorizontalReduction {
23345 using ReductionOpsType = SmallVector<Value *, 16>;
23346 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23347 ReductionOpsListType ReductionOps;
23348 /// List of possibly reduced values.
23349 SmallVector<SmallVector<Value *>> ReducedVals;
23350 /// Maps reduced value to the corresponding reduction operation.
23351 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
23352 WeakTrackingVH ReductionRoot;
23353 /// The type of reduction operation.
23354 RecurKind RdxKind;
23355 /// Checks if the optimization of original scalar identity operations on
23356 /// matched horizontal reductions is enabled and allowed.
23357 bool IsSupportedHorRdxIdentityOp = false;
23358 /// The minimum number of the reduced values.
23359 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23360 /// Contains vector values for reduction including their scale factor and
23361 /// signedness.
23362 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23363
23364 static bool isCmpSelMinMax(Instruction *I) {
23365 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23366 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
23367 }
23368
23369 // And/or are potentially poison-safe logical patterns like:
23370 // select x, y, false
23371 // select x, true, y
23372 static bool isBoolLogicOp(Instruction *I) {
23373 return isa<SelectInst>(I) &&
23374 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23375 }
23376
23377 /// Checks if instruction is associative and can be vectorized.
23378 static bool isVectorizable(RecurKind Kind, Instruction *I,
23379 bool TwoElementReduction = false) {
23380 if (Kind == RecurKind::None)
23381 return false;
23382
23383 // Integer ops that map to select instructions or intrinsics are fine.
23384 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23385 isBoolLogicOp(I))
23386 return true;
23387
23388 // No need to check for associativity if there are only 2 reduced values.
23389 if (TwoElementReduction)
23390 return true;
23391
23392 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23393 // FP min/max are associative except for NaN and -0.0. We do not
23394 // have to rule out -0.0 here because the intrinsic semantics do not
23395 // specify a fixed result for it.
23396 return I->getFastMathFlags().noNaNs();
23397 }
23398
23399 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23400 return true;
23401
23402 return I->isAssociative();
23403 }
23404
23405 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23406 // Poison-safe 'or' takes the form: select X, true, Y
23407 // To make that work with the normal operand processing, we skip the
23408 // true value operand.
23409 // TODO: Change the code and data structures to handle this without a hack.
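// E.g. for the poison-safe form %r = select i1 %x, i1 true, i1 %y, operand 1
// is the constant true, so the reduced operand we return instead is %y
// (operand 2).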
23410 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23411 return I->getOperand(2);
23412 return I->getOperand(Index);
23413 }
23414
23415 /// Creates reduction operation with the current opcode.
23416 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23417 Value *RHS, const Twine &Name, bool UseSelect) {
23418 Type *OpTy = LHS->getType();
23419 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23420 switch (Kind) {
23421 case RecurKind::Or: {
23422 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23423 return Builder.CreateSelect(
23424 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23425 RHS, Name);
23426 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23427 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23428 Name);
23429 }
23430 case RecurKind::And: {
23431 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23432 return Builder.CreateSelect(
23433 LHS, RHS,
23434 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
23435 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23436 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23437 Name);
23438 }
23439 case RecurKind::Add:
23440 case RecurKind::Mul:
23441 case RecurKind::Xor:
23442 case RecurKind::FAdd:
23443 case RecurKind::FMul: {
23444 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23445 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23446 Name);
23447 }
23448 case RecurKind::SMax:
23449 case RecurKind::SMin:
23450 case RecurKind::UMax:
23451 case RecurKind::UMin:
23452 if (UseSelect) {
23453 CmpInst::Predicate Pred = getMinMaxReductionPredicate(Kind);
23454 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23455 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
23456 }
23457 [[fallthrough]];
23458 case RecurKind::FMax:
23459 case RecurKind::FMin:
23460 case RecurKind::FMaximum:
23461 case RecurKind::FMinimum:
23462 case RecurKind::FMaximumNum:
23463 case RecurKind::FMinimumNum: {
23464 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(Kind);
23465 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23466 }
23467 default:
23468 llvm_unreachable("Unknown reduction operation.");
23469 }
23470 }
23471
23472 /// Creates reduction operation with the current opcode with the IR flags
23473 /// from \p ReductionOps, dropping nuw/nsw flags.
23474 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23475 Value *RHS, const Twine &Name,
23476 const ReductionOpsListType &ReductionOps) {
23477 bool UseSelect = ReductionOps.size() == 2 ||
23478 // Logical or/and.
23479 (ReductionOps.size() == 1 &&
23480 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23481 assert((!UseSelect || ReductionOps.size() != 2 ||
23482 isa<SelectInst>(ReductionOps[1][0])) &&
23483 "Expected cmp + select pairs for reduction");
23484 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23485 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23486 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23487 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23488 /*IncludeWrapFlags=*/false);
23489 propagateIRFlags(Op, ReductionOps[1], nullptr,
23490 /*IncludeWrapFlags=*/false);
23491 return Op;
23492 }
23493 }
23494 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23495 return Op;
23496 }
23497
23498public:
23499 static RecurKind getRdxKind(Value *V) {
23500 auto *I = dyn_cast<Instruction>(V);
23501 if (!I)
23502 return RecurKind::None;
23503 if (match(I, m_Add(m_Value(), m_Value())))
23504 return RecurKind::Add;
23505 if (match(I, m_Mul(m_Value(), m_Value())))
23506 return RecurKind::Mul;
23507 if (match(I, m_And(m_Value(), m_Value())) ||
23508 match(I, m_LogicalAnd(m_Value(), m_Value())))
23509 return RecurKind::And;
23510 if (match(I, m_Or(m_Value(), m_Value())) ||
23511 match(I, m_LogicalOr(m_Value(), m_Value())))
23512 return RecurKind::Or;
23513 if (match(I, m_Xor(m_Value(), m_Value())))
23514 return RecurKind::Xor;
23515 if (match(I, m_FAdd(m_Value(), m_Value())))
23516 return RecurKind::FAdd;
23517 if (match(I, m_FMul(m_Value(), m_Value())))
23518 return RecurKind::FMul;
23519
23520 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
23521 return RecurKind::FMax;
23522 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
23523 return RecurKind::FMin;
23524
23525 if (match(I, m_FMaximum(m_Value(), m_Value())))
23526 return RecurKind::FMaximum;
23527 if (match(I, m_FMinimum(m_Value(), m_Value())))
23528 return RecurKind::FMinimum;
23529 // This matches either cmp+select or intrinsics. SLP is expected to handle
23530 // either form.
23531 // TODO: If we are canonicalizing to intrinsics, we can remove several
23532 // special-case paths that deal with selects.
23533 if (match(I, m_SMax(m_Value(), m_Value())))
23534 return RecurKind::SMax;
23535 if (match(I, m_SMin(m_Value(), m_Value())))
23536 return RecurKind::SMin;
23537 if (match(I, m_UMax(m_Value(), m_Value())))
23538 return RecurKind::UMax;
23539 if (match(I, m_UMin(m_Value(), m_Value())))
23540 return RecurKind::UMin;
23541
23542 if (auto *Select = dyn_cast<SelectInst>(I)) {
23543 // Try harder: look for min/max pattern based on instructions producing
23544 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23545 // During the intermediate stages of SLP, it's very common to have
23546 // pattern like this (since optimizeGatherSequence is run only once
23547 // at the end):
23548 // %1 = extractelement <2 x i32> %a, i32 0
23549 // %2 = extractelement <2 x i32> %a, i32 1
23550 // %cond = icmp sgt i32 %1, %2
23551 // %3 = extractelement <2 x i32> %a, i32 0
23552 // %4 = extractelement <2 x i32> %a, i32 1
23553 // %select = select i1 %cond, i32 %3, i32 %4
23554 CmpPredicate Pred;
23555 Instruction *L1;
23556 Instruction *L2;
23557
23558 Value *LHS = Select->getTrueValue();
23559 Value *RHS = Select->getFalseValue();
23560 Value *Cond = Select->getCondition();
23561
23562 // TODO: Support inverse predicates.
23563 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23564 if (!isa<ExtractElementInst>(RHS) ||
23565 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23566 return RecurKind::None;
23567 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23568 if (!isa<ExtractElementInst>(LHS) ||
23569 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23570 return RecurKind::None;
23571 } else {
23572 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23573 return RecurKind::None;
23574 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23575 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23576 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23577 return RecurKind::None;
23578 }
23579
23580 switch (Pred) {
23581 default:
23582 return RecurKind::None;
23583 case CmpInst::ICMP_SGT:
23584 case CmpInst::ICMP_SGE:
23585 return RecurKind::SMax;
23586 case CmpInst::ICMP_SLT:
23587 case CmpInst::ICMP_SLE:
23588 return RecurKind::SMin;
23589 case CmpInst::ICMP_UGT:
23590 case CmpInst::ICMP_UGE:
23591 return RecurKind::UMax;
23592 case CmpInst::ICMP_ULT:
23593 case CmpInst::ICMP_ULE:
23594 return RecurKind::UMin;
23595 }
23596 }
23597 return RecurKind::None;
23598 }
23599
23600 /// Get the index of the first operand.
23601 static unsigned getFirstOperandIndex(Instruction *I) {
23602 return isCmpSelMinMax(I) ? 1 : 0;
23603 }
23604
23605private:
23606 /// Total number of operands in the reduction operation.
23607 static unsigned getNumberOfOperands(Instruction *I) {
23608 return isCmpSelMinMax(I) ? 3 : 2;
23609 }
23610
23611 /// Checks if the instruction is in basic block \p BB.
23612 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23613 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23614 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23615 auto *Sel = cast<SelectInst>(I);
23616 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23617 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23618 }
23619 return I->getParent() == BB;
23620 }
23621
23622 /// Expected number of uses for reduction operations/reduced values.
23623 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23624 if (IsCmpSelMinMax) {
23625 // SelectInst must be used twice while the condition op must have single
23626 // use only.
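// E.g. in a min/max reduction chain (illustrative):
//   %c = icmp sgt i32 %a, %b           ; single use, only by %m
//   %m = select i1 %c, i32 %a, i32 %b  ; two uses: the next cmp and select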
23627 if (auto *Sel = dyn_cast<SelectInst>(I))
23628 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23629 return I->hasNUses(2);
23630 }
23631
23632 // Arithmetic reduction operation must be used once only.
23633 return I->hasOneUse();
23634 }
23635
23636 /// Initializes the list of reduction operations.
23637 void initReductionOps(Instruction *I) {
23638 if (isCmpSelMinMax(I))
23639 ReductionOps.assign(2, ReductionOpsType());
23640 else
23641 ReductionOps.assign(1, ReductionOpsType());
23642 }
23643
23644 /// Add all reduction operations for the reduction instruction \p I.
23645 void addReductionOps(Instruction *I) {
23646 if (isCmpSelMinMax(I)) {
23647 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23648 ReductionOps[1].emplace_back(I);
23649 } else {
23650 ReductionOps[0].emplace_back(I);
23651 }
23652 }
23653
23654 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23655 int Sz = Data.size();
23656 auto *I = dyn_cast<Instruction>(Data.front());
23657 return Sz > 1 || isConstant(Data.front()) ||
23658 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23659 }
23660
23661public:
23662 HorizontalReduction() = default;
23663 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23664 : ReductionRoot(I), ReductionLimit(2) {
23665 RdxKind = HorizontalReduction::getRdxKind(I);
23666 ReductionOps.emplace_back().push_back(I);
23667 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23668 for (Value *V : Ops)
23669 ReducedValsToOps[V].push_back(I);
23670 }
23671
23672 bool matchReductionForOperands() const {
23673 // Analyze "regular" integer/FP types for reductions - no target-specific
23674 // types or pointers.
23675 assert(ReductionRoot && "Reduction root is not set!");
23676 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23677 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
23678 return Ops.size() == 2;
23679 })))
23680 return false;
23681
23682 return true;
23683 }
23684
23685 /// Try to find a reduction tree.
23686 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
23687 ScalarEvolution &SE, const DataLayout &DL,
23688 const TargetLibraryInfo &TLI) {
23689 RdxKind = HorizontalReduction::getRdxKind(Root);
23690 if (!isVectorizable(RdxKind, Root))
23691 return false;
23692
23693 // Analyze "regular" integer/FP types for reductions - no target-specific
23694 // types or pointers.
23695 Type *Ty = Root->getType();
23696 if (!isValidElementType(Ty) || Ty->isPointerTy())
23697 return false;
23698
23699 // Though the ultimate reduction may have multiple uses, its condition must
23700 // have only single use.
23701 if (auto *Sel = dyn_cast<SelectInst>(Root))
23702 if (!Sel->getCondition()->hasOneUse())
23703 return false;
23704
23705 ReductionRoot = Root;
23706
23707 // Iterate through all the operands of the possible reduction tree and
23708 // gather all the reduced values, sorting them by their value id.
23709 BasicBlock *BB = Root->getParent();
23710 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
23711 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
23712 1, std::make_pair(Root, 0));
23713 // Checks if the operands of the \p TreeN instruction are also reduction
23714 // operations or should be treated as reduced values or an extra argument,
23715 // which is not part of the reduction.
23716 auto CheckOperands = [&](Instruction *TreeN,
23717 SmallVectorImpl<Value *> &PossibleReducedVals,
23718 SmallVectorImpl<Instruction *> &ReductionOps,
23719 unsigned Level) {
23720 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
23721 getNumberOfOperands(TreeN)))) {
23722 Value *EdgeVal = getRdxOperand(TreeN, I);
23723 ReducedValsToOps[EdgeVal].push_back(TreeN);
23724 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
23725 // If the edge is not an instruction, or it is different from the main
23726 // reduction opcode or has too many uses - possible reduced value.
23727 // Also, do not try to reduce const values, if the operation is not
23728 // foldable.
23729 if (!EdgeInst || Level > RecursionMaxDepth ||
23730 getRdxKind(EdgeInst) != RdxKind ||
23731 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
23732 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
23733 !isVectorizable(RdxKind, EdgeInst) ||
23734 (R.isAnalyzedReductionRoot(EdgeInst) &&
23735 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
23736 PossibleReducedVals.push_back(EdgeVal);
23737 continue;
23738 }
23739 ReductionOps.push_back(EdgeInst);
23740 }
23741 };
23742 // Try to regroup reduced values so that it gets more profitable to try to
23743 // reduce them. Values are grouped by their value ids, instructions - by
23744 // instruction op id and/or alternate op id, plus do extra analysis for
23745 // loads (grouping them by the distance between pointers) and cmp
23746 // instructions (grouping them by the predicate).
23747 SmallMapVector<
23748 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
23749 8>
23750 PossibleReducedVals;
23751 initReductionOps(Root);
23752 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
23753 SmallSet<size_t, 2> LoadKeyUsed;
23754
23755 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
23756 Key = hash_combine(hash_value(LI->getParent()), Key);
23757 Value *Ptr =
23758 getUnderlyingObject(LI->getPointerOperand());
23759 if (!LoadKeyUsed.insert(Key).second) {
23760 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
23761 if (LIt != LoadsMap.end()) {
23762 for (LoadInst *RLI : LIt->second) {
23763 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
23764 LI->getType(), LI->getPointerOperand(), DL, SE,
23765 /*StrictCheck=*/true))
23766 return hash_value(RLI->getPointerOperand());
23767 }
23768 for (LoadInst *RLI : LIt->second) {
23769 if (arePointersCompatible(RLI->getPointerOperand(),
23770 LI->getPointerOperand(), TLI)) {
23771 hash_code SubKey = hash_value(RLI->getPointerOperand());
23772 return SubKey;
23773 }
23774 }
23775 if (LIt->second.size() > 2) {
23776 hash_code SubKey =
23777 hash_value(LIt->second.back()->getPointerOperand());
23778 return SubKey;
23779 }
23780 }
23781 }
23782 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
23783 .first->second.push_back(LI);
23784 return hash_value(LI->getPointerOperand());
23785 };
23786
23787 while (!Worklist.empty()) {
23788 auto [TreeN, Level] = Worklist.pop_back_val();
23789 SmallVector<Value *> PossibleRedVals;
23790 SmallVector<Instruction *> PossibleReductionOps;
23791 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
23792 addReductionOps(TreeN);
23793 // Add reduction values. The values are sorted for better vectorization
23794 // results.
23795 for (Value *V : PossibleRedVals) {
23796 size_t Key, Idx;
23797 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
23798 /*AllowAlternate=*/false);
23799 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
23800 }
23801 for (Instruction *I : reverse(PossibleReductionOps))
23802 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
23803 }
23804 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
23805 // Sort values by the total number of value kinds so that the reduction starts
23806 // from the longest possible sequences of reduced values.
23807 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
23808 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
23809 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
23810 for (auto &Slice : PossibleRedVals) {
23811 PossibleRedValsVect.emplace_back();
23812 auto RedValsVect = Slice.second.takeVector();
23813 stable_sort(RedValsVect, llvm::less_second());
23814 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
23815 PossibleRedValsVect.back().append(Data.second, Data.first);
23816 }
23817 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
23818 return P1.size() > P2.size();
23819 });
23820 int NewIdx = -1;
23821 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
23822 if (NewIdx < 0 ||
23823 (!isGoodForReduction(Data) &&
23824 (!isa<LoadInst>(Data.front()) ||
23825 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
23826 getUnderlyingObject(
23827 cast<LoadInst>(Data.front())->getPointerOperand()) !=
23828 getUnderlyingObject(
23829 cast<LoadInst>(ReducedVals[NewIdx].front())
23830 ->getPointerOperand())))) {
23831 NewIdx = ReducedVals.size();
23832 ReducedVals.emplace_back();
23833 }
23834 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
23835 }
23836 }
23837 // Sort the reduced values by number of same/alternate opcode and/or pointer
23838 // operand.
23839 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
23840 return P1.size() > P2.size();
23841 });
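// E.g. (illustrative): if regrouping produced {4 loads from one base},
// {2 adds} and {1 other value}, the group of four is placed first, so the
// widest reduction attempt is tried before the smaller ones.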
23842 return true;
23843 }
23844
23845 /// Attempt to vectorize the tree found by matchAssociativeReduction.
23846 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
23847 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
23848 constexpr unsigned RegMaxNumber = 4;
23849 constexpr unsigned RedValsMaxNumber = 128;
23850 // If there are a sufficient number of reduction values, reduce
23851 // to a nearby power-of-2. We can safely generate oversized
23852 // vectors and rely on the backend to split them to legal sizes.
23853 if (unsigned NumReducedVals = std::accumulate(
23854 ReducedVals.begin(), ReducedVals.end(), 0,
23855 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
23856 if (!isGoodForReduction(Vals))
23857 return Num;
23858 return Num + Vals.size();
23859 });
23860 NumReducedVals < ReductionLimit &&
23861 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
23862 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
23863 })) {
23864 for (ReductionOpsType &RdxOps : ReductionOps)
23865 for (Value *RdxOp : RdxOps)
23866 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
23867 return nullptr;
23868 }
23869
23870 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
23871 TargetFolder(DL));
23872 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
23873
23874 // Track the reduced values in case they are replaced by extractelement
23875 // instructions because of the vectorization.
23876 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
23877 ReducedVals.front().size());
23878
23879 // The compare instruction of a min/max is the insertion point for new
23880 // instructions and may be replaced with a new compare instruction.
23881 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
23882 assert(isa<SelectInst>(RdxRootInst) &&
23883 "Expected min/max reduction to have select root instruction");
23884 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
23885 assert(isa<Instruction>(ScalarCond) &&
23886 "Expected min/max reduction to have compare condition");
23887 return cast<Instruction>(ScalarCond);
23888 };
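// E.g. (illustrative min/max idiom):
//   %c = icmp slt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b   ; reduction root
// The returned insertion point is %c, not %m.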
23889
23890 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
23891 return isBoolLogicOp(cast<Instruction>(V));
23892 });
23893 // Return new VectorizedTree, based on previous value.
23894 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
23895 if (VectorizedTree) {
23896 // Update the final value in the reduction.
23897 Builder.SetCurrentDebugLocation(
23898 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
23899 if (AnyBoolLogicOp) {
23900 auto It = ReducedValsToOps.find(VectorizedTree);
23901 auto It1 = ReducedValsToOps.find(Res);
23902 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
23903 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
23904 (It != ReducedValsToOps.end() &&
23905 any_of(It->getSecond(), [&](Instruction *I) {
23906 return isBoolLogicOp(I) &&
23907 getRdxOperand(I, 0) == VectorizedTree;
23908 }))) {
23909 ;
23910 } else if (isGuaranteedNotToBePoison(Res, AC) ||
23911 (It1 != ReducedValsToOps.end() &&
23912 any_of(It1->getSecond(), [&](Instruction *I) {
23913 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
23914 }))) {
23915 std::swap(VectorizedTree, Res);
23916 } else {
23917 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
23918 }
23919 }
23920
23921 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
23922 ReductionOps);
23923 }
23924 // Initialize the final value in the reduction.
23925 return Res;
23926 };
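// Rough intuition (illustrative): when chaining i1 values of a logical
// and/or reduction, an operand is reused directly only if it is known not to
// be poison or already appears as the first operand of one of the original
// boolean ops; otherwise the operands are swapped or the LHS is frozen so
// poison cannot leak through the new, non-short-circuiting op.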
23927 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
23928 ReductionOps.front().size());
23929 for (ReductionOpsType &RdxOps : ReductionOps)
23930 for (Value *RdxOp : RdxOps) {
23931 if (!RdxOp)
23932 continue;
23933 IgnoreList.insert(RdxOp);
23934 }
23935 // Intersect the fast-math-flags from all reduction operations.
23936 FastMathFlags RdxFMF;
23937 RdxFMF.set();
23938 for (Value *U : IgnoreList)
23939 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
23940 RdxFMF &= FPMO->getFastMathFlags();
23941 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
23942
23943 // Need to track the reduced values, as they may be changed during the
23944 // vectorization of subvectors.
23945 for (ArrayRef<Value *> Candidates : ReducedVals)
23946 for (Value *V : Candidates)
23947 TrackedVals.try_emplace(V, V);
23948
23949 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
23950 Value *V) -> unsigned & {
23951 auto *It = MV.find(V);
23952 assert(It != MV.end() && "Unable to find given key.");
23953 return It->second;
23954 };
23955
23956 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
23957 // List of the values that were reduced in other trees as part of gather
23958 // nodes and thus require an extract if fully vectorized there.
23959 SmallPtrSet<Value *, 4> RequiredExtract;
23960 WeakTrackingVH VectorizedTree = nullptr;
23961 bool CheckForReusedReductionOps = false;
23962 // Try to vectorize elements based on their type.
23963 SmallVector<InstructionsState> States;
23964 for (ArrayRef<Value *> RV : ReducedVals)
23965 States.push_back(getSameOpcode(RV, TLI));
23966 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
23967 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
23968 InstructionsState S = States[I];
23969 SmallVector<Value *> Candidates;
23970 Candidates.reserve(2 * OrigReducedVals.size());
23971 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
23972 for (Value *ReducedVal : OrigReducedVals) {
23973 Value *RdxVal = TrackedVals.at(ReducedVal);
23974 // Check if the reduction value was overridden by an extractelement
23975 // instruction because of the vectorization, and exclude it if it is not
23976 // compatible with the other values.
23977 // Also check if the instruction was folded to a constant/other value.
23978 auto *Inst = dyn_cast<Instruction>(RdxVal);
23979 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
23980 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
23981 (S && !Inst))
23982 continue;
23983 Candidates.push_back(RdxVal);
23984 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
23985 }
23986 bool ShuffledExtracts = false;
23987 // Try to handle shuffled extractelements.
23988 if (S && S.getOpcode() == Instruction::ExtractElement &&
23989 !S.isAltShuffle() && I + 1 < E) {
23990 SmallVector<Value *> CommonCandidates(Candidates);
23991 for (Value *RV : ReducedVals[I + 1]) {
23992 Value *RdxVal = TrackedVals.at(RV);
23993 // Check if the reduction value was overridden by the
23994 // extractelement instruction because of the vectorization, and
23995 // exclude it if it is not compatible with other values.
23996 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
23997 if (!Inst)
23998 continue;
23999 CommonCandidates.push_back(RdxVal);
24000 TrackedToOrig.try_emplace(RdxVal, RV);
24001 }
24002 SmallVector<int> Mask;
24003 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24004 ++I;
24005 Candidates.swap(CommonCandidates);
24006 ShuffledExtracts = true;
24007 }
24008 }
24009
24010 // Emit code for constant values.
24011 if (Candidates.size() > 1 && allConstant(Candidates)) {
24012 Value *Res = Candidates.front();
24013 Value *OrigV = TrackedToOrig.at(Candidates.front());
24014 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24015 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24016 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24017 Value *OrigV = TrackedToOrig.at(VC);
24018 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24019 if (auto *ResI = dyn_cast<Instruction>(Res))
24020 V.analyzedReductionRoot(ResI);
24021 }
24022 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24023 continue;
24024 }
24025
24026 unsigned NumReducedVals = Candidates.size();
24027 if (NumReducedVals < ReductionLimit &&
24028 (NumReducedVals < 2 || !isSplat(Candidates)))
24029 continue;
24030
24031 // Check if we support repeated scalar values processing (optimization of
24032 // original scalar identity operations on matched horizontal reductions).
24033 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24034 RdxKind != RecurKind::FMul &&
24035 RdxKind != RecurKind::FMulAdd;
24036 // Gather same values.
24037 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24038 if (IsSupportedHorRdxIdentityOp)
24039 for (Value *V : Candidates) {
24040 Value *OrigV = TrackedToOrig.at(V);
24041 ++SameValuesCounter.try_emplace(OrigV).first->second;
24042 }
24043 // Used to check if the reduced values are used the same number of times. In
24044 // this case the compiler may produce better code. E.g. if reduced values are
24045 // aabbccdd (8 x values), then the first node of the tree will have a node
24046 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24047 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24048 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
24049 // x abcd) * 2.
24050 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24051 // this analysis, other operations may require an extra estimation of
24052 // the profitability.
24053 bool SameScaleFactor = false;
24054 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24055 SameValuesCounter.size() != Candidates.size();
24056 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24057 if (OptReusedScalars) {
24058 SameScaleFactor =
24059 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24060 RdxKind == RecurKind::Xor) &&
24061 all_of(drop_begin(SameValuesCounter),
24062 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24063 return P.second == SameValuesCounter.front().second;
24064 });
24065 Candidates.resize(SameValuesCounter.size());
24066 transform(SameValuesCounter, Candidates.begin(),
24067 [&](const auto &P) { return TrackedVals.at(P.first); });
24068 NumReducedVals = Candidates.size();
24069 // Have a reduction of the same element.
24070 if (NumReducedVals == 1) {
24071 Value *OrigV = TrackedToOrig.at(Candidates.front());
24072 unsigned Cnt = At(SameValuesCounter, OrigV);
24073 Value *RedVal =
24074 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24075 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24076 VectorizedVals.try_emplace(OrigV, Cnt);
24077 ExternallyUsedValues.insert(OrigV);
24078 continue;
24079 }
24080 }
24081
24082 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24083 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24084 const unsigned MaxElts = std::clamp<unsigned>(
24085 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24086 RegMaxNumber * RedValsMaxNumber);
24087
24088 unsigned ReduxWidth = NumReducedVals;
24089 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24090 unsigned NumParts, NumRegs;
24091 Type *ScalarTy = Candidates.front()->getType();
24092 ReduxWidth =
24093 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24094 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24095 NumParts = ::getNumberOfParts(TTI, Tp);
24096 NumRegs =
24097 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24098 while (NumParts > NumRegs) {
24099 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24100 ReduxWidth = bit_floor(ReduxWidth - 1);
24101 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24102 NumParts = ::getNumberOfParts(TTI, Tp);
24103 NumRegs =
24104 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
24105 }
24106 if (NumParts > NumRegs / 2)
24107 ReduxWidth = bit_floor(ReduxWidth);
24108 return ReduxWidth;
24109 };
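// Rough illustration of the logic above: the width is first floored to a
// whole number of full vectors for the element type; while the widened type
// would still be split into more parts than the target has vector registers,
// the width keeps shrinking, and a final adjustment rounds it down to a
// power of two when register pressure is high.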
24110 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24111 ReduxWidth = GetVectorFactor(ReduxWidth);
24112 ReduxWidth = std::min(ReduxWidth, MaxElts);
24113
24114 unsigned Start = 0;
24115 unsigned Pos = Start;
24116 // Restarts vectorization attempt with lower vector factor.
24117 unsigned PrevReduxWidth = ReduxWidth;
24118 bool CheckForReusedReductionOpsLocal = false;
24119 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24120 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24121 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24122 // Check if any of the reduction ops are gathered. If so, it is worth
24123 // trying again with a smaller number of reduction ops.
24124 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24125 }
24126 ++Pos;
24127 if (Pos < NumReducedVals - ReduxWidth + 1)
24128 return IsAnyRedOpGathered;
24129 Pos = Start;
24130 --ReduxWidth;
24131 if (ReduxWidth > 1)
24132 ReduxWidth = GetVectorFactor(ReduxWidth);
24133 return IsAnyRedOpGathered;
24134 };
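// Rough picture (illustrative): with 10 candidates and ReduxWidth 8, window
// positions 0..2 are tried first; once they are exhausted, Pos resets to
// Start and ReduxWidth is lowered to the next full-vector factor, so the
// scan restarts with a narrower reduction.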
24135 bool AnyVectorized = false;
24136 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24137 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24138 ReduxWidth >= ReductionLimit) {
24139 // Dependency in tree of the reduction ops - drop this attempt, try
24140 // later.
24141 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24142 Start == 0) {
24143 CheckForReusedReductionOps = true;
24144 break;
24145 }
24146 PrevReduxWidth = ReduxWidth;
24147 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24148 // Been analyzed already - skip.
24149 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24150 (!has_single_bit(ReduxWidth) &&
24151 (IgnoredCandidates.contains(
24152 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24153 IgnoredCandidates.contains(
24154 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24155 bit_floor(ReduxWidth))))) ||
24156 V.areAnalyzedReductionVals(VL)) {
24157 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24158 continue;
24159 }
24160 // Early exit if any of the reduction values were deleted during
24161 // previous vectorization attempts.
24162 if (any_of(VL, [&V](Value *RedVal) {
24163 auto *RedValI = dyn_cast<Instruction>(RedVal);
24164 if (!RedValI)
24165 return false;
24166 return V.isDeleted(RedValI);
24167 }))
24168 break;
24169 V.buildTree(VL, IgnoreList);
24170 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24171 if (!AdjustReducedVals())
24172 V.analyzedReductionVals(VL);
24173 continue;
24174 }
24175 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24176 if (!AdjustReducedVals())
24177 V.analyzedReductionVals(VL);
24178 continue;
24179 }
24180 V.reorderTopToBottom();
24181 // No need to reorder the root node at all for reassociative reduction.
24182 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24183 VL.front()->getType()->isIntOrIntVectorTy() ||
24184 ReductionLimit > 2);
24185 // Keep extracted other reduction values, if they are used in the
24186 // vectorization trees.
24187 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24188 ExternallyUsedValues);
24189 // The reduction root is used as the insertion point for new
24190 // instructions, so set it as externally used to prevent it from being
24191 // deleted.
24192 LocalExternallyUsedValues.insert(ReductionRoot);
24193 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24194 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24195 continue;
24196 for (Value *V : ReducedVals[Cnt])
24197 if (isa<Instruction>(V))
24198 LocalExternallyUsedValues.insert(TrackedVals[V]);
24199 }
24200 if (!IsSupportedHorRdxIdentityOp) {
24201 // Number of uses of the candidates in the vector of values.
24202 assert(SameValuesCounter.empty() &&
24203 "Reused values counter map is not empty");
24204 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24205 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24206 continue;
24207 Value *V = Candidates[Cnt];
24208 Value *OrigV = TrackedToOrig.at(V);
24209 ++SameValuesCounter.try_emplace(OrigV).first->second;
24210 }
24211 }
24212 V.transformNodes();
24213 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
24214 // Gather externally used values.
24215 SmallPtrSet<Value *, 4> Visited;
24216 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24217 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24218 continue;
24219 Value *RdxVal = Candidates[Cnt];
24220 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24221 RdxVal = It->second;
24222 if (!Visited.insert(RdxVal).second)
24223 continue;
24224 // Check if the scalar was vectorized as part of the vectorization
24225 // tree but not the top node.
24226 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24227 LocalExternallyUsedValues.insert(RdxVal);
24228 continue;
24229 }
24230 Value *OrigV = TrackedToOrig.at(RdxVal);
24231 unsigned NumOps =
24232 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24233 if (NumOps != ReducedValsToOps.at(OrigV).size())
24234 LocalExternallyUsedValues.insert(RdxVal);
24235 }
24236 // Do not need the list of reused scalars in regular mode anymore.
24237 if (!IsSupportedHorRdxIdentityOp)
24238 SameValuesCounter.clear();
24239 for (Value *RdxVal : VL)
24240 if (RequiredExtract.contains(RdxVal))
24241 LocalExternallyUsedValues.insert(RdxVal);
24242 V.buildExternalUses(LocalExternallyUsedValues);
24243
24244 V.computeMinimumValueSizes();
24245
24246 // Estimate cost.
24247 InstructionCost ReductionCost =
24248 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
24249 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24250 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24251 << " for reduction\n");
24252 if (!Cost.isValid())
24253 break;
24254 if (Cost >= -SLPCostThreshold) {
24255 V.getORE()->emit([&]() {
24256 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24257 ReducedValsToOps.at(VL[0]).front())
24258 << "Vectorizing horizontal reduction is possible "
24259 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24260 << " and threshold "
24261 << ore::NV("Threshold", -SLPCostThreshold);
24262 });
24263 if (!AdjustReducedVals()) {
24264 V.analyzedReductionVals(VL);
24265 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24266 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24267 // Add subvectors of VL to the list of the analyzed values.
24268 for (unsigned VF = getFloorFullVectorNumberOfElements(
24269 *TTI, VL.front()->getType(), ReduxWidth - 1);
24270 VF >= ReductionLimit;
24271 VF = getFloorFullVectorNumberOfElements(
24272 *TTI, VL.front()->getType(), VF - 1)) {
24273 if (has_single_bit(VF) &&
24274 V.getCanonicalGraphSize() != V.getTreeSize())
24275 continue;
24276 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24277 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24278 }
24279 }
24280 }
24281 continue;
24282 }
24283
24284 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24285 << Cost << ". (HorRdx)\n");
24286 V.getORE()->emit([&]() {
24287 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24288 ReducedValsToOps.at(VL[0]).front())
24289 << "Vectorized horizontal reduction with cost "
24290 << ore::NV("Cost", Cost) << " and with tree size "
24291 << ore::NV("TreeSize", V.getTreeSize());
24292 });
24293
24294 Builder.setFastMathFlags(RdxFMF);
24295
24296 // Emit a reduction. If the root is a select (min/max idiom), the insert
24297 // point is the compare condition of that select.
24298 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24299 Instruction *InsertPt = RdxRootInst;
24300 if (IsCmpSelMinMax)
24301 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24302
24303 // Vectorize a tree.
24304 Value *VectorizedRoot = V.vectorizeTree(
24305 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24306 // Update TrackedToOrig mapping, since the tracked values might be
24307 // updated.
24308 for (Value *RdxVal : Candidates) {
24309 Value *OrigVal = TrackedToOrig.at(RdxVal);
24310 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24311 if (TransformedRdxVal != RdxVal)
24312 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24313 }
24314
24315 Builder.SetInsertPoint(InsertPt);
24316
24317 // To prevent poison from leaking across what used to be sequential,
24318 // safe, scalar boolean logic operations, the reduction operand must be
24319 // frozen.
24320 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24321 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24322
24323 // Emit code to correctly handle reused reduced values, if required.
24324 if (OptReusedScalars && !SameScaleFactor) {
24325 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24326 SameValuesCounter, TrackedToOrig);
24327 }
24328
24329 Type *ScalarTy = VL.front()->getType();
24330 Type *VecTy = VectorizedRoot->getType();
24331 Type *RedScalarTy = VecTy->getScalarType();
24332 VectorValuesAndScales.emplace_back(
24333 VectorizedRoot,
24334 OptReusedScalars && SameScaleFactor
24335 ? SameValuesCounter.front().second
24336 : 1,
24337 RedScalarTy != ScalarTy->getScalarType()
24338 ? V.isSignedMinBitwidthRootNode()
24339 : true);
24340
24341 // Count vectorized reduced values to exclude them from final reduction.
24342 for (Value *RdxVal : VL) {
24343 Value *OrigV = TrackedToOrig.at(RdxVal);
24344 if (IsSupportedHorRdxIdentityOp) {
24345 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24346 continue;
24347 }
24348 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24349 if (!V.isVectorized(RdxVal))
24350 RequiredExtract.insert(RdxVal);
24351 }
24352 Pos += ReduxWidth;
24353 Start = Pos;
24354 ReduxWidth = NumReducedVals - Pos;
24355 if (ReduxWidth > 1)
24356 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24357 AnyVectorized = true;
24358 }
24359 if (OptReusedScalars && !AnyVectorized) {
24360 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24361 Value *RdxVal = TrackedVals.at(P.first);
24362 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24363 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24364 VectorizedVals.try_emplace(P.first, P.second);
24365 }
24366 continue;
24367 }
24368 }
24369 if (!VectorValuesAndScales.empty())
24370 VectorizedTree = GetNewVectorizedTree(
24371 VectorizedTree,
24372 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24373 if (VectorizedTree) {
24374 // Reorder operands of bool logical op in the natural order to avoid
24375 // possible problem with poison propagation. If not possible to reorder
24376 // (both operands are originally RHS), emit an extra freeze instruction
24377 // for the LHS operand.
24378 // I.e., if we have original code like this:
24379 // RedOp1 = select i1 ?, i1 LHS, i1 false
24380 // RedOp2 = select i1 RHS, i1 ?, i1 false
24381
24382 // Then, we swap LHS/RHS to create a new op that matches the poison
24383 // semantics of the original code.
24384
24385 // If we have original code like this and both values could be poison:
24386 // RedOp1 = select i1 ?, i1 LHS, i1 false
24387 // RedOp2 = select i1 ?, i1 RHS, i1 false
24388
24389 // Then, we must freeze LHS in the new op.
24390 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
24391 Instruction *RedOp1,
24392 Instruction *RedOp2,
24393 bool InitStep) {
24394 if (!AnyBoolLogicOp)
24395 return;
24396 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24397 getRdxOperand(RedOp1, 0) == LHS ||
24398 isGuaranteedNotToBePoison(LHS, AC)))
24399 return;
24400 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24401 getRdxOperand(RedOp2, 0) == RHS ||
24402 isGuaranteedNotToBePoison(RHS, AC))) {
24403 std::swap(LHS, RHS);
24404 return;
24405 }
24406 if (LHS != VectorizedTree)
24407 LHS = Builder.CreateFreeze(LHS);
24408 };
24409 // Finish the reduction.
24410 // Need to add extra arguments and not vectorized possible reduction
24411 // values.
24412 // Try to avoid dependencies between the scalar remainders after
24413 // reductions.
24414 auto FinalGen =
24415 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24416 bool InitStep) {
24417 unsigned Sz = InstVals.size();
24418 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
24419 Sz % 2);
24420 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24421 Instruction *RedOp = InstVals[I + 1].first;
24422 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24423 Value *RdxVal1 = InstVals[I].second;
24424 Value *StableRdxVal1 = RdxVal1;
24425 auto It1 = TrackedVals.find(RdxVal1);
24426 if (It1 != TrackedVals.end())
24427 StableRdxVal1 = It1->second;
24428 Value *RdxVal2 = InstVals[I + 1].second;
24429 Value *StableRdxVal2 = RdxVal2;
24430 auto It2 = TrackedVals.find(RdxVal2);
24431 if (It2 != TrackedVals.end())
24432 StableRdxVal2 = It2->second;
24433 // To prevent poison from leaking across what used to be
24434 // sequential, safe, scalar boolean logic operations, the
24435 // reduction operand must be frozen.
24436 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24437 RedOp, InitStep);
24438 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24439 StableRdxVal2, "op.rdx", ReductionOps);
24440 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24441 }
24442 if (Sz % 2 == 1)
24443 ExtraReds[Sz / 2] = InstVals.back();
24444 return ExtraReds;
24445 };
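// Illustrative run (assumed remainders r0..r4): one FinalGen call yields
// [r0+r1, r2+r3, r4]; the loop below calls it again to get
// [(r0+r1)+(r2+r3), r4] and finally a single value, so the scalar remainder
// tree has logarithmic depth instead of one long dependency chain.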
24446 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24447 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24448 VectorizedTree);
24449 SmallPtrSet<Value *, 8> Visited;
24450 for (ArrayRef<Value *> Candidates : ReducedVals) {
24451 for (Value *RdxVal : Candidates) {
24452 if (!Visited.insert(RdxVal).second)
24453 continue;
24454 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24455 for (Instruction *RedOp :
24456 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24457 ExtraReductions.emplace_back(RedOp, RdxVal);
24458 }
24459 }
24460 // Iterate through all not-vectorized reduction values/extra arguments.
24461 bool InitStep = true;
24462 while (ExtraReductions.size() > 1) {
24463 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24464 FinalGen(ExtraReductions, InitStep);
24465 ExtraReductions.swap(NewReds);
24466 InitStep = false;
24467 }
24468 VectorizedTree = ExtraReductions.front().second;
24469
24470 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24471
24472 // The original scalar reduction is expected to have no remaining
24473 // uses outside the reduction tree itself. Assert that we got this
24474 // correct, replace internal uses with poison, and mark for eventual
24475 // deletion.
24476#ifndef NDEBUG
24477 SmallPtrSet<Value *, 4> IgnoreSet;
24478 for (ArrayRef<Value *> RdxOps : ReductionOps)
24479 IgnoreSet.insert_range(RdxOps);
24480#endif
24481 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24482 for (Value *Ignore : RdxOps) {
24483 if (!Ignore)
24484 continue;
24485#ifndef NDEBUG
24486 for (auto *U : Ignore->users()) {
24487 assert(IgnoreSet.count(U) &&
24488 "All users must be in the reduction ops list.");
24489 }
24490#endif
24491 if (!Ignore->use_empty()) {
24492 Value *P = PoisonValue::get(Ignore->getType());
24493 Ignore->replaceAllUsesWith(P);
24494 }
24495 }
24496 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24497 }
24498 } else if (!CheckForReusedReductionOps) {
24499 for (ReductionOpsType &RdxOps : ReductionOps)
24500 for (Value *RdxOp : RdxOps)
24501 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24502 }
24503 return VectorizedTree;
24504 }
24505
24506private:
24507 /// Creates the reduction from the given \p Vec vector value with the given
24508 /// scale \p Scale and signedness \p IsSigned.
24509 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24510 Value *Vec, unsigned Scale, bool IsSigned,
24511 Type *DestTy) {
24512 Value *Rdx;
24513 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24514 unsigned DestTyNumElements = getNumElements(VecTy);
24515 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24516 Rdx = PoisonValue::get(
24517 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24518 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24519 // Do reduction for each lane.
24520 // e.g., do reduce add for
24521 // VL[0] = <4 x Ty> <a, b, c, d>
24522 // VL[1] = <4 x Ty> <e, f, g, h>
24523 // Lane[0] = <2 x Ty> <a, e>
24524 // Lane[1] = <2 x Ty> <b, f>
24525 // Lane[2] = <2 x Ty> <c, g>
24526 // Lane[3] = <2 x Ty> <d, h>
24527 // result[0] = reduce add Lane[0]
24528 // result[1] = reduce add Lane[1]
24529 // result[2] = reduce add Lane[2]
24530 // result[3] = reduce add Lane[3]
24531 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24532 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24533 Rdx = Builder.CreateInsertElement(
24534 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24535 }
24536 } else {
24537 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24538 }
24539 if (Rdx->getType() != DestTy)
24540 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24541 // Improved analysis for add/fadd/xor reductions with same scale
24542 // factor for all operands of reductions. We can emit scalar ops for
24543 // them instead.
24544 if (Scale > 1)
24545 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24546 return Rdx;
24547 }
24548
24549 /// Calculate the cost of a reduction.
24550 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24551 ArrayRef<Value *> ReducedVals,
24552 bool IsCmpSelMinMax, FastMathFlags FMF,
24553 const BoUpSLP &R) {
24554 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24555 Type *ScalarTy = ReducedVals.front()->getType();
24556 unsigned ReduxWidth = ReducedVals.size();
24557 FixedVectorType *VectorTy = R.getReductionType();
24558 InstructionCost VectorCost = 0, ScalarCost;
24559 // If all of the reduced values are constant, the vector cost is 0, since
24560 // the reduction value can be calculated at compile time.
24561 bool AllConsts = allConstant(ReducedVals);
24562 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24563 InstructionCost Cost = 0;
24564 // Scalar cost is repeated for N-1 elements.
24565 int Cnt = ReducedVals.size();
24566 for (Value *RdxVal : ReducedVals) {
24567 if (Cnt == 1)
24568 break;
24569 --Cnt;
24570 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24571 Cost += GenCostFn();
24572 continue;
24573 }
24574 InstructionCost ScalarCost = 0;
24575 for (User *U : RdxVal->users()) {
24576 auto *RdxOp = cast<Instruction>(U);
24577 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24578 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24579 continue;
24580 }
24581 ScalarCost = InstructionCost::getInvalid();
24582 break;
24583 }
24584 if (ScalarCost.isValid())
24585 Cost += ScalarCost;
24586 else
24587 Cost += GenCostFn();
24588 }
24589 return Cost;
24590 };
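// E.g. (illustrative): reducing 4 scalar values needs 3 scalar reduction ops
// (N - 1), so the loop above sums per-value costs for all but the last one,
// falling back to GenCostFn when a value has too many uses or one of its
// users cannot be costed directly.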
24591 // Require the reduction cost if:
24592 // 1. This type is not a full register type and there is no other vector
24593 // with the same type in the storage (first vector with a small type).
24594 // 2. The storage does not have any vector with full vector use (first
24595 // vector with full register use).
24596 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24597 switch (RdxKind) {
24598 case RecurKind::Add:
24599 case RecurKind::Mul:
24600 case RecurKind::Or:
24601 case RecurKind::And:
24602 case RecurKind::Xor:
24603 case RecurKind::FAdd:
24604 case RecurKind::FMul: {
24605 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24606 if (!AllConsts) {
24607 if (DoesRequireReductionOp) {
24608 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24609 assert(SLPReVec && "FixedVectorType is not expected.");
24610 unsigned ScalarTyNumElements = VecTy->getNumElements();
24611 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24612 VectorCost += TTI->getShuffleCost(
24615 ReducedVals.size()),
24616 VectorTy,
24617 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24618 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24619 FMF, CostKind);
24620 }
24621 VectorCost += TTI->getScalarizationOverhead(
24622 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24623 /*Extract*/ false, TTI::TCK_RecipThroughput);
24624 } else {
24625 Type *RedTy = VectorTy->getElementType();
24626 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24627 std::make_pair(RedTy, true));
24628 if (RType == RedTy) {
24629 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24630 FMF, CostKind);
24631 } else {
24632 VectorCost = TTI->getExtendedReductionCost(
24633 RdxOpcode, !IsSigned, RedTy,
24634 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24635 }
24636 }
24637 } else {
24638 Type *RedTy = VectorTy->getElementType();
24639 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24640 std::make_pair(RedTy, true));
24641 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24642 VectorCost +=
24643 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
24644 if (RType != RedTy) {
24645 unsigned Opcode = Instruction::Trunc;
24646 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24647 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24648 VectorCost += TTI->getCastInstrCost(
24649 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24650 }
24651 }
24652 }
24653 ScalarCost = EvaluateScalarCost([&]() {
24654 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
24655 });
24656 break;
24657 }
24658 case RecurKind::FMax:
24659 case RecurKind::FMin:
24660 case RecurKind::FMaximum:
24661 case RecurKind::FMinimum:
24662 case RecurKind::SMax:
24663 case RecurKind::SMin:
24664 case RecurKind::UMax:
24665 case RecurKind::UMin: {
24666 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
24667 if (!AllConsts) {
24668 if (DoesRequireReductionOp) {
24669 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
24670 } else {
24671 // Check if the previous reduction already exists and account for it as a
24672 // series of operations + a single reduction.
24673 Type *RedTy = VectorTy->getElementType();
24674 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24675 std::make_pair(RedTy, true));
24676 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24677 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
24678 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
24679 if (RType != RedTy) {
24680 unsigned Opcode = Instruction::Trunc;
24681 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
24682 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
24683 VectorCost += TTI->getCastInstrCost(
24684 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
24685 }
24686 }
24687 }
24688 ScalarCost = EvaluateScalarCost([&]() {
24689 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
24690 return TTI->getIntrinsicInstrCost(ICA, CostKind);
24691 });
24692 break;
24693 }
24694 default:
24695 llvm_unreachable("Expected arithmetic or min/max reduction operation");
24696 }
24697
24698 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
24699 << " for reduction of " << shortBundleName(ReducedVals)
24700 << " (It is a splitting reduction)\n");
24701 return VectorCost - ScalarCost;
24702 }
24703
24704 /// Splits the values, stored in VectorValuesAndScales, into registers/free
24705 /// sub-registers, combines them with the given reduction operation as a
24706 /// vector operation and then performs single (small enough) reduction.
24707 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24708 Type *DestTy) {
24709 Value *ReducedSubTree = nullptr;
24710 // Creates reduction and combines with the previous reduction.
24711 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
24712 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
24713 if (ReducedSubTree)
24714 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
24715 "op.rdx", ReductionOps);
24716 else
24717 ReducedSubTree = Rdx;
24718 };
24719 if (VectorValuesAndScales.size() == 1) {
24720 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
24721 CreateSingleOp(Vec, Scale, IsSigned);
24722 return ReducedSubTree;
24723 }
24724 // Scales Vec using the given Cnt scale factor and then combines the result
24725 // with the previous value of VecRes.
24726 Value *VecRes = nullptr;
24727 bool VecResSignedness = false;
24728 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
24729 Type *ScalarTy = Vec->getType()->getScalarType();
24730 // Scale Vec using given Cnt scale factor.
24731 if (Cnt > 1) {
24732 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
24733 switch (RdxKind) {
24734 case RecurKind::Add: {
24735 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
24736 unsigned VF = getNumElements(Vec->getType());
24737 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
24738 << ". (HorRdx)\n");
24739 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
24740 for (unsigned I : seq<unsigned>(Cnt))
24741 std::iota(std::next(Mask.begin(), VF * I),
24742 std::next(Mask.begin(), VF * (I + 1)), 0);
24743 ++NumVectorInstructions;
24744 Vec = Builder.CreateShuffleVector(Vec, Mask);
24745 break;
24746 }
24747 // res = mul vv, n
24748 if (ScalarTy != DestTy->getScalarType())
24749 Vec = Builder.CreateIntCast(
24750 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24751 IsSigned);
24752 Value *Scale = ConstantVector::getSplat(
24753 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
24754 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
24755 << ". (HorRdx)\n");
24756 ++NumVectorInstructions;
24757 Vec = Builder.CreateMul(Vec, Scale);
24758 break;
24759 }
24760 case RecurKind::Xor: {
24761 // res = n % 2 ? 0 : vv
24762 LLVM_DEBUG(dbgs()
24763 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
24764 if (Cnt % 2 == 0)
24765 Vec = Constant::getNullValue(Vec->getType());
24766 break;
24767 }
24768 case RecurKind::FAdd: {
24769 // res = fmul v, n
24770 Value *Scale =
24771 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
24772 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
24773 << ". (HorRdx)\n");
24774 ++NumVectorInstructions;
24775 Vec = Builder.CreateFMul(Vec, Scale);
24776 break;
24777 }
24778 case RecurKind::And:
24779 case RecurKind::Or:
24780 case RecurKind::SMax:
24781 case RecurKind::SMin:
24782 case RecurKind::UMax:
24783 case RecurKind::UMin:
24784 case RecurKind::FMax:
24785 case RecurKind::FMin:
24786 case RecurKind::FMaximum:
24787 case RecurKind::FMinimum:
24788 // res = vv
24789 break;
24790 case RecurKind::Sub:
24791 case RecurKind::AddChainWithSubs:
24792 case RecurKind::Mul:
24793 case RecurKind::FMul:
24794 case RecurKind::FMulAdd:
24795 case RecurKind::AnyOf:
24796 case RecurKind::FindFirstIVSMin:
24797 case RecurKind::FindFirstIVUMin:
24798 case RecurKind::FindLastIVSMax:
24799 case RecurKind::FindLastIVUMax:
24800 case RecurKind::FMaxNum:
24801 case RecurKind::FMinNum:
24802 case RecurKind::FMaximumNum:
24803 case RecurKind::FMinimumNum:
24804 case RecurKind::None:
24805 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24806 }
24807 }
24808 // Combine Vec with the previous VecOp.
24809 if (!VecRes) {
24810 VecRes = Vec;
24811 VecResSignedness = IsSigned;
24812 } else {
24813 ++NumVectorInstructions;
24814 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
24815 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
24816 // Handle ctpop.
24817 unsigned VecResVF = getNumElements(VecRes->getType());
24818 unsigned VecVF = getNumElements(Vec->getType());
24819 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
24820 std::iota(Mask.begin(), Mask.end(), 0);
24821 // Ensure that VecRes is always larger than Vec
24822 if (VecResVF < VecVF) {
24823 std::swap(VecRes, Vec);
24824 std::swap(VecResVF, VecVF);
24825 }
24826 if (VecResVF != VecVF) {
24827 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
24828 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
24829 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
24830 }
24831 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
24832 return;
24833 }
24834 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
24835 VecRes = Builder.CreateIntCast(
24836 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
24837 VecResSignedness);
24838 if (ScalarTy != DestTy->getScalarType())
24839 Vec = Builder.CreateIntCast(
24840 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
24841 IsSigned);
24842 unsigned VecResVF = getNumElements(VecRes->getType());
24843 unsigned VecVF = getNumElements(Vec->getType());
24844 // Ensure that VecRes is always larger than Vec
24845 if (VecResVF < VecVF) {
24846 std::swap(VecRes, Vec);
24847 std::swap(VecResVF, VecVF);
24848 }
24849 // extract + op + insert
24850 Value *Op = VecRes;
24851 if (VecResVF != VecVF)
24852 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
24853 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
24854 if (VecResVF != VecVF)
24855 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
24856 VecRes = Op;
24857 }
24858 };
24859 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
24860 CreateVecOp(Vec, Scale, IsSigned);
24861 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
24862
24863 return ReducedSubTree;
24864 }
24865
24866 /// Emit a horizontal reduction of the vectorized value.
24867 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
24868 const TargetTransformInfo *TTI, Type *DestTy) {
24869 assert(VectorizedValue && "Need to have a vectorized tree node");
24870 assert(RdxKind != RecurKind::FMulAdd &&
24871 "A call to the llvm.fmuladd intrinsic is not handled yet");
24872
24873 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
24874 if (FTy->getScalarType() == Builder.getInt1Ty() &&
24875 RdxKind == RecurKind::Add &&
24876 DestTy->getScalarType() != FTy->getScalarType()) {
24877 // Convert vector_reduce_add(ZExt(<n x i1>)) to
24878 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
24879 Value *V = Builder.CreateBitCast(
24880 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
24881 ++NumVectorInstructions;
24882 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
24883 }
24884 ++NumVectorInstructions;
24885 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
24886 }
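// Illustrative IR for the i1 special case above (names assumed): reducing
// <8 x i1> into an i32 sum becomes
//   %bc = bitcast <8 x i1> %v to i8
//   %ct = call i8 @llvm.ctpop.i8(i8 %bc)
// and the widening cast to the destination type is added by the caller.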
24887
24888 /// Emits optimized code for unique scalar value reused \p Cnt times.
24889 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
24890 unsigned Cnt) {
24891 assert(IsSupportedHorRdxIdentityOp &&
24892 "The optimization of matched scalar identity horizontal reductions "
24893 "must be supported.");
24894 if (Cnt == 1)
24895 return VectorizedValue;
24896 switch (RdxKind) {
24897 case RecurKind::Add: {
24898 // res = mul vv, n
24899 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
24900 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
24901 << VectorizedValue << ". (HorRdx)\n");
24902 return Builder.CreateMul(VectorizedValue, Scale);
24903 }
24904 case RecurKind::Xor: {
24905 // res = n % 2 ? 0 : vv
24906 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
24907 << ". (HorRdx)\n");
24908 if (Cnt % 2 == 0)
24909 return Constant::getNullValue(VectorizedValue->getType());
24910 return VectorizedValue;
24911 }
24912 case RecurKind::FAdd: {
24913 // res = fmul v, n
24914 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
24915 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
24916 << VectorizedValue << ". (HorRdx)\n");
24917 return Builder.CreateFMul(VectorizedValue, Scale);
24918 }
24919 case RecurKind::And:
24920 case RecurKind::Or:
24921 case RecurKind::SMax:
24922 case RecurKind::SMin:
24923 case RecurKind::UMax:
24924 case RecurKind::UMin:
24925 case RecurKind::FMax:
24926 case RecurKind::FMin:
24927 case RecurKind::FMaximum:
24928 case RecurKind::FMinimum:
24929 // res = vv
24930 return VectorizedValue;
24931 case RecurKind::Sub:
24932 case RecurKind::AddChainWithSubs:
24933 case RecurKind::Mul:
24934 case RecurKind::FMul:
24935 case RecurKind::FMulAdd:
24936 case RecurKind::AnyOf:
24937 case RecurKind::FindFirstIVSMin:
24938 case RecurKind::FindFirstIVUMin:
24939 case RecurKind::FindLastIVSMax:
24940 case RecurKind::FindLastIVUMax:
24941 case RecurKind::FMaxNum:
24942 case RecurKind::FMinNum:
24943 case RecurKind::FMaximumNum:
24944 case RecurKind::FMinimumNum:
24945 case RecurKind::None:
24946 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
24947 }
24948 return nullptr;
24949 }
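// E.g. (illustrative): a scalar reused 5 times in an add reduction becomes
// "mul %v, 5" and in an fadd reduction "fmul %v, 5.0"; for xor an even
// repeat count folds to zero while an odd count leaves the value unchanged.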
24950
24951 /// Emits actual operation for the scalar identity values, found during
24952 /// horizontal reduction analysis.
24953 Value *
24954 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
24955 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
24956 const DenseMap<Value *, Value *> &TrackedToOrig) {
24957 assert(IsSupportedHorRdxIdentityOp &&
24958 "The optimization of matched scalar identity horizontal reductions "
24959 "must be supported.");
24960 ArrayRef<Value *> VL = R.getRootNodeScalars();
24961 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
24962 if (VTy->getElementType() != VL.front()->getType()) {
24963 VectorizedValue = Builder.CreateIntCast(
24964 VectorizedValue,
24965 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
24966 R.isSignedMinBitwidthRootNode());
24967 }
24968 switch (RdxKind) {
24969 case RecurKind::Add: {
24970 // root = mul prev_root, <1, 1, n, 1>
24971 SmallVector<Constant *> Vals;
24972 for (Value *V : VL) {
24973 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
24974 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
24975 }
24976 auto *Scale = ConstantVector::get(Vals);
24977 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
24978 << VectorizedValue << ". (HorRdx)\n");
24979 return Builder.CreateMul(VectorizedValue, Scale);
24980 }
24981 case RecurKind::And:
24982 case RecurKind::Or:
24983 // No need for multiple or/and(s).
24984 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
24985 << ". (HorRdx)\n");
24986 return VectorizedValue;
24987 case RecurKind::SMax:
24988 case RecurKind::SMin:
24989 case RecurKind::UMax:
24990 case RecurKind::UMin:
24991 case RecurKind::FMax:
24992 case RecurKind::FMin:
24993 case RecurKind::FMaximum:
24994 case RecurKind::FMinimum:
24995 // No need for multiple min/max(s) of the same value.
24996 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
24997 << ". (HorRdx)\n");
24998 return VectorizedValue;
24999 case RecurKind::Xor: {
25000 // Replace values that have an even number of repeats with 0, since
25001 // x xor x = 0.
25002 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25003 // 7>, if the 4th and 6th elements have an even number of repeats.
25004 SmallVector<int> Mask(
25005 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25006 PoisonMaskElem);
25007 std::iota(Mask.begin(), Mask.end(), 0);
25008 bool NeedShuffle = false;
25009 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25010 Value *V = VL[I];
25011 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25012 if (Cnt % 2 == 0) {
25013 Mask[I] = VF;
25014 NeedShuffle = true;
25015 }
25016 }
25017 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25018 : Mask) dbgs()
25019 << I << " ";
25020 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25021 if (NeedShuffle)
25022 VectorizedValue = Builder.CreateShuffleVector(
25023 VectorizedValue,
25024 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25025 return VectorizedValue;
25026 }
25027 case RecurKind::FAdd: {
25028 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25029 SmallVector<Constant *> Vals;
25030 for (Value *V : VL) {
25031 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25032 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25033 }
25034 auto *Scale = ConstantVector::get(Vals);
25035 return Builder.CreateFMul(VectorizedValue, Scale);
25036 }
25037 case RecurKind::Sub:
25038 case RecurKind::AddChainWithSubs:
25039 case RecurKind::Mul:
25040 case RecurKind::FMul:
25041 case RecurKind::FMulAdd:
25042 case RecurKind::AnyOf:
25043 case RecurKind::FindFirstIVSMin:
25044 case RecurKind::FindFirstIVUMin:
25045 case RecurKind::FindLastIVSMax:
25046 case RecurKind::FindLastIVUMax:
25047 case RecurKind::FMaxNum:
25048 case RecurKind::FMinNum:
25049 case RecurKind::FMaximumNum:
25050 case RecurKind::FMinimumNum:
25051 case RecurKind::None:
25052 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25053 }
25054 return nullptr;
25055 }
25056};
25057} // end anonymous namespace
25058
25059/// Gets recurrence kind from the specified value.
25060 static RecurKind getRdxKind(Value *V) {
25061 return HorizontalReduction::getRdxKind(V);
25062}
25063static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25064 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25065 return cast<FixedVectorType>(IE->getType())->getNumElements();
25066
25067 unsigned AggregateSize = 1;
25068 auto *IV = cast<InsertValueInst>(InsertInst);
25069 Type *CurrentType = IV->getType();
25070 do {
25071 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25072 for (auto *Elt : ST->elements())
25073 if (Elt != ST->getElementType(0)) // check homogeneity
25074 return std::nullopt;
25075 AggregateSize *= ST->getNumElements();
25076 CurrentType = ST->getElementType(0);
25077 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25078 AggregateSize *= AT->getNumElements();
25079 CurrentType = AT->getElementType();
25080 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25081 AggregateSize *= VT->getNumElements();
25082 return AggregateSize;
25083 } else if (CurrentType->isSingleValueType()) {
25084 return AggregateSize;
25085 } else {
25086 return std::nullopt;
25087 }
25088 } while (true);
25089}
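// For example, under this scheme {<2 x float>, <2 x float>} and
// [2 x {float, float}] both yield an aggregate size of 4, while a struct
// mixing different element types is rejected as non-homogeneous.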
25090
25091static void findBuildAggregateRec(Instruction *LastInsertInst,
25092 TargetTransformInfo *TTI,
25093 SmallVectorImpl<Value *> &BuildVectorOpds,
25094 SmallVectorImpl<Value *> &InsertElts,
25095 unsigned OperandOffset, const BoUpSLP &R) {
25096 do {
25097 Value *InsertedOperand = LastInsertInst->getOperand(1);
25098 std::optional<unsigned> OperandIndex =
25099 getElementIndex(LastInsertInst, OperandOffset);
25100 if (!OperandIndex || R.isDeleted(LastInsertInst))
25101 return;
25102 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25103 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25104 BuildVectorOpds, InsertElts, *OperandIndex, R);
25105
25106 } else {
25107 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25108 InsertElts[*OperandIndex] = LastInsertInst;
25109 }
25110 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25111 } while (LastInsertInst != nullptr &&
25112 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25113 LastInsertInst->hasOneUse());
25114}
25115
25116/// Recognize construction of vectors like
25117/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25118/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25119/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25120/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25121/// starting from the last insertelement or insertvalue instruction.
25122///
25123/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25124/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25125/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25126///
25127/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25128///
25129/// \return true if it matches.
25130static bool findBuildAggregate(Instruction *LastInsertInst,
25131 TargetTransformInfo *TTI,
25132 SmallVectorImpl<Value *> &BuildVectorOpds,
25133 SmallVectorImpl<Value *> &InsertElts,
25134 const BoUpSLP &R) {
25135
25136 assert((isa<InsertElementInst>(LastInsertInst) ||
25137 isa<InsertValueInst>(LastInsertInst)) &&
25138 "Expected insertelement or insertvalue instruction!");
25139
25140 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25141 "Expected empty result vectors!");
25142
25143 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25144 if (!AggregateSize)
25145 return false;
25146 BuildVectorOpds.resize(*AggregateSize);
25147 InsertElts.resize(*AggregateSize);
25148
25149 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25150 llvm::erase(BuildVectorOpds, nullptr);
25151 llvm::erase(InsertElts, nullptr);
25152 if (BuildVectorOpds.size() >= 2)
25153 return true;
25154
25155 return false;
25156}
25157
25158 /// Try to get a reduction instruction from a phi node.
25159///
25160/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25161/// if they come from either \p ParentBB or a containing loop latch.
25162///
25163/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25164/// if not possible.
25165 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25166 BasicBlock *ParentBB, LoopInfo *LI) {
25167 // There are situations where the reduction value is not dominated by the
25168 // reduction phi. Vectorizing such cases has been reported to cause
25169 // miscompiles. See PR25787.
25170 auto DominatedReduxValue = [&](Value *R) {
25171 return isa<Instruction>(R) &&
25172 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25173 };
25174
25175 Instruction *Rdx = nullptr;
25176
25177 // Return the incoming value if it comes from the same BB as the phi node.
25178 if (P->getIncomingBlock(0) == ParentBB) {
25179 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25180 } else if (P->getIncomingBlock(1) == ParentBB) {
25181 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25182 }
25183
25184 if (Rdx && DominatedReduxValue(Rdx))
25185 return Rdx;
25186
25187 // Otherwise, check whether we have a loop latch to look at.
25188 Loop *BBL = LI->getLoopFor(ParentBB);
25189 if (!BBL)
25190 return nullptr;
25191 BasicBlock *BBLatch = BBL->getLoopLatch();
25192 if (!BBLatch)
25193 return nullptr;
25194
25195 // There is a loop latch, return the incoming value if it comes from
25196 // that. This reduction pattern occasionally turns up.
25197 if (P->getIncomingBlock(0) == BBLatch) {
25198 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25199 } else if (P->getIncomingBlock(1) == BBLatch) {
25200 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25201 }
25202
25203 if (Rdx && DominatedReduxValue(Rdx))
25204 return Rdx;
25205
25206 return nullptr;
25207}
25208
25209static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25210 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25211 return true;
25212 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25213 return true;
25214 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25215 return true;
25216 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25217 return true;
25218 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25219 return true;
25220 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
25221 return true;
25222 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
25223 return true;
25224 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
25225 return true;
25226 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
25227 return true;
25228 return false;
25229}
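// E.g. (illustrative): both a plain binop such as "fadd float %a, %b" and a
// min/max intrinsic such as "call i32 @llvm.smax.i32(i32 %a, i32 %b)" match
// here, with the two operands returned through V0 and V1.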
25230
25231/// We could have an initial reduction that is not an add.
25232/// r *= v1 + v2 + v3 + v4
25233/// In such a case start looking for a tree rooted in the first '+'.
25234/// \Returns the new root if found, which may be nullptr if not an instruction.
25235 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25236 Instruction *Root) {
25237 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25238 isa<IntrinsicInst>(Root)) &&
25239 "Expected binop, select, or intrinsic for reduction matching");
25240 Value *LHS =
25241 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25242 Value *RHS =
25243 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25244 if (LHS == Phi)
25245 return dyn_cast<Instruction>(RHS);
25246 if (RHS == Phi)
25247 return dyn_cast<Instruction>(LHS);
25248 return nullptr;
25249}
25250
25251 /// \returns the first operand of \p I that does not match \p Phi. If the
25252 /// operand is not an instruction, returns nullptr.
25253 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25254 Value *Op0 = nullptr;
25255 Value *Op1 = nullptr;
25256 if (!matchRdxBop(I, Op0, Op1))
25257 return nullptr;
25258 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25259}
25260
25261/// \Returns true if \p I is a candidate instruction for reduction vectorization.
25262 static bool isReductionCandidate(Instruction *I) {
25263 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25264 Value *B0 = nullptr, *B1 = nullptr;
25265 bool IsBinop = matchRdxBop(I, B0, B1);
25266 return IsBinop || IsSelect;
25267}
25268
25269bool SLPVectorizerPass::vectorizeHorReduction(
25270 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25271 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25272 if (!ShouldVectorizeHor)
25273 return false;
25274 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25275
25276 if (Root->getParent() != BB || isa<PHINode>(Root))
25277 return false;
25278
25279 // If we can find a secondary reduction root, use that instead.
25280 auto SelectRoot = [&]() {
25281 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25282 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25283 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25284 return NewRoot;
25285 return Root;
25286 };
25287
25288 // Start the analysis from the Root instruction. If a horizontal reduction is
25289 // found, try to vectorize it. If it is not a horizontal reduction, or
25290 // vectorization is not possible or not effective, and the currently analyzed
25291 // instruction is a binary operation, try to vectorize the operands, using
25292 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25293 // the same procedure considering each operand as a possible root of the
25294 // horizontal reduction.
25295 // Interrupt the process if the Root instruction itself was vectorized or all
25296 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
25297 // If a horizontal reduction was not matched or vectorized, we collect the
25298 // instructions for possible later vectorization attempts.
25299 std::queue<std::pair<Instruction *, unsigned>> Stack;
25300 Stack.emplace(SelectRoot(), 0);
25301 SmallPtrSet<Value *, 8> VisitedInstrs;
25302 bool Res = false;
25303 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25304 if (R.isAnalyzedReductionRoot(Inst))
25305 return nullptr;
25306 if (!isReductionCandidate(Inst))
25307 return nullptr;
25308 HorizontalReduction HorRdx;
25309 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25310 return nullptr;
25311 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
25312 };
25313 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25314 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25315 FutureSeed = getNonPhiOperand(Root, P);
25316 if (!FutureSeed)
25317 return false;
25318 }
25319 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25320 // analysis is done separately.
25321 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25322 PostponedInsts.push_back(FutureSeed);
25323 return true;
25324 };
25325
25326 while (!Stack.empty()) {
25327 Instruction *Inst;
25328 unsigned Level;
25329 std::tie(Inst, Level) = Stack.front();
25330 Stack.pop();
25331 // Do not try to analyze an instruction that has already been vectorized.
25332 // This may happen when we vectorize instruction operands on a previous
25333 // iteration, while the stack was populated before that happened.
25334 if (R.isDeleted(Inst))
25335 continue;
25336 if (Value *VectorizedV = TryToReduce(Inst)) {
25337 Res = true;
25338 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25339 // Try to find another reduction.
25340 Stack.emplace(I, Level);
25341 continue;
25342 }
25343 if (R.isDeleted(Inst))
25344 continue;
25345 } else {
25346 // We could not vectorize `Inst` so try to use it as a future seed.
25347 if (!TryAppendToPostponedInsts(Inst)) {
25348 assert(Stack.empty() && "Expected empty stack");
25349 break;
25350 }
25351 }
25352
25353 // Try to vectorize operands.
25354 // Continue analysis for the instruction from the same basic block only to
25355 // save compile time.
25356 if (++Level < RecursionMaxDepth)
25357 for (auto *Op : Inst->operand_values())
25358 if (VisitedInstrs.insert(Op).second)
25359 if (auto *I = dyn_cast<Instruction>(Op))
25360 // Do not try to vectorize CmpInst operands, this is done
25361 // separately.
25362 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
25363 !R.isDeleted(I) && I->getParent() == BB)
25364 Stack.emplace(I, Level);
25365 }
25366 return Res;
25367}
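// Note: the worklist above is a FIFO queue, so operands are explored roughly
// breadth-first starting from the selected root, restricted to the same basic
// block and to RecursionMaxDepth levels. Every visited instruction that could
// not be turned into a reduction (except compares and insertelement /
// insertvalue, which are analyzed separately) ends up in PostponedInsts so
// that the caller can retry it later via tryToVectorize.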
25368
25369bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25370 if (!I)
25371 return false;
25372
25373 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25374 return false;
25375 // Skip potential FMA candidates.
25376 if ((I->getOpcode() == Instruction::FAdd ||
25377 I->getOpcode() == Instruction::FSub) &&
25378 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25379 .isValid())
25380 return false;
25381
25382 Value *P = I->getParent();
25383
25384 // Vectorize in current basic block only.
25385 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25386 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25387 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25388 R.isDeleted(Op0) || R.isDeleted(Op1))
25389 return false;
25390
25391 // First collect all possible candidates.
25392 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25393 Candidates.emplace_back(Op0, Op1);
25394
25395 auto *A = dyn_cast<BinaryOperator>(Op0);
25396 auto *B = dyn_cast<BinaryOperator>(Op1);
25397 // Try to skip B.
25398 if (A && B && B->hasOneUse()) {
25399 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25400 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25401 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25402 Candidates.emplace_back(A, B0);
25403 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25404 Candidates.emplace_back(A, B1);
25405 }
25406 // Try to skip A.
25407 if (B && A && A->hasOneUse()) {
25408 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25409 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25410 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25411 Candidates.emplace_back(A0, B);
25412 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25413 Candidates.emplace_back(A1, B);
25414 }
25415
25416 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25417 ArrayRef<Value *> Ops) {
25418 if (!isReductionCandidate(Inst))
25419 return false;
25420 Type *Ty = Inst->getType();
25421 if (!isValidElementType(Ty) || Ty->isPointerTy())
25422 return false;
25423 HorizontalReduction HorRdx(Inst, Ops);
25424 if (!HorRdx.matchReductionForOperands())
25425 return false;
25426 // Check the cost of operations.
25427 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25428 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25429 InstructionCost ScalarCost =
25430 TTI.getScalarizationOverhead(
25431 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25432 /*Extract=*/true, CostKind) +
25433 TTI.getInstructionCost(Inst, CostKind);
25434 InstructionCost RedCost;
25435 switch (::getRdxKind(Inst)) {
25436 case RecurKind::Add:
25437 case RecurKind::Mul:
25438 case RecurKind::Or:
25439 case RecurKind::And:
25440 case RecurKind::Xor:
25441 case RecurKind::FAdd:
25442 case RecurKind::FMul: {
25443 FastMathFlags FMF;
25444 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25445 FMF = FPCI->getFastMathFlags();
25446 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25447 CostKind);
25448 break;
25449 }
25450 default:
25451 return false;
25452 }
25453 if (RedCost >= ScalarCost)
25454 return false;
25455
25456 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
25457 };
25458 if (Candidates.size() == 1)
25459 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25460
25461 // We have multiple options. Try to pick the single best.
25462 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25463 if (!BestCandidate)
25464 return false;
25465 return (*BestCandidate == 0 &&
25466 TryToReduce(I, {Candidates[*BestCandidate].first,
25467 Candidates[*BestCandidate].second})) ||
25468 tryToVectorizeList({Candidates[*BestCandidate].first,
25469 Candidates[*BestCandidate].second},
25470 R);
25471}
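// Rough sketch of the pairing logic in tryToVectorize above: for a root like
//   %a = add i32 %x, %y
//   %b = add i32 %z, %w
//   %c = icmp slt i32 %a, %b
// the initial candidate pair is (%a, %b); if %b (resp. %a) has a single use
// and its operands are binary operators in the same block, pairs that skip %b
// (resp. %a) are added as well. findBestRootPair then picks the most
// promising pair to vectorize as a two-element list, and only the original
// (%a, %b) pair is additionally tried as a two-operand reduction.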
25472
25473bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25474 BasicBlock *BB, BoUpSLP &R) {
25475 SmallVector<WeakTrackingVH> PostponedInsts;
25476 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25477 Res |= tryToVectorize(PostponedInsts, R);
25478 return Res;
25479}
25480
25481bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25482 BoUpSLP &R) {
25483 bool Res = false;
25484 for (Value *V : Insts)
25485 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25486 Res |= tryToVectorize(Inst, R);
25487 return Res;
25488}
25489
25490bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25491 BasicBlock *BB, BoUpSLP &R,
25492 bool MaxVFOnly) {
25493 if (!R.canMapToVector(IVI->getType()))
25494 return false;
25495
25496 SmallVector<Value *, 16> BuildVectorOpds;
25497 SmallVector<Value *, 16> BuildVectorInsts;
25498 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25499 return false;
25500
25501 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25502 R.getORE()->emit([&]() {
25503 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25504 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25505 "trying reduction first.";
25506 });
25507 return false;
25508 }
25509 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25510 // An aggregate value is unlikely to be processed in a vector register.
25511 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25512}
25513
25514bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25515 BasicBlock *BB, BoUpSLP &R,
25516 bool MaxVFOnly) {
25517 SmallVector<Value *, 16> BuildVectorInsts;
25518 SmallVector<Value *, 16> BuildVectorOpds;
25519 SmallVector<int> Mask;
25520 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25521 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25522 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25523 return false;
25524
25525 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25526 R.getORE()->emit([&]() {
25527 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25528 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25529 "trying reduction first.";
25530 });
25531 return false;
25532 }
25533 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25534 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25535}
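// Typical buildvector sequence recognized above (an illustrative sketch):
//   %v0 = insertelement <4 x float> poison, float %s0, i64 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i64 1
//   %v2 = insertelement <4 x float> %v1, float %s2, i64 2
//   %v3 = insertelement <4 x float> %v2, float %s3, i64 3
// findBuildAggregate collects the insertelement instructions into
// BuildVectorInsts and the scalars %s0..%s3 into BuildVectorOpds; the inserts
// are then handed to tryToVectorizeList, unless the scalars already form a
// fixed vector shuffle of extractelements/undefs.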
25536
25537template <typename T>
25538static bool tryToVectorizeSequence(
25539 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25540 function_ref<bool(T *, T *)> AreCompatible,
25541 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25542 bool MaxVFOnly, BoUpSLP &R) {
25543 bool Changed = false;
25544 // Sort by type, parent, operands.
25545 stable_sort(Incoming, Comparator);
25546
25547 // Try to vectorize elements based on their type.
25548 SmallVector<T *> Candidates;
25549 SmallVector<T *> VL;
25550 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25551 VL.clear()) {
25552 // Look for the next elements with the same type, parent and operand
25553 // kinds.
25554 auto *I = dyn_cast<Instruction>(*IncIt);
25555 if (!I || R.isDeleted(I)) {
25556 ++IncIt;
25557 continue;
25558 }
25559 auto *SameTypeIt = IncIt;
25560 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25561 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25562 AreCompatible(*SameTypeIt, *IncIt))) {
25563 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25564 ++SameTypeIt;
25565 if (I && !R.isDeleted(I))
25566 VL.push_back(cast<T>(I));
25567 }
25568
25569 // Try to vectorize them.
25570 unsigned NumElts = VL.size();
25571 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25572 << NumElts << ")\n");
25573 // The vectorization is a 3-stage attempt:
25574 // 1. Try to vectorize instructions with the same/alternate opcodes with the
25575 // size of the maximal register first.
25576 // 2. Try to vectorize the remaining instructions with the same type, if
25577 // possible. This may give better vectorization results than trying to
25578 // vectorize only instructions with the same/alternate opcodes.
25579 // 3. Finally, try to vectorize all instructions with the same/alternate
25580 // opcodes only; this may result in some extra final
25581 // vectorization.
25582 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25583 // Success: start over because instructions might have been changed.
25584 Changed = true;
25585 VL.swap(Candidates);
25586 Candidates.clear();
25587 for (T *V : VL) {
25588 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25589 Candidates.push_back(V);
25590 }
25591 } else {
25592 /// \Returns the minimum number of elements that we will attempt to
25593 /// vectorize.
25594 auto GetMinNumElements = [&R](Value *V) {
25595 unsigned EltSize = R.getVectorElementSize(V);
25596 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25597 };
25598 if (NumElts < GetMinNumElements(*IncIt) &&
25599 (Candidates.empty() ||
25600 Candidates.front()->getType() == (*IncIt)->getType())) {
25601 for (T *V : VL) {
25602 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25603 Candidates.push_back(V);
25604 }
25605 }
25606 }
25607 // Final attempt to vectorize instructions with the same types.
25608 if (Candidates.size() > 1 &&
25609 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25610 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25611 // Success: start over because instructions might have been changed.
25612 Changed = true;
25613 } else if (MaxVFOnly) {
25614 // Try to vectorize using small vectors.
25615 SmallVector<T *> VL;
25616 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25617 VL.clear()) {
25618 auto *I = dyn_cast<Instruction>(*It);
25619 if (!I || R.isDeleted(I)) {
25620 ++It;
25621 continue;
25622 }
25623 auto *SameTypeIt = It;
25624 while (SameTypeIt != End &&
25625 (!isa<Instruction>(*SameTypeIt) ||
25626 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25627 AreCompatible(*SameTypeIt, *It))) {
25628 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25629 ++SameTypeIt;
25630 if (I && !R.isDeleted(I))
25631 VL.push_back(cast<T>(I));
25632 }
25633 unsigned NumElts = VL.size();
25634 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
25635 /*MaxVFOnly=*/false))
25636 Changed = true;
25637 It = SameTypeIt;
25638 }
25639 }
25640 Candidates.clear();
25641 }
25642
25643 // Start over at the next instruction of a different type (or the end).
25644 IncIt = SameTypeIt;
25645 }
25646 return Changed;
25647}
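// Informal example of the grouping done by tryToVectorizeSequence: assuming
// Incoming is sorted by the comparator as, say,
//   [i32 %p0, i32 %p1, i32 %p2, float %q0, float %q1]
// the compatible i32 run and the float run are tried separately, first with
// the maximal VF only (when MaxVFOnly is set); leftovers of the same type are
// accumulated in Candidates and re-tried in the final attempts, possibly with
// smaller vector factors.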
25648
25649/// Compare two cmp instructions. If IsCompatibility is true, the function
25650/// returns true if the 2 cmps have the same/swapped predicates and compatible
25651/// corresponding operands. If IsCompatibility is false, the function implements
25652/// a strict weak ordering relation between two cmp instructions, returning true
25653/// if the first instruction is "less" than the second, i.e. its predicate is
25654/// less than the predicate of the second or its operand IDs are less than the
25655/// operand IDs of the second cmp instruction.
25656template <bool IsCompatibility>
25657static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
25658 const DominatorTree &DT) {
25659 assert(isValidElementType(V->getType()) &&
25660 isValidElementType(V2->getType()) &&
25661 "Expected valid element types only.");
25662 if (V == V2)
25663 return IsCompatibility;
25664 auto *CI1 = cast<CmpInst>(V);
25665 auto *CI2 = cast<CmpInst>(V2);
25666 if (CI1->getOperand(0)->getType()->getTypeID() <
25667 CI2->getOperand(0)->getType()->getTypeID())
25668 return !IsCompatibility;
25669 if (CI1->getOperand(0)->getType()->getTypeID() >
25670 CI2->getOperand(0)->getType()->getTypeID())
25671 return false;
25672 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
25673 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25674 return !IsCompatibility;
25675 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
25676 CI2->getOperand(0)->getType()->getScalarSizeInBits())
25677 return false;
25678 CmpInst::Predicate Pred1 = CI1->getPredicate();
25679 CmpInst::Predicate Pred2 = CI2->getPredicate();
25680 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
25681 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
25682 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
25683 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
25684 if (BasePred1 < BasePred2)
25685 return !IsCompatibility;
25686 if (BasePred1 > BasePred2)
25687 return false;
25688 // Compare operands.
25689 bool CI1Preds = Pred1 == BasePred1;
25690 bool CI2Preds = Pred2 == BasePred1;
25691 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
25692 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
25693 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
25694 if (Op1 == Op2)
25695 continue;
25696 if (Op1->getValueID() < Op2->getValueID())
25697 return !IsCompatibility;
25698 if (Op1->getValueID() > Op2->getValueID())
25699 return false;
25700 if (auto *I1 = dyn_cast<Instruction>(Op1))
25701 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
25702 if (IsCompatibility) {
25703 if (I1->getParent() != I2->getParent())
25704 return false;
25705 } else {
25706 // Try to compare nodes with same parent.
25707 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
25708 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
25709 if (!NodeI1)
25710 return NodeI2 != nullptr;
25711 if (!NodeI2)
25712 return false;
25713 assert((NodeI1 == NodeI2) ==
25714 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25715 "Different nodes should have different DFS numbers");
25716 if (NodeI1 != NodeI2)
25717 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25718 }
25719 InstructionsState S = getSameOpcode({I1, I2}, TLI);
25720 if (S && (IsCompatibility || !S.isAltShuffle()))
25721 continue;
25722 if (IsCompatibility)
25723 return false;
25724 if (I1->getOpcode() != I2->getOpcode())
25725 return I1->getOpcode() < I2->getOpcode();
25726 }
25727 }
25728 return IsCompatibility;
25729}
25730
25731template <typename ItT>
25732bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
25733 BasicBlock *BB, BoUpSLP &R) {
25734 bool Changed = false;
25735 // Try to find reductions first.
25736 for (CmpInst *I : CmpInsts) {
25737 if (R.isDeleted(I))
25738 continue;
25739 for (Value *Op : I->operands())
25740 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
25741 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
25742 if (R.isDeleted(I))
25743 break;
25744 }
25745 }
25746 // Try to vectorize operands as vector bundles.
25747 for (CmpInst *I : CmpInsts) {
25748 if (R.isDeleted(I))
25749 continue;
25750 Changed |= tryToVectorize(I, R);
25751 }
25752 // Try to vectorize list of compares.
25753 // Sort by type, compare predicate, etc.
25754 auto CompareSorter = [&](Value *V, Value *V2) {
25755 if (V == V2)
25756 return false;
25757 return compareCmp<false>(V, V2, *TLI, *DT);
25758 };
25759
25760 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
25761 if (V1 == V2)
25762 return true;
25763 return compareCmp<true>(V1, V2, *TLI, *DT);
25764 };
25765
25766 SmallVector<Value *> Vals;
25767 for (Instruction *V : CmpInsts)
25768 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
25769 Vals.push_back(V);
25770 if (Vals.size() <= 1)
25771 return Changed;
25772 Changed |= tryToVectorizeSequence<Value>(
25773 Vals, CompareSorter, AreCompatibleCompares,
25774 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
25775 // Exclude possible reductions from other blocks.
25776 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
25777 return any_of(V->users(), [V](User *U) {
25778 auto *Select = dyn_cast<SelectInst>(U);
25779 return Select &&
25780 Select->getParent() != cast<Instruction>(V)->getParent();
25781 });
25782 });
25783 if (ArePossiblyReducedInOtherBlock)
25784 return false;
25785 return tryToVectorizeList(Candidates, R, MaxVFOnly);
25786 },
25787 /*MaxVFOnly=*/true, R);
25788 return Changed;
25789}
25790
25791bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
25792 BasicBlock *BB, BoUpSLP &R) {
25793 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
25794 "This function only accepts Insert instructions");
25795 bool OpsChanged = false;
25796 SmallVector<WeakTrackingVH> PostponedInsts;
25797 for (auto *I : reverse(Instructions)) {
25798 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
25799 if (R.isDeleted(I) || isa<CmpInst>(I))
25800 continue;
25801 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25802 OpsChanged |=
25803 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
25804 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25805 OpsChanged |=
25806 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
25807 }
25808 // pass2 - try to vectorize reductions only
25809 if (R.isDeleted(I))
25810 continue;
25811 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
25812 if (R.isDeleted(I) || isa<CmpInst>(I))
25813 continue;
25814 // pass3 - try to match and vectorize a buildvector sequence.
25815 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
25816 OpsChanged |=
25817 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
25818 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
25819 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
25820 /*MaxVFOnly=*/false);
25821 }
25822 }
25823 // Now try to vectorize postponed instructions.
25824 OpsChanged |= tryToVectorize(PostponedInsts, R);
25825
25826 Instructions.clear();
25827 return OpsChanged;
25828}
25829
25830bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
25831 bool Changed = false;
25832 SmallVector<Value *, 4> Incoming;
25833 SmallPtrSet<Value *, 16> VisitedInstrs;
25834 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
25835 // node. This helps to better identify the chains that can be vectorized in
25836 // a more optimal way.
25837 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
25838 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
25839 assert(isValidElementType(V1->getType()) &&
25840 isValidElementType(V2->getType()) &&
25841 "Expected vectorizable types only.");
25842 if (V1 == V2)
25843 return false;
25844 // It is fine to compare type IDs here, since we expect only vectorizable
25845 // types, like ints, floats and pointers; we don't care about other types.
25846 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
25847 return true;
25848 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
25849 return false;
25850 if (V1->getType()->getScalarSizeInBits() <
25851 V2->getType()->getScalarSizeInBits())
25852 return true;
25853 if (V1->getType()->getScalarSizeInBits() >
25854 V2->getType()->getScalarSizeInBits())
25855 return false;
25856 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
25857 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
25858 if (Opcodes1.size() < Opcodes2.size())
25859 return true;
25860 if (Opcodes1.size() > Opcodes2.size())
25861 return false;
25862 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
25863 {
25864 // Instructions come first.
25865 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
25866 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
25867 if (I1 && I2) {
25868 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
25869 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
25870 if (!NodeI1)
25871 return NodeI2 != nullptr;
25872 if (!NodeI2)
25873 return false;
25874 assert((NodeI1 == NodeI2) ==
25875 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25876 "Different nodes should have different DFS numbers");
25877 if (NodeI1 != NodeI2)
25878 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25879 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
25880 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
25881 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
25882 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
25883 if (!E1 || !E2)
25884 continue;
25885
25886 // Sort ExtractElementInsts primarily by their vector operands. Prefer
25887 // program order of the vector operands.
25888 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
25889 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
25890 if (V1 != V2) {
25891 if (V1 && !V2)
25892 return true;
25893 if (!V1 && V2)
25894 return false;
25895 DomTreeNodeBase<BasicBlock> *NodeI1 =
25896 DT->getNode(V1->getParent());
25897 DomTreeNodeBase<BasicBlock> *NodeI2 =
25898 DT->getNode(V2->getParent());
25899 if (!NodeI1)
25900 return NodeI2 != nullptr;
25901 if (!NodeI2)
25902 return false;
25903 assert((NodeI1 == NodeI2) ==
25904 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
25905 "Different nodes should have different DFS numbers");
25906 if (NodeI1 != NodeI2)
25907 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
25908 return V1->comesBefore(V2);
25909 }
25910 // If we have the same vector operand, try to sort by constant
25911 // index.
25912 std::optional<unsigned> Id1 = getExtractIndex(E1);
25913 std::optional<unsigned> Id2 = getExtractIndex(E2);
25914 // Bring constants to the top
25915 if (Id1 && !Id2)
25916 return true;
25917 if (!Id1 && Id2)
25918 return false;
25919 // First elements come first.
25920 if (Id1 && Id2)
25921 return *Id1 < *Id2;
25922
25923 continue;
25924 }
25925 if (I1->getOpcode() == I2->getOpcode())
25926 continue;
25927 return I1->getOpcode() < I2->getOpcode();
25928 }
25929 if (I1)
25930 return true;
25931 if (I2)
25932 return false;
25933 }
25934 {
25935 // Non-undef constants come next.
25936 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
25937 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
25938 if (C1 && C2)
25939 continue;
25940 if (C1)
25941 return true;
25942 if (C2)
25943 return false;
25944 }
25945 bool U1 = isa<UndefValue>(Opcodes1[I]);
25946 bool U2 = isa<UndefValue>(Opcodes2[I]);
25947 {
25948 // Non-constant non-instructions come next.
25949 if (!U1 && !U2) {
25950 auto ValID1 = Opcodes1[I]->getValueID();
25951 auto ValID2 = Opcodes2[I]->getValueID();
25952 if (ValID1 == ValID2)
25953 continue;
25954 if (ValID1 < ValID2)
25955 return true;
25956 if (ValID1 > ValID2)
25957 return false;
25958 }
25959 if (!U1)
25960 return true;
25961 if (!U2)
25962 return false;
25963 }
25964 // Undefs come last.
25965 assert(U1 && U2 && "The only thing left should be undef & undef.");
25966 }
25967 return false;
25968 };
25969 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
25970 if (V1 == V2)
25971 return true;
25972 if (V1->getType() != V2->getType())
25973 return false;
25974 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
25975 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
25976 if (Opcodes1.size() != Opcodes2.size())
25977 return false;
25978 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
25979 // Undefs are compatible with any other value.
25980 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
25981 continue;
25982 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
25983 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
25984 if (R.isDeleted(I1) || R.isDeleted(I2))
25985 return false;
25986 if (I1->getParent() != I2->getParent())
25987 return false;
25988 if (getSameOpcode({I1, I2}, *TLI))
25989 continue;
25990 return false;
25991 }
25992 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
25993 continue;
25994 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
25995 return false;
25996 }
25997 return true;
25998 };
25999
26000 bool HaveVectorizedPhiNodes = false;
26001 do {
26002 // Collect the incoming values from the PHIs.
26003 Incoming.clear();
26004 for (Instruction &I : *BB) {
26005 auto *P = dyn_cast<PHINode>(&I);
26006 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26007 break;
26008
26009 // No need to analyze deleted, vectorized and non-vectorizable
26010 // instructions.
26011 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26012 isValidElementType(P->getType()))
26013 Incoming.push_back(P);
26014 }
26015
26016 if (Incoming.size() <= 1)
26017 break;
26018
26019 // Find the corresponding non-phi nodes for better matching when trying to
26020 // build the tree.
26021 for (Value *V : Incoming) {
26022 SmallVectorImpl<Value *> &Opcodes =
26023 PHIToOpcodes.try_emplace(V).first->getSecond();
26024 if (!Opcodes.empty())
26025 continue;
26026 SmallVector<Value *, 4> Nodes(1, V);
26027 SmallPtrSet<PHINode *, 4> Visited;
26028 while (!Nodes.empty()) {
26029 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26030 if (!Visited.insert(PHI).second)
26031 continue;
26032 for (Value *V : PHI->incoming_values()) {
26033 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26034 Nodes.push_back(PHI1);
26035 continue;
26036 }
26037 Opcodes.emplace_back(V);
26038 }
26039 }
26040 }
26041
26042 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26043 Incoming, PHICompare, AreCompatiblePHIs,
26044 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26045 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26046 },
26047 /*MaxVFOnly=*/true, R);
26048 Changed |= HaveVectorizedPhiNodes;
26049 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26050 auto *PHI = dyn_cast<PHINode>(P.first);
26051 return !PHI || R.isDeleted(PHI);
26052 }))
26053 PHIToOpcodes.clear();
26054 VisitedInstrs.insert_range(Incoming);
26055 } while (HaveVectorizedPhiNodes);
26056
26057 VisitedInstrs.clear();
26058
26059 InstSetVector PostProcessInserts;
26060 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26061 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26062 // also vectorizes `PostProcessCmps`.
26063 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26064 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26065 if (VectorizeCmps) {
26066 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26067 PostProcessCmps.clear();
26068 }
26069 PostProcessInserts.clear();
26070 return Changed;
26071 };
26072 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26073 auto IsInPostProcessInstrs = [&](Instruction *I) {
26074 if (auto *Cmp = dyn_cast<CmpInst>(I))
26075 return PostProcessCmps.contains(Cmp);
26076 return isa<InsertElementInst, InsertValueInst>(I) &&
26077 PostProcessInserts.contains(I);
26078 };
26079 // Returns true if `I` is an instruction without users, like a terminator, a
26080 // store, or a function call with an ignored return value. Unused non-void
26081 // instructions are ignored, except for CallInst and InvokeInst.
26082 auto HasNoUsers = [](Instruction *I) {
26083 return I->use_empty() &&
26084 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26085 };
26086 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26087 // Skip instructions with a scalable type. The number of elements is unknown
26088 // at compile time for scalable types.
26089 if (isa<ScalableVectorType>(It->getType()))
26090 continue;
26091
26092 // Skip instructions marked for deletion.
26093 if (R.isDeleted(&*It))
26094 continue;
26095 // We may go through BB multiple times, so skip the ones we have already checked.
26096 if (!VisitedInstrs.insert(&*It).second) {
26097 if (HasNoUsers(&*It) &&
26098 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26099 // We would like to start over since some instructions are deleted
26100 // and the iterator may become an invalid value.
26101 Changed = true;
26102 It = BB->begin();
26103 E = BB->end();
26104 }
26105 continue;
26106 }
26107
26108 // Try to vectorize reductions that use PHINodes.
26109 if (PHINode *P = dyn_cast<PHINode>(It)) {
26110 // Check that the PHI is a reduction PHI.
26111 if (P->getNumIncomingValues() == 2) {
26112 // Try to match and vectorize a horizontal reduction.
26113 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26114 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26115 Changed = true;
26116 It = BB->begin();
26117 E = BB->end();
26118 continue;
26119 }
26120 }
26121 // Try to vectorize the incoming values of the PHI, to catch reductions
26122 // that feed into PHIs.
26123 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26124 // Skip if the incoming block is the current BB for now. Also, bypass
26125 // unreachable IR for efficiency and to avoid crashing.
26126 // TODO: Collect the skipped incoming values and try to vectorize them
26127 // after processing BB.
26128 if (BB == P->getIncomingBlock(I) ||
26129 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26130 continue;
26131
26132 // Postponed instructions should not be vectorized here, delay their
26133 // vectorization.
26134 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26135 PI && !IsInPostProcessInstrs(PI)) {
26136 bool Res =
26137 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26138 Changed |= Res;
26139 if (Res && R.isDeleted(P)) {
26140 It = BB->begin();
26141 E = BB->end();
26142 break;
26143 }
26144 }
26145 }
26146 continue;
26147 }
26148
26149 if (HasNoUsers(&*It)) {
26150 bool OpsChanged = false;
26151 auto *SI = dyn_cast<StoreInst>(It);
26152 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26153 if (SI) {
26154 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26155 // Try to vectorize the chain in the store, if this is the only store to
26156 // the address in the block.
26157 // TODO: This is just a temporary solution to save compile time. Need
26158 // to investigate if we can safely turn on slp-vectorize-hor-store
26159 // instead to allow lookup for reduction chains in all non-vectorized
26160 // stores (need to check side effects and compile time).
26161 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26162 SI->getValueOperand()->hasOneUse();
26163 }
26164 if (TryToVectorizeRoot) {
26165 for (auto *V : It->operand_values()) {
26166 // Postponed instructions should not be vectorized here, delay their
26167 // vectorization.
26168 if (auto *VI = dyn_cast<Instruction>(V);
26169 VI && !IsInPostProcessInstrs(VI))
26170 // Try to match and vectorize a horizontal reduction.
26171 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26172 }
26173 }
26174 // Start vectorization of post-process list of instructions from the
26175 // top-tree instructions to try to vectorize as many instructions as
26176 // possible.
26177 OpsChanged |=
26178 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26179 if (OpsChanged) {
26180 // We would like to start over since some instructions are deleted
26181 // and the iterator may become an invalid value.
26182 Changed = true;
26183 It = BB->begin();
26184 E = BB->end();
26185 continue;
26186 }
26187 }
26188
26189 if (isa<InsertElementInst, InsertValueInst>(It))
26190 PostProcessInserts.insert(&*It);
26191 else if (isa<CmpInst>(It))
26192 PostProcessCmps.insert(cast<CmpInst>(&*It));
26193 }
26194
26195 return Changed;
26196}
26197
26198bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26199 auto Changed = false;
26200 for (auto &Entry : GEPs) {
26201 // If the getelementptr list has fewer than two elements, there's nothing
26202 // to do.
26203 if (Entry.second.size() < 2)
26204 continue;
26205
26206 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26207 << Entry.second.size() << ".\n");
26208
26209 // Process the GEP list in chunks suitable for the target's supported
26210 // vector size. If a vector register can't hold 1 element, we are done. We
26211 // are trying to vectorize the index computations, so the maximum number of
26212 // elements is based on the size of the index expression, rather than the
26213 // size of the GEP itself (the target's pointer size).
26214 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26215 return !R.isDeleted(GEP);
26216 });
26217 if (It == Entry.second.end())
26218 continue;
26219 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26220 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26221 if (MaxVecRegSize < EltSize)
26222 continue;
26223
26224 unsigned MaxElts = MaxVecRegSize / EltSize;
26225 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26226 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26227 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26228
26229 // Initialize a set of candidate getelementptrs. Note that we use a
26230 // SetVector here to preserve program order. If the index computations
26231 // are vectorizable and begin with loads, we want to minimize the chance
26232 // of having to reorder them later.
26233 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26234
26235 // Some of the candidates may have already been vectorized after we
26236 // initially collected them, or their index was optimized to a constant value.
26237 // If so, they are marked as deleted, so remove them from the set of
26238 // candidates.
26239 Candidates.remove_if([&R](Value *I) {
26240 return R.isDeleted(cast<Instruction>(I)) ||
26241 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26242 });
26243
26244 // Remove from the set of candidates all pairs of getelementptrs with
26245 // constant differences. Such getelementptrs are likely not good
26246 // candidates for vectorization in a bottom-up phase since one can be
26247 // computed from the other. We also ensure all candidate getelementptr
26248 // indices are unique.
26249 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26250 auto *GEPI = GEPList[I];
26251 if (!Candidates.count(GEPI))
26252 continue;
26253 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26254 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26255 auto *GEPJ = GEPList[J];
26256 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26257 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26258 Candidates.remove(GEPI);
26259 Candidates.remove(GEPJ);
26260 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26261 Candidates.remove(GEPJ);
26262 }
26263 }
26264 }
26265
26266 // We break out of the above computation as soon as we know there are
26267 // fewer than two candidates remaining.
26268 if (Candidates.size() < 2)
26269 continue;
26270
26271 // Add the single, non-constant index of each candidate to the bundle. We
26272 // ensured the indices met these constraints when we originally collected
26273 // the getelementptrs.
26274 SmallVector<Value *, 16> Bundle(Candidates.size());
26275 auto BundleIndex = 0u;
26276 for (auto *V : Candidates) {
26277 auto *GEP = cast<GetElementPtrInst>(V);
26278 auto *GEPIdx = GEP->idx_begin()->get();
26279 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26280 Bundle[BundleIndex++] = GEPIdx;
26281 }
26282
26283 // Try and vectorize the indices. We are currently only interested in
26284 // gather-like cases of the form:
26285 //
26286 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26287 //
26288 // where the loads of "a", the loads of "b", and the subtractions can be
26289 // performed in parallel. It's likely that detecting this pattern in a
26290 // bottom-up phase will be simpler and less costly than building a
26291 // full-blown top-down phase beginning at the consecutive loads.
26292 Changed |= tryToVectorizeList(Bundle, R);
26293 }
26294 }
26295 return Changed;
26296}
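// Sketch of the bundle formed above for the gather-like pattern from the
// comment: given
//   %i0 = sub i64 %a0, %b0
//   %i1 = sub i64 %a1, %b1
//   %g0 = getelementptr inbounds float, ptr %g, i64 %i0
//   %g1 = getelementptr inbounds float, ptr %g, i64 %i1
// the single non-constant indices {%i0, %i1} are bundled and handed to
// tryToVectorizeList, so the subtractions (and the loads feeding them) can be
// vectorized even though the gather loads through the GEPs stay scalar.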
26297
26298bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26299 bool Changed = false;
26300 // Sort by type, base pointers and value operands. Value operands must be
26301 // compatible (have the same opcode, same parent), otherwise it is
26302 // definitely not profitable to try to vectorize them.
26303 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26304 if (V->getValueOperand()->getType()->getTypeID() <
26305 V2->getValueOperand()->getType()->getTypeID())
26306 return true;
26307 if (V->getValueOperand()->getType()->getTypeID() >
26308 V2->getValueOperand()->getType()->getTypeID())
26309 return false;
26310 if (V->getPointerOperandType()->getTypeID() <
26311 V2->getPointerOperandType()->getTypeID())
26312 return true;
26313 if (V->getPointerOperandType()->getTypeID() >
26314 V2->getPointerOperandType()->getTypeID())
26315 return false;
26316 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26317 V2->getValueOperand()->getType()->getScalarSizeInBits())
26318 return true;
26319 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26320 V2->getValueOperand()->getType()->getScalarSizeInBits())
26321 return false;
26322 // UndefValues are compatible with all other values.
26323 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26324 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26325 DomTreeNodeBase<BasicBlock> *NodeI1 =
26326 DT->getNode(I1->getParent());
26327 DomTreeNodeBase<BasicBlock> *NodeI2 =
26328 DT->getNode(I2->getParent());
26329 assert(NodeI1 && "Should only process reachable instructions");
26330 assert(NodeI2 && "Should only process reachable instructions");
26331 assert((NodeI1 == NodeI2) ==
26332 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26333 "Different nodes should have different DFS numbers");
26334 if (NodeI1 != NodeI2)
26335 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26336 return I1->getOpcode() < I2->getOpcode();
26337 }
26338 return V->getValueOperand()->getValueID() <
26339 V2->getValueOperand()->getValueID();
26340 };
26341
26342 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
26343 if (V1 == V2)
26344 return true;
26345 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26346 return false;
26347 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26348 return false;
26349 // Undefs are compatible with any other value.
26350 if (isa<UndefValue>(V1->getValueOperand()) ||
26351 isa<UndefValue>(V2->getValueOperand()))
26352 return true;
26353 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
26354 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26355 if (I1->getParent() != I2->getParent())
26356 return false;
26357 return getSameOpcode({I1, I2}, *TLI).valid();
26358 }
26359 if (isa<Constant>(V1->getValueOperand()) &&
26360 isa<Constant>(V2->getValueOperand()))
26361 return true;
26362 return V1->getValueOperand()->getValueID() ==
26363 V2->getValueOperand()->getValueID();
26364 };
26365
26366 // Attempt to sort and vectorize each of the store-groups.
26367 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26368 for (auto &Pair : Stores) {
26369 if (Pair.second.size() < 2)
26370 continue;
26371
26372 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26373 << Pair.second.size() << ".\n");
26374
26375 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26376 continue;
26377
26378 // Reverse stores to do bottom-to-top analysis. This is important if the
26379 // values are stored to the same addresses several times; in this case we
26380 // need to follow the store order (reversed to meet the memory dependencies).
26381 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26382 Pair.second.rend());
26383 Changed |= tryToVectorizeSequence<StoreInst>(
26384 ReversedStores, StoreSorter, AreCompatibleStores,
26385 [&](ArrayRef<StoreInst *> Candidates, bool) {
26386 return vectorizeStores(Candidates, R, Attempted);
26387 },
26388 /*MaxVFOnly=*/false, R);
26389 }
26390 return Changed;
26391}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:919
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:194
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
Early If Converter
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
Definition: ExpandFp.cpp:597
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1451
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isStridedLoad(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, const bool IsAnyPointerUsedOutGraph, const int64_t Diff)
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates a key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the argument types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using Generator or a default shuffle.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
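Several of the static helpers listed above (reorderReuses, reorderOrder, addMask) revolve around composing one shuffle mask or ordering with another. The snippet below is a minimal illustrative sketch of that composition idea only, not the pass's actual addMask implementation; composeMasks is a hypothetical name, and -1 stands for a poison lane.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Compose two masks so that applying the result is equivalent to applying
// Mask first, then SubMask. Negative (poison) lanes stay poison.
static void composeMasks(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
  SmallVector<int> NewMask(SubMask.size());
  for (unsigned I = 0, E = SubMask.size(); I != E; ++I)
    NewMask[I] = SubMask[I] < 0 ? -1 : Mask[SubMask[I]];
  Mask.swap(NewMask);
}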
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:480
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:247
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:210
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:83
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another single input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another single input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another single input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
A private abstract base class describing the concept of an individual alias analysis implementation.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1406
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1666
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1396
void negate()
Negate this APInt in place.
Definition: APInt.h:1468
unsigned logBase2() const
Definition: APInt.h:1761
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
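The cost model uses APInt bit sets for DemandedElts-style lane masks. Below is a small self-contained sketch of the typical bit-manipulation calls listed above, assuming an 8-lane mask; demandedEltsSketch is an illustrative name only.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void demandedEltsSketch() {
  APInt DemandedElts = APInt::getAllOnes(/*numBits=*/8); // all 8 lanes demanded
  DemandedElts.clearBit(3);                              // lane 3 is not demanded
  assert(!DemandedElts.isAllOnes() && !DemandedElts.isZero());
  assert(DemandedElts.getBitWidth() == 8);
  APInt OneLane = APInt::getOneBitSet(8, /*BitNo=*/2);   // only lane 2 demanded
  assert(OneLane.isPowerOf2());
}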
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:431
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:183
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:156
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:224
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:200
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:150
iterator end() const
Definition: ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:206
iterator begin() const
Definition: ArrayRef.h:135
ArrayRef< T > take_back(size_t N=1) const
Return a copy of *this with only the last N elements.
Definition: ArrayRef.h:231
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:191
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
Definition: ArrayRef.h:162
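ArrayRef views are used pervasively for scalar bundles (VL) and shuffle masks. A short sketch of the slicing operations listed above, over an assumed local buffer; arrayRefSketch is an illustrative name only.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;

void arrayRefSketch() {
  SmallVector<int, 8> Mask = {0, 1, 2, 3, 4, 5, 6, 7};
  ArrayRef<int> Ref(Mask);
  assert(Ref.front() == 0 && Ref.back() == 7 && !Ref.empty());
  ArrayRef<int> Head = Ref.take_front(4); // {0, 1, 2, 3}
  ArrayRef<int> Tail = Ref.drop_front(6); // {6, 7}
  ArrayRef<int> Mid = Ref.slice(2, 3);    // {2, 3, 4}
  assert(Head.size() == 4 && Tail.size() == 2 && Mid.size() == 3);
}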
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
iterator end()
Definition: BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:172
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
reverse_iterator rend()
Definition: BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
Definition: BasicBlock.cpp:406
size_t size() const
Definition: BasicBlock.h:480
InstListType::const_reverse_iterator const_reverse_iterator
Definition: BasicBlock.h:173
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:707
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2010
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1905
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2148
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2004
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1205
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1283
unsigned arg_size() const
Definition: InstrTypes.h:1290
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1506
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2001
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:448
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:666
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:829
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:791
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:767
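The isCmpSameOrSwapped helper above relies on these predicate transforms. A minimal sketch of how the swapped and inverse predicates relate for a signed comparison; predicateSketch is an illustrative name only.
#include "llvm/IR/InstrTypes.h"
#include <cassert>
using namespace llvm;

void predicateSketch() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;                     // a < b
  assert(CmpInst::getSwappedPredicate(P) == CmpInst::ICMP_SGT); // b > a
  assert(CmpInst::getInversePredicate(P) == CmpInst::ICMP_SGE); // !(a < b)
}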
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2314
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
Definition: Constants.cpp:2694
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:868
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:875
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:163
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1474
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:88
A debug info location.
Definition: DebugLoc.h:124
static DebugLoc getUnknown()
Definition: DebugLoc.h:162
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:104
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:203
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:177
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:245
bool erase(const KeyT &Val)
Definition: DenseMap.h:319
unsigned size() const
Definition: DenseMap.h:120
bool empty() const
Definition: DenseMap.h:119
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:173
iterator end()
Definition: DenseMap.h:87
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:221
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:168
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:230
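DenseMap backs most scalar-to-entry style lookups in this pass. A small sketch of the insertion and lookup calls listed above; the key/value types and the name denseMapSketch are illustrative only.
#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

void denseMapSketch() {
  DenseMap<int, unsigned> LaneOfKey;
  LaneOfKey.try_emplace(42, 0u);        // inserts only if the key is absent
  auto Res = LaneOfKey.insert({7, 1u}); // pair of iterator and "inserted" flag
  assert(Res.second && LaneOfKey.lookup(42) == 0u);
  assert(LaneOfKey.count(7) == 1 && !LaneOfKey.contains(13));
  assert(LaneOfKey.find(7) != LaneOfKey.end());
}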
Implements a dense probed hash-table based set.
Definition: DenseSet.h:263
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:334
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:135
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
void set()
Definition: FMF.h:61
bool allowReassoc() const
Flag queries.
Definition: FMF.h:64
bool allowContract() const
Definition: FMF.h:69
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:857
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:949
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:547
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2559
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:488
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:202
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2637
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2238
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:201
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:345
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:247
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:862
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1809
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:823
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:815
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:507
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2508
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1708
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:196
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1883
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2439
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1651
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1437
LLVM_ABI CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:538
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
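The code-generation half of the pass builds its vectors through IRBuilder calls like those above. Below is a hedged sketch of a broadcast built with CreateShuffleVector; broadcastFirstLane is a hypothetical helper, not part of the pass.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Broadcast lane 0 of Vec to VF lanes by shuffling with an all-zero mask.
static Value *broadcastFirstLane(IRBuilderBase &Builder, Value *Vec,
                                 unsigned VF) {
  SmallVector<int> Mask(VF, 0); // <0, 0, ..., 0>
  return Builder.CreateShuffleVector(Vec, Mask, "broadcast");
}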
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:321
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:808
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
bool isShift() const
Definition: Instruction.h:320
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:318
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Definition: Instructions.h:180
Value * getPointerOperand()
Definition: Instructions.h:259
bool isSimple() const
Definition: Instructions.h:251
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:215
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:570
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:67
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:48
iterator find(const KeyT &Key)
Definition: MapVector.h:141
bool empty() const
Definition: MapVector.h:75
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:107
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:99
size_type size() const
Definition: MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:79
void clear()
Definition: MapVector.h:84
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
T & front() const
front - Get the first element.
Definition: ArrayRef.h:354
iterator end() const
Definition: ArrayRef.h:348
iterator begin() const
Definition: ArrayRef.h:347
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:381
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:454
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:99
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:720
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:168
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1885
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:59
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:90
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:104
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:149
void insert_range(Range &&R)
Definition: SetVector.h:193
Vector takeVector()
Clear the SetVector and return the underlying vector.
Definition: SetVector.h:93
void clear()
Completely clear the SetVector.
Definition: SetVector.h:284
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:168
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:269
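SetVector/SmallSetVector provide the deterministic, duplicate-free worklists this pass depends on. A short sketch of the calls listed above; the element type and the name setVectorSketch are illustrative only.
#include "llvm/ADT/SetVector.h"
#include <cassert>
using namespace llvm;

void setVectorSketch() {
  SmallSetVector<int, 8> Worklist; // unique elements, insertion order preserved
  Worklist.insert(3);
  Worklist.insert(1);
  Worklist.insert(3);              // duplicate, ignored
  assert(Worklist.size() == 2 && Worklist.front() == 3);
  assert(Worklist.contains(1) && !Worklist.contains(7));
}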
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
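These static mask classifiers let the cost model map a concrete mask onto a TTI::ShuffleKind. A small sketch for a 4-element source; the mask values and the name maskKindSketch are examples only.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void maskKindSketch() {
  SmallVector<int, 4> Identity = {0, 1, 2, 3};
  SmallVector<int, 4> Reverse = {3, 2, 1, 0};
  SmallVector<int, 2> Extract = {2, 3};
  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));
  int Index = 0;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                   Index) &&
         Index == 2);
}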
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
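SmallBitVector backs the UseMask/OpcodeMask style bit sets used above. A brief sketch of the query calls, assuming an 8-lane mask; bitVectorSketch is an illustrative name only.
#include "llvm/ADT/SmallBitVector.h"
#include <cassert>
using namespace llvm;

void bitVectorSketch() {
  SmallBitVector UseMask(8); // 8 bits, initially all clear
  UseMask.set(2);
  UseMask.set(5);
  assert(UseMask.any() && !UseMask.all() && !UseMask.none());
  assert(UseMask.count() == 2 && UseMask.test(5));
  assert(UseMask.find_first() == 2 && UseMask.find_next(2) == 5);
}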
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:418
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:470
iterator end() const
Definition: SmallPtrSet.h:499
void insert_range(Range &&R)
Definition: SmallPtrSet.h:490
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
iterator begin() const
Definition: SmallPtrSet.h:494
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:356
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:176
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:227
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:182
size_type size() const
Definition: SmallSet.h:171
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:969
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
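SmallVector/SmallVectorImpl provide nearly every scratch buffer in this file (orders, masks, operand lists). A tiny sketch of the usual pattern of passing SmallVectorImpl so callers can pick the inline size; appendIota and smallVectorSketch are hypothetical names.
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;

static void appendIota(SmallVectorImpl<int> &Out, int N) {
  Out.reserve(Out.size() + N);
  for (int I = 0; I < N; ++I)
    Out.push_back(I);
}

void smallVectorSketch() {
  SmallVector<int, 16> Order; // stays inline for up to 16 elements
  appendIota(Order, 8);
  assert(Order.size() == 8 && Order.front() == 0 && Order.back() == 7);
}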
An instruction for storing to memory.
Definition: Instructions.h:296
Type * getPointerOperandType() const
Definition: Instructions.h:389
Value * getValueOperand()
Definition: Instructions.h:383
Value * getPointerOperand()
Definition: Instructions.h:386
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:35
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const
Return true if the target supports masked load.
LLVM_ABI bool preferAlternateOpcodeVectorization() const
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
LLVM_ABI bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
LLVM_ABI bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
LLVM_ABI InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
LLVM_ABI unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
LLVM_ABI InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
LLVM_ABI bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
LLVM_ABI unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI unsigned getNumberOfParts(Type *Tp) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
LLVM_ABI InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
LLVM_ABI InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const
OperandValueKind
Additional information about an operand's possible values.
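The cost model asks TargetTransformInfo questions like those above to decide whether a shuffle-based lowering pays off. Below is a hedged sketch of such a query, following the getShuffleCost signature listed here; reverseCheaperThanBroadcast is a hypothetical helper and assumes a valid TTI for the target.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static bool reverseCheaperThanBroadcast(const TargetTransformInfo &TTI,
                                        FixedVectorType *VecTy) {
  TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;
  // Query the target cost of a reverse shuffle vs. a broadcast of VecTy.
  InstructionCost RevCost = TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                               VecTy, VecTy, /*Mask=*/{}, Kind);
  InstructionCost BcastCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_Broadcast, VecTy, VecTy, /*Mask=*/{}, Kind);
  return RevCost < BcastCost;
}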
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:246
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:296
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:270
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
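Type queries such as these gate which bundles the vectorizer will even consider (see isValidElementType above). A compact sketch; typeSketch is an illustrative name only.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

void typeSketch() {
  LLVMContext Ctx;
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *V4I32 = FixedVectorType::get(I32, /*NumElts=*/4);
  assert(I32->isIntegerTy() && !I32->isVectorTy());
  assert(V4I32->isVectorTy() && V4I32->getScalarType() == I32);
  assert(V4I32->getScalarSizeInBits() == 32); // element width for vector types
}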
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
op_range operands()
Definition: User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:119
op_iterator op_begin()
Definition: User.h:284
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
iterator_range< value_op_iterator > operand_values()
Definition: User.h:316
The Vector Function Database.
Definition: VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:74
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
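A minimal usage sketch (the helper name replaceAndErase is hypothetical, not part of this pass): replaceAllUsesWith rewrites every user of a value in one step, whereas User::replaceUsesOfWith above only rewrites the operand slots of a single user.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Sketch: OldI is about to be replaced by NewV. After RAUW the old
// instruction has no remaining uses and can be removed.
static void replaceAndErase(Instruction *OldI, Value *NewV) {
  OldI->replaceAllUsesWith(NewV);
  OldI->eraseFromParent();
}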
iterator_range< user_iterator > users()
Definition: Value.h:426
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
bool use_empty() const
Definition: Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1098
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:265
iterator_range< use_iterator > uses()
Definition: Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:396
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
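A small sketch of the factory together with the accessors listed nearby (the function name vectorTypeExample is hypothetical):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Sketch: build the fixed-width vector type <4 x i32> and query it back.
static void vectorTypeExample(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  VectorType *VecTy = VectorType::get(I32, ElementCount::getFixed(4));
  unsigned NumElts = VecTy->getElementCount().getFixedValue(); // 4
  Type *EltTy = VecTy->getElementType();                       // i32
  (void)NumElts;
  (void)EltTy;
}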
Type * getElementType() const
Definition: DerivedTypes.h:463
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:205
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:163
void insert_range(Range &&R)
Definition: DenseSet.h:222
size_type size() const
Definition: DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:169
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:174
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:76
const ParentTy * getParent() const
Definition: ilist_node.h:34
self_iterator getIterator()
Definition: ilist_node.h:134
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:662
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:692
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if any of the given values is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:108
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ Entry
Definition: COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1572
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
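A sketch of the pattern-matching style used throughout this file (the helper matchShlOr and its bound names are hypothetical): match drives the matcher expressions such as m_Or, m_Shl, m_Value, and m_APInt listed in this block.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Sketch: recognize V == (X << C) | Y; on success X and Y are bound to the
// operands and C to the shift amount. (m_c_Or would also accept the
// commuted operand order.)
static bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  return match(V, m_Or(m_Shl(m_Value(X), m_APInt(C)), m_Value(Y)));
}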
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:862
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
@ GS
Definition: X86.h:213
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
Definition: DenseMapInfo.h:41
DiagnosticInfoOptimizationBase::Argument NV
LLVM_ABI const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
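A brief sketch of drop_begin (the helper sumAllButFirst is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Sketch: visit every element except the first, e.g. when the first value
// plays a special role (a root) and the rest are processed uniformly.
static int sumAllButFirst(ArrayRef<int> Vals) {
  int Sum = 0;
  for (int V : drop_begin(Vals))
    Sum += V;
  return Sum;
}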
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1313
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
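A brief sketch of zip (the helper dotProduct is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Sketch: walk two sequences in lockstep; zip stops at the shorter range.
static int dotProduct(ArrayRef<int> A, ArrayRef<int> B) {
  int Sum = 0;
  for (auto [X, Y] : zip(A, B))
    Sum += X * Y;
  return Sum;
}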
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1770
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1764
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1737
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
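A brief sketch of the range-based predicate wrappers (the helper allHaveOpcode is hypothetical; any_of and none_of, listed further below, work the same way):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Sketch: "is every value an instruction with the given opcode?"
static bool allHaveOpcode(ArrayRef<Value *> VL, unsigned Opcode) {
  return all_of(VL, [Opcode](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && I->getOpcode() == Opcode;
  });
}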
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:137
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1023
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
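A brief sketch of enumerate (the helper firstNegativeLane is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Sketch: pair each element with its index without a manual counter.
static int firstNegativeLane(ArrayRef<int> Scalars) {
  for (auto [Lane, S] : enumerate(Scalars))
    if (S < 0)
      return static_cast<int>(Lane);
  return -1;
}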
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7502
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1723
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2250
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
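A brief sketch of make_early_inc_range combined with isInstructionTriviallyDead from further below (the helper dropDeadInstructions is hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Sketch: erase trivially dead instructions while walking a block; the
// early-increment adaptor advances past I before the body runs, so erasing
// I does not invalidate the traversal.
static void dropDeadInstructions(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}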
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:551
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition: STLExtras.h:2000
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:295
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
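A brief sketch of the power-of-two rounding helpers (the function name powerOfTwoRounding is hypothetical; bit_floor is listed further below):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Sketch: bit_ceil(5u) == 8, bit_floor(5u) == 4, PowerOf2Ceil(5) == 8.
static void powerOfTwoRounding() {
  unsigned Up = llvm::bit_ceil(5u);    // smallest power of two >= 5
  unsigned Down = llvm::bit_floor(5u); // largest power of two <= 5
  uint64_t Up64 = PowerOf2Ceil(5);     // 64-bit helper with the same idea
  (void)Up;
  (void)Down;
  (void)Up64;
}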
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2147
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1987
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
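A brief sketch of the two shuffle-mask builders (the function name shuffleMaskExamples is hypothetical); both only produce index vectors:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

// Sketch:
//   createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4)   -> <0, 2, 4, 6>
//   createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3) -> <0, 0, 1, 1, 2, 2>
static void shuffleMaskExamples() {
  SmallVector<int, 16> Strided = createStrideMask(0, 2, 4);
  SmallVector<int, 16> Replicated = createReplicatedMask(2, 3);
  (void)Strided;
  (void)Replicated;
}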
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1782
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition: Loads.cpp:431
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:288
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition: STLExtras.h:1444
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
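A hedged sketch of how getPointersDiff is typically used (the helper areConsecutiveLoads is hypothetical): a distance of exactly one element means the two accesses are consecutive in memory.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// Sketch: two loads of the same element type are consecutive if their
// pointers differ by exactly one element.
static bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                                ScalarEvolution &SE) {
  Type *ElemTy = A->getType();
  std::optional<int64_t> Diff =
      getPointersDiff(ElemTy, A->getPointerOperand(), ElemTy,
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}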
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition: STLExtras.h:1939
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1393
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1094
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
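A brief sketch of the round-up arithmetic helpers divideCeil (listed above) and alignTo (the function name roundUpExamples is hypothetical):

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Sketch: divideCeil(10, 4) == 3 and alignTo(10, Align(4)) == 12, the next
// multiple of the alignment.
static void roundUpExamples() {
  unsigned Parts = divideCeil(10u, 4u);
  uint64_t Bytes = alignTo(10, Align(4));
  (void)Parts;
  (void)Bytes;
}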
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1973
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2049
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:443
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition: STLExtras.h:1454
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts into which the type VecTy will be split at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
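A brief sketch of seq (the helper sumLaneIndices is hypothetical):

#include "llvm/ADT/Sequence.h"
using namespace llvm;

// Sketch: seq(0, NumLanes) visits 0, 1, ..., NumLanes - 1; the upper bound
// is excluded.
static int sumLaneIndices(int NumLanes) {
  int Sum = 0;
  for (int Lane : seq<int>(0, NumLanes))
    Sum += Lane;
  return Sum;
}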
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:595
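A brief sketch of hash_combine (the helper hashEdgeKey is hypothetical), folding several fields into one hash_code much like the DenseMapInfo getHashValue specialization for BoUpSLP::EdgeInfo further below:

#include "llvm/ADT/Hashing.h"
using namespace llvm;

// Sketch: combine a pointer and an index into a single hash_code.
static hash_code hashEdgeKey(const void *UserTE, unsigned EdgeIdx) {
  return hash_combine(UserTE, EdgeIdx);
}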
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2107
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:280
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:469
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2169
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:54
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:217
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:249
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1472
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1481
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.