LLVM: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp Source File

//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file This pass replaces accesses to kernel arguments with loads from

/// offsets from the kernarg base pointer.

//

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "GCNSubtarget.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/TargetPassConfig.h"

#include "llvm/IR/Attributes.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/MDBuilder.h"

#include "llvm/Target/TargetMachine.h"


#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"


using namespace llvm;


namespace {


/// Legacy pass manager entry point for kernel argument lowering. The actual
/// rewrite lives in the file-local lowerKernelArguments() helper; this class
/// only fetches the TargetMachine and forwards to it.
class AMDGPULowerKernelArguments : public FunctionPass {
public:
  /// Pass identification; its address is handed to the FunctionPass base.
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  /// Requires TargetPassConfig (runOnFunction obtains the TargetMachine from
  /// it) and reports every analysis as preserved.
  // NOTE(review): the new pass manager entry point in this file only preserves
  // the CFG analyses after a change -- confirm that preserving everything here
  // is intended.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    AU.addRequired<TargetPassConfig>();
  }

  bool runOnFunction(Function &F) override;
};


} // end anonymous namespace


/// Return the position in \p BB where the kernarg loads should be emitted:
/// the first insertion point of the block, advanced past any leading run of
/// static allocas. Returns BB.end() if nothing but static allocas follows.
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator It = BB.getFirstInsertionPt();
  const BasicBlock::iterator End = BB.end();

  while (It != End) {
    const auto *Alloca = dyn_cast<AllocaInst>(&*It);

    // Stop at the first non-alloca. Also stop at a dynamic alloca: its size
    // may depend on the loaded kernargs, so the loads have to be inserted
    // before it.
    if (Alloca == nullptr || !Alloca->isStaticAlloca())
      return It;

    ++It;
  }

  return It;
}


/// Replace the uses of the explicit arguments of kernel \p F with values
/// loaded from the kernarg segment.
///
/// A call to llvm.amdgcn.kernarg.segment.ptr is emitted at the insertion point
/// chosen by getInsertPt(). For every argument the byte offset inside the
/// segment is computed from the running, ABI-aligned argument offset plus the
/// subtarget's explicit kernel argument offset, and then:
///  - byref arguments have their uses replaced by a pointer into the segment;
///  - arguments narrower than 32 bits (non-aggregate) are read through a
///    dword-aligned i32 load followed by shift/trunc/bitcast;
///  - 3-element vectors of at least 32 bits are loaded as 4-element vectors
///    and shuffled back down;
///  - everything else is loaded directly with its own type.
/// Unused arguments, inreg arguments, noalias pointer arguments, and
/// LDS/region pointers on subtargets without a usable DS offset are left
/// untouched.
///
/// \param F  Function to rewrite; anything that is not an AMDGPU_KERNEL with
///           at least one argument is ignored.
/// \param TM Target machine used to look up the GCN subtarget of \p F.
/// \returns true if the kernarg segment pointer call was inserted (i.e. the
///          IR changed), false otherwise.
static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  // The metadata builder only depends on the context, so create it once
  // instead of once per argument.
  MDBuilder MDB(Ctx);

  uint64_t ExplicitArgOffset = 0;
  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    // The running offset is advanced for every argument, including the ones
    // skipped below, so that later arguments keep their position.
    const uint64_t AlignedArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign);
    uint64_t EltOffset = AlignedArgOffset + BaseOffset;
    ExplicitArgOffset = AlignedArgOffset + AllocSize;

    // Skip inreg arguments which should be preloaded.
    if (Arg.use_empty() || Arg.hasInRegAttr())
      continue;

    // If this is byref, the loads are already explicit in the function. We just
    // need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr =
          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      // TODO: Update this for GFX12 which does have scalar sub-dword loads.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = FixedVectorType::get(VT->getElementType(), 4);
    }

    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    if (Arg.hasAttribute(Attribute::NoUndef))
      Load->setMetadata(LLVMContext::MD_noundef, MDNode::get(Ctx, {}));

    // !range operands must have the scalar type of the annotated load. On the
    // sub-dword path the load has been widened to i32 while the attribute's
    // bounds are expressed in the narrower argument type, and the extra bits
    // read from neighbouring bytes are not constrained by it, so the range
    // cannot be transferred there.
    if (!DoShiftOpt && Arg.hasAttribute(Attribute::Range)) {
      const ConstantRange &Range =
          Arg.getAttribute(Attribute::Range).getValueAsConstantRange();
      Load->setMetadata(LLVMContext::MD_range,
                        MDB.createRange(Range.getLower(), Range.getUpper()));
    }

    // Carry the pointer-specific argument attributes over to the load.
    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable,
          MDNode::get(Ctx,
                      MDB.createConstant(
                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable_or_null,
          MDNode::get(Ctx,
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                          DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      // Shift the argument's bytes down to bit 0 of the loaded dword, then
      // narrow to the argument's width and reinterpret as its real type.
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      // Drop the padding lane of the widened 4-element load.
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}


/// Legacy pass manager hook: fetch the TargetMachine through the required
/// TargetPassConfig analysis and run the shared lowering on \p F.
/// \returns whatever lowerKernelArguments() reports (true if \p F changed).
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  const TargetMachine &TM =
      getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  return lowerKernelArguments(F, TM);
}


INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,

                      "AMDGPU Lower Kernel Arguments", false, false)

INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",

                    false, false)


char AMDGPULowerKernelArguments::ID = 0;


/// Factory for the legacy pass manager version of this pass. Returns a newly
/// heap-allocated pass instance.
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  FunctionPass *NewPass = new AMDGPULowerKernelArguments();
  return NewPass;
}


/// New pass manager entry point: run the shared lowering on \p F using the
/// pass's TM member. If nothing changed, every analysis is preserved;
/// otherwise only the CFG analyses set is reported as preserved.
PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
  if (!lowerKernelArguments(F, TM))
    return PreservedAnalyses::all();

  // TODO: Preserves a lot more.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

Arguments
AMDGPU Lower Kernel Arguments
Definition: AMDGPULowerKernelArguments.cpp:253

getInsertPt
static BasicBlock::iterator getInsertPt(BasicBlock &BB)
Definition: AMDGPULowerKernelArguments.cpp:47

lowerKernelArguments
static bool lowerKernelArguments(Function &F, const TargetMachine &TM)
Definition: AMDGPULowerKernelArguments.cpp:61

DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPULowerKernelArguments.cpp:24

AMDGPU.h

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: ARMSLSHardening.cpp:73

Attributes.h
This file contains the simple types necessary to represent the attributes associated with functions a...

Size
uint64_t Size
Definition: ELFObjHandler.cpp:81

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

IRBuilder.h

F
#define F(x, y, z)
Definition: MD5.cpp:55

MDBuilder.h

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:44

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:39

TargetPassConfig.h
Target-Independent Code Generator Pass Configuration Options pass.

ValueTracking.h

PointerType
Definition: ItaniumDemangle.h:639

VectorType
Definition: ItaniumDemangle.h:1189

llvm::AMDGPULowerKernelArgumentsPass::run
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
Definition: AMDGPULowerKernelArguments.cpp:263

llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:64

llvm::AllocaInst::isStaticAlloca
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1299

llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:48

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:76

llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:131

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41

llvm::Attribute::getWithDereferenceableBytes
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
Definition: Attributes.cpp:244

llvm::Attribute::getWithAlignment
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234

llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:62

llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:472

llvm::BasicBlock::getFirstInsertionPt
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:393

llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170

llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73

llvm::CallBase::addRetAttr
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Definition: InstrTypes.h:1491

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1510

llvm::ConstantRange
This class represents a range of values.
Definition: ConstantRange.h:47

llvm::ConstantRange::getLower
const APInt & getLower() const
Return the lower value for this range.
Definition: ConstantRange.h:209

llvm::ConstantRange::getUpper
const APInt & getUpper() const
Return the upper value for this range.
Definition: ConstantRange.h:212

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:314

llvm::FunctionPass::runOnFunction
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.

llvm::Function
Definition: Function.h:64

llvm::GCNSubtarget
Definition: GCNSubtarget.h:34

llvm::IRBuilderBase::getIntNTy
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575

llvm::IRBuilderBase::CreateAlignedLoad
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864

llvm::IRBuilderBase::CreateLShr
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1513

llvm::IRBuilderBase::getInt32Ty
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:562

llvm::IRBuilderBase::getInt64Ty
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:567

llvm::IRBuilderBase::CreateIntrinsic
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834

llvm::IRBuilderBase::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204

llvm::IRBuilderBase::CreateShuffleVector
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593

llvm::IRBuilderBase::CreateTrunc
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068

llvm::IRBuilderBase::CreateConstInBoundsGEP1_64
Value * CreateConstInBoundsGEP1_64(Type *Ty, Value *Ptr, uint64_t Idx0, const Twine &Name="")
Definition: IRBuilder.h:1993

llvm::IRBuilderBase::getInt8Ty
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:552

llvm::IRBuilderBase::CreateAddrSpaceCast
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2209

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780

llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:42

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:180

llvm::MDBuilder
Definition: MDBuilder.h:37

llvm::MDBuilder::createConstant
LLVM_ABI ConstantAsMetadata * createConstant(Constant *C)
Return the given constant as metadata.
Definition: MDBuilder.cpp:25

llvm::MDBuilder::createRange
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:96

llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1565

llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:112

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118

llvm::PreservedAnalyses::preserveSet
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83

llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45

llvm::Type::isAggregateType
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:304

llvm::Value
LLVM Value Representation.
Definition: Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256

llvm::Value::replaceAllUsesWith
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322

uint64_t

unsigned

TargetMachine.h

false
Definition: MachinePipeliner.cpp:239

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPUAddrSpace.h:32

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPUAddrSpace.h:34

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24

llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200

llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:66

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::alignDown
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:551

llvm::HexPrintStyle::Lower
@ Lower

llvm::createAMDGPULowerKernelArgumentsPass
FunctionPass * createAMDGPULowerKernelArgumentsPass()
Definition: AMDGPULowerKernelArguments.cpp:258

llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155

llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39

llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117