LLVM: lib/Target/NVPTX/NVPTXSubtarget.h Source File

//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This file declares the NVPTX specific subclass of TargetSubtargetInfo.

//

//===----------------------------------------------------------------------===//


#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H

#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H


#include "NVPTX.h"

#include "NVPTXFrameLowering.h"

#include "NVPTXISelLowering.h"

#include "NVPTXInstrInfo.h"

#include "NVPTXRegisterInfo.h"

#include "llvm/CodeGen/TargetSubtargetInfo.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/Support/NVPTXAddrSpace.h"

#include <string>


#define GET_SUBTARGETINFO_HEADER

#include "NVPTXGenSubtargetInfo.inc"


namespace llvm {


class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
  // Virtual method declared without a body; its definition is not part of
  // this header.
  virtual void anchor();

  // Name of the selected target. May be empty, in which case getTargetName()
  // substitutes a default.
  std::string TargetName;

  // PTX version x.y, stored as 10*x+y (so 3.1 is 31).
  unsigned PTXVersion;

  // Full SM version x.y, stored as 100*x+10*y+feature (so 3.1 is 310). The
  // last digit encodes the variant suffix: the switch statements below list
  // sm_100a as 1003, and hasFamilySpecificFeatures() treats a last digit of 2
  // as the 'f' variant.
  unsigned int FullSmVersion;

  // SM version x.y without the feature digit, stored as 10*x+y (so 3.1 is
  // 31). Derived from FullSmVersion.
  unsigned int SmVersion;

  NVPTXInstrInfo InstrInfo;
  NVPTXTargetLowering TLInfo;
  std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

  // NVPTX has no call stack frame, but TargetFrameLowering is abstract, so an
  // NVPTX-specific FrameLowering class is still needed.
  NVPTXFrameLowering FrameLowering;

public:
  /// Initializes the data members to match those of the specified module.
  NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                 const std::string &FS, const NVPTXTargetMachine &TM);

  ~NVPTXSubtarget() override;

  // --- Accessors for the per-subtarget helper objects held by value above ---

  const TargetFrameLowering *getFrameLowering() const override {
    return &FrameLowering;
  }

  const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }

  // The register info is reached through the instruction info object.
  const NVPTXRegisterInfo *getRegisterInfo() const override {
    const auto &RegInfo = InstrInfo.getRegisterInfo();
    return &RegInfo;
  }

  const NVPTXTargetLowering *getTargetLowering() const override {
    return &TLInfo;
  }

  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

  // --- Feature predicates ---
  // Unless noted otherwise, each predicate below is a plain threshold test on
  // SmVersion and/or PTXVersion, using the encodings documented on those
  // members.

  // Only the global address space qualifies; on top of that the SM version
  // must be at least 10.0 and the PTX version at least 8.8.
  bool has256BitVectorLoadStore(unsigned AS) const {
    if (AS != NVPTXAS::ADDRESS_SPACE_GLOBAL)
      return false;
    return SmVersion >= 100 && PTXVersion >= 88;
  }

  bool hasAtomAddF64() const { return SmVersion >= 60; }
  bool hasAtomScope() const { return SmVersion >= 60; }
  bool hasAtomBitwise64() const { return SmVersion >= 32; }
  bool hasAtomMinMax64() const { return SmVersion >= 32; }
  bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
  bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
  bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
  bool hasLDG() const { return SmVersion >= 32; }
  bool hasHWROT32() const { return SmVersion >= 32; }
  bool hasFP16Math() const { return SmVersion >= 53; }
  bool hasBF16Math() const { return SmVersion >= 80; }
  bool allowFP16Math() const;
  bool hasMaskOperator() const { return PTXVersion >= 71; }
  bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }

  // Do the SM and PTX versions support memory orderings (weak and atomic:
  // relaxed, acquire, release, acq_rel, sc)?
  bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }

  // Do the SM and PTX versions support the .acquire and .release qualifiers
  // for fence?
  bool hasSplitAcquireAndReleaseFences() const {
    return SmVersion >= 90 && PTXVersion >= 86;
  }

  // Do the SM and PTX versions support atomic relaxed MMIO operations?
  bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }

  bool hasDotInstructions() const {
    return SmVersion >= 61 && PTXVersion >= 50;
  }

  // Tcgen05 instructions in the Blackwell family. Only the listed full SM
  // versions qualify, each with its own minimum PTX version.
  bool hasTcgen05Instructions() const {
    switch (FullSmVersion) {
    case 1003: // sm_100a
    case 1013: // sm_101a
      return PTXVersion >= 86;
    case 1033: // sm_103a
      return PTXVersion >= 88;
    default:
      return false;
    }
  }

  // f32x2 instructions in the Blackwell family.
  bool hasF32x2Instructions() const;

  // TMA G2S copy with cta_group::1/2 support.
  bool hasCpAsyncBulkTensorCTAGroupSupport() const {
    // TODO: Update/tidy-up after the family-conditional support arrives
    unsigned RequiredPTXVersion = 0;
    switch (FullSmVersion) {
    case 1003:
    case 1013:
      RequiredPTXVersion = 86;
      break;
    case 1033:
      RequiredPTXVersion = 88;
      break;
    default:
      // Any other full SM version lacks the feature regardless of PTX version.
      return false;
    }
    return PTXVersion >= RequiredPTXVersion;
  }

  // Before CUDA 12.3, ptxas did not treat the trap instruction as a
  // basic-block terminator and assumed that control flow continued into the
  // next instruction, which could belong to the block lexically below it.
  // That created phantom CFG edges inside ptxas. The issue was fixed in CUDA
  // 12.3, so for PTX ISA versions 8.3 and later we can confidently say that
  // the bug is not present.
  bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }

  bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }

  unsigned int getFullSmVersion() const { return FullSmVersion; }

  // Drops the trailing feature digit from the full SM version.
  unsigned int getSmVersion() const { return FullSmVersion / 10; }

  // GPUs with the "a" suffix have architecture-accelerated features that are
  // supported on the specified architecture only, so such targets do not
  // follow the onion layer model. hasArchAccelFeatures() distinguishes those
  // GPU variants from the base GPU architecture:
  // - false represents a non-accelerated architecture.
  // - true represents an architecture-accelerated variant.
  // The test is on the lowest bit of the full SM version, combined with a
  // PTX version of at least 8.0.
  bool hasArchAccelFeatures() const {
    const bool LowBitSet = (FullSmVersion & 1) != 0;
    return LowBitSet && PTXVersion >= 80;
  }

  // GPUs with the 'f' suffix have architecture-accelerated features which are
  // portable across all future architectures under the same SM major. For
  // example, sm_100f features will work for sm_10X*f*/sm_10X*a* future
  // architectures.
  // - false represents a non-family-specific architecture.
  // - true represents a family-specific variant.
  // A feature digit of 2 requires PTX 8.8 or later; every other encoding
  // defers to hasArchAccelFeatures().
  bool hasFamilySpecificFeatures() const {
    if (FullSmVersion % 10 == 2)
      return PTXVersion >= 88;
    return hasArchAccelFeatures();
  }

  // If the user did not provide a target we default to the `sm_30` target.
  std::string getTargetName() const {
    if (hasTargetName())
      return TargetName;
    return "sm_30";
  }

  bool hasTargetName() const { return !TargetName.empty(); }

  bool hasNativeBF16Support(int Opcode) const;

  // Maximum required alignment among the supported data types.
  // From the PTX ISA doc, section 8.2.3:
  //  The memory consistency model relates operations executed on memory
  //  locations with scalar data-types, which have a maximum size and alignment
  //  of 64 bits. Memory operations with a vector data-type are modelled as a
  //  set of equivalent memory operations with a scalar data-type, executed in
  //  an unspecified order on the elements in the vector.
  unsigned getMaxRequiredAlignment() const { return 8; }

  // Smallest cmpxchg word size, in bits, that the hardware supports.
  unsigned getMinCmpXchgSizeInBits() const { return 32; }

  unsigned getPTXVersion() const { return PTXVersion; }

  NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

  void failIfClustersUnsupported(const std::string &FailureMessage) const;
};


} // End llvm namespace


#endif

DataLayout.h

NVPTXAddrSpace.h
NVPTX address space definition.

NVPTXFrameLowering.h

NVPTXISelLowering.h

NVPTXInstrInfo.h

NVPTXRegisterInfo.h

NVPTX.h

TargetSubtargetInfo.h

NVPTXGenSubtargetInfo

llvm::NVPTXFrameLowering
Definition NVPTXFrameLowering.h:21

llvm::NVPTXInstrInfo
Definition NVPTXInstrInfo.h:26

llvm::NVPTXRegisterInfo
Definition NVPTXRegisterInfo.h:24

llvm::NVPTXSubtarget
Definition NVPTXSubtarget.h:31

llvm::NVPTXSubtarget::hasCpAsyncBulkTensorCTAGroupSupport
bool hasCpAsyncBulkTensorCTAGroupSupport() const
Definition NVPTXSubtarget.h:129

llvm::NVPTXSubtarget::getInstrInfo
const NVPTXInstrInfo * getInstrInfo() const override
Definition NVPTXSubtarget.h:66

llvm::NVPTXSubtarget::failIfClustersUnsupported
void failIfClustersUnsupported(std::string const &FailureMessage) const
Definition NVPTXSubtarget.cpp:115

llvm::NVPTXSubtarget::getTargetName
std::string getTargetName() const
Definition NVPTXSubtarget.h:172

llvm::NVPTXSubtarget::getMaxRequiredAlignment
unsigned getMaxRequiredAlignment() const
Definition NVPTXSubtarget.h:186

llvm::NVPTXSubtarget::hasAtomMinMax64
bool hasAtomMinMax64() const
Definition NVPTXSubtarget.h:83

llvm::NVPTXSubtarget::hasAtomAddF64
bool hasAtomAddF64() const
Definition NVPTXSubtarget.h:80

llvm::NVPTXSubtarget::hasHWROT32
bool hasHWROT32() const
Definition NVPTXSubtarget.h:88

llvm::NVPTXSubtarget::hasSplitAcquireAndReleaseFences
bool hasSplitAcquireAndReleaseFences() const
Definition NVPTXSubtarget.h:98

llvm::NVPTXSubtarget::hasClusters
bool hasClusters() const
Definition NVPTXSubtarget.h:86

llvm::NVPTXSubtarget::hasMaskOperator
bool hasMaskOperator() const
Definition NVPTXSubtarget.h:92

llvm::NVPTXSubtarget::getTargetLowering
const NVPTXTargetLowering * getTargetLowering() const override
Definition NVPTXSubtarget.h:70

llvm::NVPTXSubtarget::ParseSubtargetFeatures
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)

llvm::NVPTXSubtarget::getMinCmpXchgSizeInBits
unsigned getMinCmpXchgSizeInBits() const
Definition NVPTXSubtarget.h:188

llvm::NVPTXSubtarget::getPTXVersion
unsigned getPTXVersion() const
Definition NVPTXSubtarget.h:190

llvm::NVPTXSubtarget::hasCvtaParam
bool hasCvtaParam() const
Definition NVPTXSubtarget.h:150

llvm::NVPTXSubtarget::~NVPTXSubtarget
~NVPTXSubtarget() override

llvm::NVPTXSubtarget::hasNativeBF16Support
bool hasNativeBF16Support(int Opcode) const
Definition NVPTXSubtarget.cpp:83

llvm::NVPTXSubtarget::getRegisterInfo
const NVPTXRegisterInfo * getRegisterInfo() const override
Definition NVPTXSubtarget.h:67

llvm::NVPTXSubtarget::getFullSmVersion
unsigned int getFullSmVersion() const
Definition NVPTXSubtarget.h:151

llvm::NVPTXSubtarget::getSmVersion
unsigned int getSmVersion() const
Definition NVPTXSubtarget.h:152

llvm::NVPTXSubtarget::hasDotInstructions
bool hasDotInstructions() const
Definition NVPTXSubtarget.h:103

llvm::NVPTXSubtarget::hasFamilySpecificFeatures
bool hasFamilySpecificFeatures() const
Definition NVPTXSubtarget.h:167

llvm::NVPTXSubtarget::hasTcgen05Instructions
bool hasTcgen05Instructions() const
Definition NVPTXSubtarget.h:107

llvm::NVPTXSubtarget::hasAtomBitwise64
bool hasAtomBitwise64() const
Definition NVPTXSubtarget.h:82

llvm::NVPTXSubtarget::hasRelaxedMMIO
bool hasRelaxedMMIO() const
Definition NVPTXSubtarget.h:102

llvm::NVPTXSubtarget::hasTargetName
bool hasTargetName() const
Definition NVPTXSubtarget.h:175

llvm::NVPTXSubtarget::hasBF16Math
bool hasBF16Math() const
Definition NVPTXSubtarget.h:90

llvm::NVPTXSubtarget::hasAtomSwap128
bool hasAtomSwap128() const
Definition NVPTXSubtarget.h:85

llvm::NVPTXSubtarget::hasLDG
bool hasLDG() const
Definition NVPTXSubtarget.h:87

llvm::NVPTXSubtarget::hasF32x2Instructions
bool hasF32x2Instructions() const
Definition NVPTXSubtarget.cpp:79

llvm::NVPTXSubtarget::allowFP16Math
bool allowFP16Math() const
Definition NVPTXSubtarget.cpp:75

llvm::NVPTXSubtarget::getFrameLowering
const TargetFrameLowering * getFrameLowering() const override
Definition NVPTXSubtarget.h:63

llvm::NVPTXSubtarget::hasAtomScope
bool hasAtomScope() const
Definition NVPTXSubtarget.h:81

llvm::NVPTXSubtarget::hasAtomCas16
bool hasAtomCas16() const
Definition NVPTXSubtarget.h:84

llvm::NVPTXSubtarget::NVPTXSubtarget
NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM)
This constructor initializes the data members to match that of the specified module.
Definition NVPTXSubtarget.cpp:60

llvm::NVPTXSubtarget::hasMemoryOrdering
bool hasMemoryOrdering() const
Definition NVPTXSubtarget.h:96

llvm::NVPTXSubtarget::hasArchAccelFeatures
bool hasArchAccelFeatures() const
Definition NVPTXSubtarget.h:159

llvm::NVPTXSubtarget::initializeSubtargetDependencies
NVPTXSubtarget & initializeSubtargetDependencies(StringRef CPU, StringRef FS)
Definition NVPTXSubtarget.cpp:41

llvm::NVPTXSubtarget::getSelectionDAGInfo
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
Definition NVPTXSubtarget.cpp:71

llvm::NVPTXSubtarget::has256BitVectorLoadStore
bool has256BitVectorLoadStore(unsigned AS) const
Definition NVPTXSubtarget.h:76

llvm::NVPTXSubtarget::hasPTXASUnreachableBug
bool hasPTXASUnreachableBug() const
Definition NVPTXSubtarget.h:149

llvm::NVPTXSubtarget::hasFP16Math
bool hasFP16Math() const
Definition NVPTXSubtarget.h:89

llvm::NVPTXSubtarget::hasNoReturn
bool hasNoReturn() const
Definition NVPTXSubtarget.h:93

llvm::NVPTXTargetLowering
Definition NVPTXISelLowering.h:111

llvm::NVPTXTargetMachine
NVPTXTargetMachine.
Definition NVPTXTargetMachine.h:25

llvm::SelectionDAGTargetInfo
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
Definition SelectionDAGTargetInfo.h:33

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55

llvm::TargetFrameLowering
Information about stack frame layout on the target.
Definition TargetFrameLowering.h:46

llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47

llvm::NVPTXAS::ADDRESS_SPACE_GLOBAL
@ ADDRESS_SPACE_GLOBAL
Definition NVPTXAddrSpace.h:23

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18