[AMDGPU] Lower enqueued blocks and generate runtime metadata

This patch adds a post-linking pass which replaces the function pointer of each
enqueued block kernel with a global variable (the runtime handle) and adds a
"runtime-handle" attribute to the enqueued block kernel.

In LLVM CodeGen the runtime-handle metadata will be translated to
RuntimeHandle metadata in code object. Runtime allocates a global buffer
for each kernel with RuntimeHandle metadata and saves the kernel address
required for the AQL packet into the buffer. __enqueue_kernel function
in device library knows that the invoke function pointer in the block
literal is actually runtime handle and loads the kernel address from it
and puts it into AQL packet for dispatching.

This cannot be done in FE since FE cannot create a unique global variable
with external linkage across LLVM modules. The global variable with internal
linkage does not work since optimization passes will try to replace loads
of the global variable with its initialization value.

Differential Revision: https://reviews.llvm.org/D38610

llvm-svn: 315352
This commit is contained in:
Yaxun Liu 2017-10-10 19:39:48 +00:00
parent 0f9e889881
commit de4b88d9a1
10 changed files with 266 additions and 6 deletions

View File

@ -930,6 +930,16 @@ non-AMD key names should be prefixed by "*vendor-name*.".
Corresponds to the OpenCL
``vec_type_hint`` attribute.
"RuntimeHandle" string The external symbol name
associated with a kernel.
OpenCL runtime allocates a
global buffer for the symbol
and saves the kernel's address
to it, which is used for
device side enqueueing. Only
available for device side
enqueued kernels.
=================== ============== ========= ==============================
..

View File

@ -115,6 +115,8 @@ constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize";
constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint";
/// \brief Key for Kernel::Attr::Metadata::mVecTypeHint.
constexpr char VecTypeHint[] = "VecTypeHint";
/// \brief Key for Kernel::Attr::Metadata::mRuntimeHandle.
constexpr char RuntimeHandle[] = "RuntimeHandle";
} // end namespace Key
/// \brief In-memory representation of kernel attributes metadata.
@ -125,15 +127,17 @@ struct Metadata final {
std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>();
/// \brief 'vec_type_hint' attribute. Optional.
std::string mVecTypeHint = std::string();
/// \brief External symbol created by runtime to store the kernel address
/// for enqueued blocks.
std::string mRuntimeHandle = std::string();
/// \brief Default constructor.
Metadata() = default;
/// \returns True if kernel attributes metadata is empty, false otherwise.
bool empty() const {
return mReqdWorkGroupSize.empty() &&
mWorkGroupSizeHint.empty() &&
mVecTypeHint.empty();
return mReqdWorkGroupSize.empty() && mWorkGroupSizeHint.empty() &&
mVecTypeHint.empty() && mRuntimeHandle.empty();
}
/// \returns True if kernel attributes metadata is not empty, false otherwise.

View File

@ -96,6 +96,8 @@ struct MappingTraits<Kernel::Attrs::Metadata> {
MD.mWorkGroupSizeHint, std::vector<uint32_t>());
YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint,
MD.mVecTypeHint, std::string());
YIO.mapOptional(Kernel::Attrs::Key::RuntimeHandle, MD.mRuntimeHandle,
std::string());
}
};

View File

@ -186,6 +186,10 @@ void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
Pass *createAMDGPUFunctionInliningPass();
void initializeAMDGPUInlinerPass(PassRegistry&);
ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
Target &getTheAMDGPUTarget();
Target &getTheGCNTarget();

View File

@ -0,0 +1,98 @@
//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// \brief This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
// In LLVM CodeGen the runtime-handle metadata will be translated to
// RuntimeHandle metadata in code object. Runtime allocates a global buffer
// for each kernel with RuntimeHandle metadata and saves the kernel address
// required for the AQL packet into the buffer. __enqueue_kernel function
// in device library knows that the invoke function pointer in the block
// literal is actually runtime handle and loads the kernel address from it
// and puts it into AQL packet for dispatching.
//
// This cannot be done in FE since FE cannot create a unique global variable
// with external linkage across LLVM modules. The global variable with internal
// linkage does not work since optimization passes will try to replace loads
// of the global variable with its initialization value.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
using namespace llvm;
namespace {

/// \brief Module pass that lowers OpenCL enqueued blocks.
///
/// The class only carries the pass identity; the transformation itself is
/// implemented in the out-of-line runOnModule() definition.
class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
public:
  /// Pass identification member; handed to the ModulePass base on
  /// construction.
  static char ID;

  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}

private:
  /// Entry point invoked on the module \p M. Returns true when the module was
  /// modified.
  bool runOnModule(Module &M) override;
};

} // end anonymous namespace
// Out-of-class definition of the pass's static ID member.
char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
// Reference to the same ID object, published in the llvm namespace so code
// outside this file can name the pass even though the class itself lives in
// an anonymous namespace.
char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
AMDGPUOpenCLEnqueuedBlockLowering::ID;
// Register the pass under the DEBUG_TYPE string
// ("amdgpu-lower-enqueued-block") with a human-readable description.
// NOTE(review): the trailing `false, false` are presumably the CFG-only and
// is-analysis flags of INITIALIZE_PASS - confirm against PassSupport.h.
INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
"Lower OpenCL enqueued blocks", false, false)
/// \brief Factory for the enqueued-block lowering pass.
///
/// \returns a newly heap-allocated AMDGPUOpenCLEnqueuedBlockLowering instance,
/// typed as its ModulePass base.
ModulePass *llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
  auto *LoweringPass = new AMDGPUOpenCLEnqueuedBlockLowering();
  return LoweringPass;
}
/// \brief Replace the address of every "enqueued-block" kernel in \p M with a
/// runtime handle.
///
/// For each function carrying the "enqueued-block" attribute whose only use
/// is a constant expression that is in turn used exactly once by another
/// constant expression (a bitcast wrapped in an address-space cast in the
/// accompanying test), this:
///   - creates an external, externally-initialized constant global named
///     "<kernel>_runtime_handle" in the global address space,
///   - redirects all uses of the outer constant expression to a pointer cast
///     of that global,
///   - records the handle name in a "runtime-handle" function attribute, and
///   - gives the kernel external linkage.
/// Functions that do not match this use pattern are left untouched.
///
/// \returns true if at least one kernel was rewritten.
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
  auto &Ctx = M.getContext();
  auto AddrSpaces = AMDGPU::getAMDGPUAS(M);
  bool Modified = false;

  for (auto &Kernel : M.functions()) {
    if (!Kernel.hasFnAttribute("enqueued-block"))
      continue;

    // Require the chain kernel -> constant expr -> constant expr, where each
    // link is the sole use of the previous value. The checks are performed in
    // this order so that user_begin() is only dereferenced once a use is
    // known to exist.
    if (!Kernel.hasOneUse())
      continue;
    if (!Kernel.user_begin()->hasOneUse())
      continue;
    if (!isa<ConstantExpr>(*Kernel.user_begin()))
      continue;
    if (!isa<ConstantExpr>(*Kernel.user_begin()->user_begin()))
      continue;

    auto *InnerCast = cast<ConstantExpr>(*Kernel.user_begin());
    auto *OuterCast = cast<ConstantExpr>(*InnerCast->user_begin());

    // The handle is a declaration only: it has no initializer and is marked
    // externally initialized.
    const auto HandleName = (Kernel.getName() + "_runtime_handle").str();
    auto *HandleTy =
        Type::getInt8Ty(Ctx)->getPointerTo(AddrSpaces.GLOBAL_ADDRESS);
    auto *Handle = new GlobalVariable(
        M, HandleTy,
        /*IsConstant=*/true, GlobalValue::ExternalLinkage,
        /*Initializer=*/nullptr, HandleName, /*InsertBefore=*/nullptr,
        GlobalValue::NotThreadLocal, AddrSpaces.GLOBAL_ADDRESS,
        /*IsExternallyInitialized=*/true);
    DEBUG(dbgs() << "runtime handle created: " << *Handle << '\n');

    // Everything that consumed the kernel's casted address now sees the
    // handle, cast to the same pointer type.
    OuterCast->replaceAllUsesWith(
        ConstantExpr::getPointerCast(Handle, OuterCast->getType()));

    Kernel.addFnAttr("runtime-handle", HandleName);
    Kernel.setLinkage(GlobalValue::ExternalLinkage);
    Modified = true;
  }

  return Modified;
}

View File

@ -161,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@ -610,6 +611,9 @@ void AMDGPUPassConfig::addIRPasses() {
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createInferAddressSpacesPass());
addPass(createAMDGPUPromoteAlloca());

View File

@ -39,6 +39,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUOpenCLImageTypeLoweringPass.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegAsmNames.inc.cpp

View File

@ -244,6 +244,10 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
}
if (Func.hasFnAttribute("runtime-handle")) {
Attrs.mRuntimeHandle =
Func.getFnAttribute("runtime-handle").getValueAsString().str();
}
}
void MetadataStreamer::emitKernelArgs(const Function &Func) {

View File

@ -14,6 +14,8 @@
%struct.B = type { i32 addrspace(1)*}
%opencl.clk_event_t = type opaque
@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
; CHECK: ---
; CHECK: Version: [ 1, 0 ]
; CHECK: Printf:
@ -1197,6 +1199,44 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
ret void
}
; CHECK: - Name: __test_block_invoke_kernel
; CHECK-NEXT: Language: OpenCL C
; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
; CHECK-NEXT: Attrs:
; CHECK-NEXT: RuntimeHandle: __test_block_invoke_kernel_runtime_handle
; CHECK-NEXT: Args:
; CHECK-NEXT: - Size: 25
; CHECK-NEXT: Align: 1
; CHECK-NEXT: ValueKind: ByValue
; CHECK-NEXT: ValueType: Struct
; CHECK-NEXT: AccQual: Default
; CHECK-NEXT: TypeName: __block_literal
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
; Kernel that takes a single packed block-literal struct by value and does
; nothing. Attribute group #1 gives it a "runtime-handle" attribute, which the
; expectations preceding this definition require to show up as
; Attrs.RuntimeHandle in the emitted kernel metadata.
define amdgpu_kernel void @__test_block_invoke_kernel(
<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #1
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
!kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
ret void
}
attributes #1 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
!llvm.printf.fmts = !{!100, !101}
!1 = !{i32 0}
@ -1250,13 +1290,14 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
!94 = !{!"", !"", !"", !"", !"", !"", !""}
!100 = !{!"1:1:4:%d\5Cn"}
!101 = !{!"2:1:8:%g\5Cn"}
!110 = !{!"__block_literal"}
; NOTES: Displaying notes found at file offset 0x{{[0-9]+}}
; NOTES-NEXT: Owner Data size Description
; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001)
; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003)
; GFX700: AMD 0x00008b0a Unknown note type: (0x0000000a)
; GFX800: AMD 0x00008e6e Unknown note type: (0x0000000a)
; GFX900: AMD 0x00008b0a Unknown note type: (0x0000000a)
; GFX700: AMD 0x00008f64 Unknown note type: (0x0000000a)
; GFX800: AMD 0x000092e4 Unknown note type: (0x0000000a)
; GFX900: AMD 0x00008f64 Unknown note type: (0x0000000a)
; PARSER: AMDGPU Code Object Metadata Parser Test: PASS

View File

@ -0,0 +1,92 @@
; RUN: opt -amdgpu-lower-enqueued-block -S < %s | FileCheck %s
; CHECK: @__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
; CHECK: @__test_block_invoke_2_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn-amdhsa-amd-opencl"
%struct.ndrange_t = type { i32 }
%opencl.queue_t = type opaque
; Test driver kernel. It builds two block literals on the stack and hands each
; one to __enqueue_kernel_basic. Field 2 of each literal (the "invoke" slot) is
; stored from an addrspacecast(bitcast(@kernel)) constant expression; that
; constant expression is the single use chain of the enqueued block kernel.
define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
%block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
%tmp = alloca %struct.ndrange_t, align 4
%block2 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, align 8
%tmp3 = alloca %struct.ndrange_t, align 4
; First literal: size 25, align 8, invoke = @__test_block_invoke_kernel,
; captured values %a (field 3) and %b (field 4).
%block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 0
store i32 25, i32* %block.size, align 8
%block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 1
store i32 8, i32* %block.align, align 4
%block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 2
store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 8
%block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 3
store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured, align 8
%block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 4
store i8 %b, i8* %block.captured1, align 8
; Enqueue the first block: the literal's address is cast to a generic
; (addrspace 4) i8 pointer and passed as the last argument.
%tmp1 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block to void ()*
%tmp2 = bitcast void ()* %tmp1 to i8*
%tmp4 = addrspacecast i8* %tmp2 to i8 addrspace(4)*
%tmp5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp, i8 addrspace(4)* nonnull %tmp4) #2
; Second literal: size 41, align 8, invoke = @__test_block_invoke_2_kernel,
; captured values %a (field 3), %c (field 4), %d (field 5) and %b (field 6).
%block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 0
store i32 41, i32* %block.size4, align 8
%block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 1
store i32 8, i32* %block.align5, align 4
%block.invoke6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 2
store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)* @__test_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke6, align 8
%block.captured7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 3
store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured7, align 8
%block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 6
store i8 %b, i8* %block.captured8, align 8
%block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 4
store i64 addrspace(1)* %c, i64 addrspace(1)** %block.captured9, align 8
%block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 5
store i64 %d, i64* %block.captured10, align 8
; Enqueue the second block in the same way as the first.
%tmp6 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2 to void ()*
%tmp7 = bitcast void ()* %tmp6 to i8*
%tmp8 = addrspacecast i8* %tmp7 to i8 addrspace(4)*
%tmp9 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp3, i8 addrspace(4)* nonnull %tmp8) #2
ret void
}
; CHECK: define amdgpu_kernel void @__test_block_invoke_kernel({{.*}}) #[[AT1:[0-9]+]]
; First enqueued block kernel. It starts out with internal linkage and the
; "enqueued-block" attribute (group #0); the expectation line preceding it
; requires the output definition to have no `internal` keyword and to carry a
; new attribute group. The body stores the captured i8 (field 4) through the
; captured global pointer (field 3).
define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0
!kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
entry:
%.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 3
%.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 4
store i8 %.fca.4.extract, i8 addrspace(1)* %.fca.3.extract, align 1
ret void
}
declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t*, i8 addrspace(4)*) local_unnamed_addr
; CHECK: define amdgpu_kernel void @__test_block_invoke_2_kernel({{.*}}) #[[AT2:[0-9]+]]
; Second enqueued block kernel, also internal and tagged "enqueued-block"
; (group #0). Its block literal has two extra captures: the body stores the
; captured i8 (field 6) through the i8 global pointer (field 3) and the
; captured i64 (field 5) through the i64 global pointer (field 4).
define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*,
i64 addrspace(1)*, i64, i8 }> %arg) #0 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15
!kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
entry:
%.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 3
%.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 4
%.fca.5.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 5
%.fca.6.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 6
store i8 %.fca.6.extract, i8 addrspace(1)* %.fca.3.extract, align 1
store i64 %.fca.5.extract, i64 addrspace(1)* %.fca.4.extract, align 8
ret void
}
; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle"
; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle"
attributes #0 = { "enqueued-block" }
!3 = !{i32 1, i32 0, i32 1, i32 0}
!4 = !{!"none", !"none", !"none", !"none"}
!5 = !{!"char*", !"char", !"long*", !"long"}
!6 = !{!"", !"", !"", !""}
!14 = !{i32 0}
!15 = !{!"none"}
!16 = !{!"__block_literal"}
!17 = !{!""}