[CUDA] Postprocess bitcode linked in during device-side CUDA compilation.

Link in and internalize the symbols we need from the supplied bitcode library.

Differential Revision: http://reviews.llvm.org/D11664

llvm-svn: 247317
This commit is contained in:
Artem Belevich 2015-09-10 18:24:23 +00:00
parent 7ad7ae1fac
commit 7cb25c9b69
6 changed files with 106 additions and 1 deletion

View File

@ -166,6 +166,7 @@ LANGOPT(OpenMPUseTLS , 1, 0, "Use TLS for threadprivates or runtime calls")
LANGOPT(CUDAIsDevice , 1, 0, "Compiling for CUDA device")
LANGOPT(CUDAAllowHostCallsFromHostDevice, 1, 0, "Allow host device functions to call host functions")
LANGOPT(CUDADisableTargetCallChecks, 1, 0, "Disable checks for call targets (host, device, etc.)")
LANGOPT(CUDAUsesLibDevice , 1, 0, "Selectively link and internalize bitcode.")
LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators")
LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")

View File

@ -659,6 +659,8 @@ def fcuda_disable_target_call_checks : Flag<["-"],
HelpText<"Disable all cross-target (host, device, etc.) call checks in CUDA">;
def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
HelpText<"Incorporate CUDA device-side binary into host object file.">;
def fcuda_uses_libdevice : Flag<["-"], "fcuda-uses-libdevice">,
HelpText<"Selectively link and internalize bitcode.">;
} // let Flags = [CC1Option]

View File

@ -159,7 +159,12 @@ namespace clang {
if (LinkModule) {
if (Linker::LinkModules(
M, LinkModule.get(),
[=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); }))
[=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); },
(LangOpts.CUDA && LangOpts.CUDAIsDevice &&
LangOpts.CUDAUsesLibDevice)
? (Linker::Flags::LinkOnlyNeeded |
Linker::Flags::InternalizeLinkedSymbols)
: Linker::Flags::None))
return;
}

View File

@ -1406,6 +1406,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
if (Args.hasArg(OPT_fcuda_is_device))
Opts.CUDAIsDevice = 1;
if (Args.hasArg(OPT_fcuda_uses_libdevice))
Opts.CUDAUsesLibDevice = 1;
if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device))
Opts.CUDAAllowHostCallsFromHostDevice = 1;

View File

@ -0,0 +1,38 @@
; Simple bit of IR to mimic CUDA's libdevice. We want to be
; able to link with it and we need to make sure all __nvvm_reflect
; calls are eliminated by the time PTX has been produced.
target triple = "nvptx-unknown-cuda"
; Declared but deliberately not defined: the NVPTX back-end's NVVMReflect
; pass replaces calls to it with a constant (0/1), so no definition is
; ever needed.
declare i32 @__nvvm_reflect(i8*)
; NUL-terminated reflection key ("USE_MUL") queried via __nvvm_reflect
; below; stored in addrspace(1) and addrspacecast to generic at the
; call site.
@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
; Never referenced from this file; when linked with
; Linker::Flags::LinkOnlyNeeded this symbol should not be pulled into
; the destination module.
define void @unused_subfunc(float %a) {
ret void
}
; Called from @_Z17device_mul_or_addff, so it must survive the
; needed-only link (and be internalized along with its caller).
define void @used_subfunc(float %a) {
ret void
}
; Mangled name of device_mul_or_add(float, float). Picks fmul vs. fadd
; depending on the value __nvvm_reflect returns for the "USE_MUL" key.
define float @_Z17device_mul_or_addff(float %a, float %b) {
%reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%cmp = icmp ne i32 %reflect, 0
br i1 %cmp, label %use_mul, label %use_add
use_mul:
%ret1 = fmul float %a, %b
br label %exit
use_add:
%ret2 = fadd float %a, %b
br label %exit
exit:
; Merge the result of whichever arm ran, then exercise a call edge so
; used_subfunc is transitively "needed" by this function.
%ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
call void @used_subfunc(float %ret)
ret float %ret
}

View File

@ -0,0 +1,56 @@
// Test for linking with CUDA's libdevice as outlined in
// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice
//
// REQUIRES: nvptx-registered-target
//
// Prepare bitcode file to link with
// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
// RUN: %S/Inputs/device-code.ll
//
// Make sure function in device-code gets linked in and internalized.
// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
// RUN: -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \
// RUN: -disable-llvm-passes -o - %s \
// RUN: | FileCheck %s -check-prefix CHECK-IR
//
// Make sure function in device-code gets linked but is not internalized
// without -fcuda-uses-libdevice
// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
// RUN: -mlink-bitcode-file %t.bc -emit-llvm \
// RUN: -disable-llvm-passes -o - %s \
// RUN: | FileCheck %s -check-prefix CHECK-IR-NLD
//
// Make sure NVVMReflect pass is enabled in NVPTX back-end.
// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
// RUN: -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o /dev/null %s \
// RUN: -backend-option -debug-pass=Structure 2>&1 \
// RUN: | FileCheck %s -check-prefix CHECK-REFLECT
#include "Inputs/cuda.h"
__device__ float device_mul_or_add(float a, float b);
extern "C" __device__ double __nv_sin(double x);
extern "C" __device__ double __nv_exp(double x);
// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf(
// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf(
// Defined in this TU rather than pulled in from the bitcode library, so
// the selective internalize-on-link must leave its linkage untouched.
__device__ void should_not_be_internalized(float *data) {}
// Make sure kernel call has not been internalized.
// CHECK-IR-LABEL: define void @_Z6kernelPfS_
// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_(
// __attribute__((used)) keeps the kernel from being discarded, so every
// RUN line has a call graph reaching into the linked bitcode.
__global__ __attribute__((used)) void kernel(float *out, float *in) {
// device_mul_or_add is supplied by %t.bc; with -fcuda-uses-libdevice it
// must be linked in and internalized (verified further down).
*out = device_mul_or_add(in[0], in[1]);
// NOTE(review): __nv_sin/__nv_exp are only declared above; presumably
// they are defined in the portion of Inputs/device-code.ll not shown
// here -- confirm against the full file.
*out += __nv_exp(__nv_sin(*out));
should_not_be_internalized(out);
}
// Make sure device_mul_or_add() is present in IR, is internal and
// calls __nvvm_reflect().
// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff(
// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff(
// CHECK-IR: call i32 @__nvvm_reflect
// CHECK-IR: ret float
// Verify that NVVMReflect pass is among the passes run by NVPTX back-end.
// CHECK-REFLECT: Replace occurrences of __nvvm_reflect() calls with 0/1