[OpenMP][libomptarget] Enable the compilation of multiple bc libraries for runtime inlining

Summary:
Different NVIDIA GPUs support different compute capabilities. To enable the inlining of runtime functions and the best performance on different generations of NVIDIA GPUs, a bc library for each compute capability needs to be compiled. The same compiler build will then be usable in conjunction with multiple generations of NVIDIA GPUs.
To differentiate between versions of the same bc lib, the output file name will contain the compute capability ID.
Depends on D14254

Reviewers: Hahnfeld, hfinkel, carlo.bertolli, caomhin, ABataev, grokos

Reviewed By: Hahnfeld, grokos

Subscribers: guansong, mgorny, openmp-commits

Differential Revision: https://reviews.llvm.org/D41724

llvm-svn: 324904
This commit is contained in:
Gheorghe-Teodor Bercea 2018-02-12 16:45:20 +00:00
parent 7dc0f1ec45
commit d5ae4e6501
2 changed files with 52 additions and 42 deletions

View File

@ -280,10 +280,10 @@ Options for ``NVPTX device RTL``
compatible with NVCC, this option can be used to pass to NVCC a valid compiler
to avoid the error.
**LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY** = ``35``
CUDA compute capability that should be supported by the NVPTX device RTL. E.g.
for compute capability 6.0, the option "60" should be used. Compute capability
3.5 is the minimum required.
**LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES** = ``35``
List of CUDA compute capabilities that should be supported by the NVPTX
device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60,70"
should be used. Compute capability 3.5 is the minimum required.
**LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON``
Enable printing of debug messages from the NVPTX device RTL.

View File

@ -60,9 +60,18 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
# Get the compute capability the user requested or use SM_35 by default.
# SM_35 is what clang uses by default.
set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY 35 CACHE STRING
"CUDA Compute Capability to be used to compile the NVPTX device RTL.")
set(CUDA_ARCH -arch sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
set(default_capabilities 35)
if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY)
set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES")
endif()
set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
"List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})
foreach(sm ${nvptx_sm_list})
set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
endforeach()
# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
@ -152,46 +161,47 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
# Get the compute capability the user requested or use SM_35 by default.
set(CUDA_ARCH "")
set(CUDA_ARCH --cuda-gpu-arch=sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
foreach(sm ${nvptx_sm_list})
set(CUDA_ARCH --cuda-gpu-arch=sm_${sm})
# Compile cuda files to bitcode.
set(bc_files "")
foreach(src ${cuda_src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
# Compile cuda files to bitcode.
set(bc_files "")
foreach(src ${cuda_src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
add_custom_command(OUTPUT ${outfile}.bc
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES}
-c ${infile} -o ${outfile}.bc
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
COMMENT "Building LLVM bitcode ${outfile}.bc"
VERBATIM
add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES}
-c ${infile} -o ${outfile}-sm_${sm}.bc
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
VERBATIM
)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
list(APPEND bc_files ${outfile}-sm_${sm}.bc)
endforeach()
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
DEPENDS ${bc_files}
COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}.bc)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
list(APPEND bc_files ${outfile}.bc)
add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
# Copy library to destination.
add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
$<TARGET_FILE_DIR:omptarget-nvptx>)
# Install device RTL under the lib destination folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "lib")
endforeach()
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc ${bc_files}
DEPENDS ${bc_files}
COMMENT "Linking LLVM bitcode libomptarget-nvptx.bc"
)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx.bc)
add_custom_target(omptarget-nvptx-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc)
# Copy library to destination.
add_custom_command(TARGET omptarget-nvptx-bc POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc
$<TARGET_FILE_DIR:omptarget-nvptx>)
# Install device RTL under the lib destination folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc DESTINATION "lib")
endif()
endif()