[Openmp][VE] Libomptarget plugin for NEC SX-Aurora
This patch adds a libomptarget plugin for the NEC SX-Aurora TSUBASA Vector Engine (VE target). The code is largely based on the existing generic-elf plugin and uses the NEC VEO and VEOSINFO libraries for offloading. Differential Revision: https://reviews.llvm.org/D76843
This commit is contained in:
parent
40ef427460
commit
6b9e43c67e
|
@ -15,6 +15,7 @@
|
|||
# libffi : required to launch target kernels given function and argument
|
||||
# pointers.
|
||||
# CUDA : required to control offloading to NVIDIA GPUs.
|
||||
# VEOS : required to control offloading to NEC Aurora.
|
||||
|
||||
include (FindPackageHandleStandardArgs)
|
||||
|
||||
|
@ -162,6 +163,61 @@ find_package_handle_standard_args(
|
|||
mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES)
|
||||
|
||||
################################################################################
|
||||
# Looking for VEO...
|
||||
################################################################################
|
||||
|
||||
# Locate the VEO header (ve_offload.h). Besides the usual system prefixes the
# /opt/nec/ve/veos prefix is searched, as well as the directories in CPATH.
find_path (
  LIBOMPTARGET_DEP_VEO_INCLUDE_DIR
  NAMES
    ve_offload.h
  PATHS
    /usr/include
    /usr/local/include
    /opt/local/include
    /sw/include
    /opt/nec/ve/veos/include
    ENV CPATH
  PATH_SUFFIXES
    libveo)

# Locate the VEO runtime library (libveo).
find_library (
  LIBOMPTARGET_DEP_VEO_LIBRARIES
  NAMES
    veo
  PATHS
    /usr/lib
    /usr/local/lib
    /opt/local/lib
    /sw/lib
    /opt/nec/ve/veos/lib64
    ENV LIBRARY_PATH
    ENV LD_LIBRARY_PATH)

# Locate libveosinfo, which the plugin uses to enumerate VE nodes.
find_library(
  LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES
  NAMES
    veosinfo
  PATHS
    /usr/lib
    /usr/local/lib
    /opt/local/lib
    /sw/lib
    /opt/nec/ve/veos/lib64
    ENV LIBRARY_PATH
    ENV LD_LIBRARY_PATH)

set(LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR})

# LIBOMPTARGET_DEP_VEO_FOUND is only set when both libraries and the header
# have been located.
find_package_handle_standard_args(
  LIBOMPTARGET_DEP_VEO
  DEFAULT_MSG
  LIBOMPTARGET_DEP_VEO_LIBRARIES
  LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES
  LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS)

# Also hide the cache entries created by find_path()/find_library() above, in
# line with how the CUDA driver library is handled.
mark_as_advanced(
  LIBOMPTARGET_DEP_VEO_FOUND
  LIBOMPTARGET_DEP_VEO_INCLUDE_DIRS
  LIBOMPTARGET_DEP_VEO_INCLUDE_DIR
  LIBOMPTARGET_DEP_VEO_LIBRARIES
  LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES)
|
||||
|
||||
# Looking for CUDA libdevice subdirectory
|
||||
#
|
||||
# Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work
|
||||
|
|
|
@ -69,6 +69,7 @@ add_subdirectory(aarch64)
|
|||
add_subdirectory(cuda)
|
||||
add_subdirectory(ppc64)
|
||||
add_subdirectory(ppc64le)
|
||||
add_subdirectory(ve)
|
||||
add_subdirectory(x86_64)
|
||||
|
||||
# Make sure the parent scope can see the plugins that will be created.
|
||||
|
|
|
@ -71,3 +71,41 @@ static inline int32_t elf_check_machine(__tgt_device_image *image,
|
|||
elf_end(e);
|
||||
return MachineID == target_id;
|
||||
}
|
||||
|
||||
// Check whether the device image is a shared object, i.e. whether the e_type
// field of its ELF header equals ET_DYN.
//
// Returns non-zero if the image is of type ET_DYN. Returns 0 if it is of any
// other type, or if the image cannot be parsed as a (32 or 64 bit) ELF file.
static inline int32_t elf_is_dynamic(__tgt_device_image *image) {
  char *img_begin = (char *)image->ImageStart;
  char *img_end = (char *)image->ImageEnd;
  size_t img_size = img_end - img_begin;

  // Obtain elf handler
  Elf *e = elf_memory(img_begin, img_size);
  if (!e) {
    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
    return 0;
  }

  // Only the getter matching the ELF class of the image is expected to
  // succeed.
  Elf64_Ehdr *eh64 = elf64_getehdr(e);
  Elf32_Ehdr *eh32 = elf32_getehdr(e);

  if (!eh64 && !eh32) {
    // This function reads the ELF type, not the machine id, so report the
    // actual problem: no ELF header could be obtained at all.
    DP("Unable to get ELF header from image!\n");
    elf_end(e);
    return 0;
  }

  if (eh64 && eh32) {
    DP("Ambiguous ELF header!\n");
    elf_end(e);
    return 0;
  }

  // Copy the type out before releasing the handle that owns the header.
  uint16_t Type = eh64 ? eh64->e_type : eh32->e_type;

  elf_end(e);
  DP("ELF Type: %d\n", Type);
  return Type == ET_DYN;
}
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
##===----------------------------------------------------------------------===##
|
||||
#
|
||||
# Build a plugin for a NEC Aurora machine if available. (Can also run on host)
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
|
||||
|
||||
if(${LIBOMPTARGET_DEP_VEO_FOUND})
  libomptarget_say("Building SX-Aurora VE offloading plugin.")

  # Description of the target this plugin serves.
  set(tmachine_name "ve")
  set(tmachine_libname "ve")
  set(tmachine_triple "ve-unknown-linux-unknown")
  # ELF e_machine value of VE images (presumably EM_VE - confirm against elf.h).
  set(elf_machine_id 251)

  # Libraries needed on top of libffi/libelf: VEO itself and veosinfo.
  set(additional_libs
    ${LIBOMPTARGET_DEP_VEO_LIBRARIES}
    ${LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES})

  include_directories(
    ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}
    ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR})

  # TARGET_NAME is used as prefix of the runtime messages for this target,
  # TARGET_ELF_ID carries the ELF machine id checked against device images.
  add_definitions(
    "-DTARGET_NAME=${tmachine_name}"
    "-DTARGET_ELF_ID=${elf_machine_id}")

  add_library("omptarget.rtl.${tmachine_libname}" SHARED
    ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp)

  target_link_libraries(
    "omptarget.rtl.${tmachine_libname}"
    ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
    ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
    ${additional_libs}
    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports -Wl,-z,defs")

  # The plugin is installed into the lib destination folder.
  install(TARGETS "omptarget.rtl.${tmachine_libname}"
    LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX})

  # Let the parent scope know that a plugin for this triple is being built.
  set(LIBOMPTARGET_SYSTEM_TARGETS
    "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
else()
  libomptarget_say("Not building nec-aurora plugin: libveo or libveosinfo not found.")
endif()
|
|
@ -0,0 +1,464 @@
|
|||
//===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// RTL for NEC Aurora TSUBASA machines
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "omptargetplugin.h"
|
||||
|
||||
#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <list>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <unistd.h>
#include <ve_offload.h>
#include <vector>
#include <veosinfo/veosinfo.h>
|
||||
|
||||
#ifndef TARGET_ELF_ID
|
||||
#define TARGET_ELF_ID 0
|
||||
#endif
|
||||
|
||||
// DP(...) emits a printf-style debug message prefixed with the target name.
// Messages are only printed in OMPTARGET_DEBUG builds and only while
// DebugLevel is positive; in all other builds DP() expands to an empty
// statement.
#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

// Two-step stringification so that TARGET_NAME is expanded before '#'.
#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...)                                                                \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
// Same do/while shape as the debug variant so that "DP(...);" is a single
// statement and stays valid in an unbraced if/else.
#define DP(...)                                                                \
  do {                                                                         \
  } while (false)
#endif // OMPTARGET_DEBUG
|
||||
|
||||
#include "../../common/elf_common.c"
|
||||
|
||||
/// Record of one target image that has been written to a temporary file.
struct DynLibTy {
  /// Path of the temporary file. The struct stores just the pointer (no copy
  /// of the string); ~RTLDeviceInfoTy() passes it to remove().
  char *FileName;
  /// Handle returned by veo_load_library() for this image; 0 when the image
  /// was not loaded as a library (statically linked image).
  uint64_t VeoLibHandle;
};
|
||||
|
||||
/// Keep entries table per device.
|
||||
struct FuncOrGblEntryTy {
|
||||
__tgt_target_table Table;
|
||||
std::vector<__tgt_offload_entry> Entries;
|
||||
};
|
||||
|
||||
class RTLDeviceInfoTy {
|
||||
std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;
|
||||
|
||||
public:
|
||||
std::vector<struct veo_proc_handle *> ProcHandles;
|
||||
std::vector<struct veo_thr_ctxt *> Contexts;
|
||||
std::vector<uint64_t> LibraryHandles;
|
||||
std::list<DynLibTy> DynLibs;
|
||||
// Maps OpenMP device Ids to Ve nodeids
|
||||
std::vector<int> NodeIds;
|
||||
|
||||
void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
|
||||
__tgt_offload_entry *HostBegin,
|
||||
__tgt_offload_entry *HostEnd) {
|
||||
FuncOrGblEntry[device_id].emplace_back();
|
||||
std::vector<__tgt_offload_entry> &T =
|
||||
FuncOrGblEntry[device_id].back().Entries;
|
||||
T.clear();
|
||||
for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
|
||||
char *SymbolName = i->name;
|
||||
// we have not enough access to the target memory to conveniently parse
|
||||
// the offload table there so we need to lookup every symbol with the host
|
||||
// table
|
||||
DP("Looking up symbol: %s\n", SymbolName);
|
||||
uint64_t SymbolTargetAddr =
|
||||
veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
|
||||
__tgt_offload_entry Entry;
|
||||
|
||||
if (!SymbolTargetAddr) {
|
||||
DP("Symbol %s not found in target image\n", SymbolName);
|
||||
Entry = {NULL, NULL, 0, 0, 0};
|
||||
} else {
|
||||
DP("Found symbol %s successfully in target image (addr: %p)\n",
|
||||
SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
|
||||
Entry = { reinterpret_cast<void *>(SymbolTargetAddr),
|
||||
i->name,
|
||||
i->size,
|
||||
i->flags,
|
||||
0 };
|
||||
}
|
||||
|
||||
T.push_back(Entry);
|
||||
}
|
||||
|
||||
FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front();
|
||||
FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1;
|
||||
}
|
||||
|
||||
__tgt_target_table *getOffloadTable(int32_t device_id) {
|
||||
return &FuncOrGblEntry[device_id].back().Table;
|
||||
}
|
||||
|
||||
RTLDeviceInfoTy() {
|
||||
#ifdef OMPTARGET_DEBUG
|
||||
if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
|
||||
DebugLevel = std::stoi(envStr);
|
||||
}
|
||||
#endif // OMPTARGET_DEBUG
|
||||
|
||||
struct ve_nodeinfo node_info;
|
||||
ve_node_info(&node_info);
|
||||
|
||||
// Build a predictable mapping between VE node ids and OpenMP device ids.
|
||||
// This is necessary, because nodes can be missing or offline and (active)
|
||||
// node ids are thus not consecutive. The entries in ve_nodeinfo may also
|
||||
// not be in the order of their node ids.
|
||||
for (int i = 0; i < node_info.total_node_count; ++i) {
|
||||
if (node_info.status[i] == 0) {
|
||||
NodeIds.push_back(node_info.nodeid[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Because the entries in ve_nodeinfo may not be in the order of their node
|
||||
// ids, we sort NodeIds to get a predictable mapping.
|
||||
std::sort(NodeIds.begin(), NodeIds.end());
|
||||
|
||||
int NumDevices = NodeIds.size();
|
||||
DP("Found %i VE devices\n", NumDevices);
|
||||
ProcHandles.resize(NumDevices, NULL);
|
||||
Contexts.resize(NumDevices, NULL);
|
||||
FuncOrGblEntry.resize(NumDevices);
|
||||
LibraryHandles.resize(NumDevices);
|
||||
}
|
||||
|
||||
~RTLDeviceInfoTy() {
|
||||
for (auto &ctx : Contexts) {
|
||||
if (ctx != NULL) {
|
||||
if (veo_context_close(ctx) != 0) {
|
||||
DP("Failed to close VEO context.\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &hdl : ProcHandles) {
|
||||
if (hdl != NULL) {
|
||||
veo_proc_destroy(hdl);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &lib : DynLibs) {
|
||||
if (lib.FileName) {
|
||||
remove(lib.FileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static RTLDeviceInfoTy DeviceInfo;
|
||||
|
||||
static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
|
||||
struct veo_args *args, uint64_t *RetVal) {
|
||||
DP("Running function with entry point %p\n",
|
||||
reinterpret_cast<void *>(FuncAddr));
|
||||
uint64_t RequestHandle =
|
||||
veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
|
||||
if (RequestHandle == VEO_REQUEST_ID_INVALID) {
|
||||
DP("Execution of entry point %p failed\n",
|
||||
reinterpret_cast<void *>(FuncAddr));
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
|
||||
DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
|
||||
reinterpret_cast<void *>(FuncAddr), RequestHandle);
|
||||
|
||||
int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
|
||||
RetVal);
|
||||
if (ret != 0) {
|
||||
DP("Waiting for entry point %p failed (Error code %d)\n",
|
||||
reinterpret_cast<void *>(FuncAddr), ret);
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
// Return the number of available devices of the type supported by the
|
||||
// target RTL.
|
||||
int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }
|
||||
|
||||
// Return an integer different from zero if the provided device image can be
|
||||
// supported by the runtime. The functionality is similar to comparing the
|
||||
// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a
|
||||
// lightweight query to determine if the RTL is suitable for an image without
|
||||
// having to load the library, which can be expensive.
|
||||
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
|
||||
#if TARGET_ELF_ID < 1
|
||||
return 0;
|
||||
#else
|
||||
return elf_check_machine(Image, TARGET_ELF_ID);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Initialize the specified device. In case of success return 0; otherwise
|
||||
// return an error code.
|
||||
int32_t __tgt_rtl_init_device(int32_t ID) {
|
||||
DP("Available VEO version: %i\n", veo_api_version());
|
||||
|
||||
// At the moment we do not really initialize (i.e. create a process or
|
||||
// context on) the device here, but in "__tgt_rtl_load_binary".
|
||||
// The reason for this is, that, when we create a process for a statically
|
||||
// linked binary, the VEO api needs us to already supply the binary (but we
|
||||
// can load a dynamically linked binary later, after we create the process).
|
||||
// At this stage, we cannot check if we have a dynamically or statically
|
||||
// linked binary so we defer process creation until we know.
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
// Pass an executable image section described by image to the specified
|
||||
// device and prepare an address table of target entities. In case of error,
|
||||
// return NULL. Otherwise, return a pointer to the built address table.
|
||||
// Individual entries in the table may also be NULL, when the corresponding
|
||||
// offload region is not supported on the target device.
|
||||
__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
|
||||
__tgt_device_image *Image) {
|
||||
DP("Dev %d: load binary from " DPxMOD " image\n", ID,
|
||||
DPxPTR(Image->ImageStart));
|
||||
|
||||
assert(ID >= 0 && "bad dev id");
|
||||
|
||||
size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
|
||||
size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
|
||||
DP("Expecting to have %zd entries defined.\n", NumEntries);
|
||||
|
||||
// load dynamic library and get the entry points. We use the dl library
|
||||
// to do the loading of the library, but we could do it directly to avoid the
|
||||
// dump to the temporary file.
|
||||
//
|
||||
// 1) Create tmp file with the library contents.
|
||||
// 2) Use dlopen to load the file and dlsym to retrieve the symbols.
|
||||
char tmp_name[] = "/tmp/tmpfile_XXXXXX";
|
||||
int tmp_fd = mkstemp(tmp_name);
|
||||
|
||||
if (tmp_fd == -1) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
FILE *ftmp = fdopen(tmp_fd, "wb");
|
||||
|
||||
if (!ftmp) {
|
||||
DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
fwrite(Image->ImageStart, ImageSize, 1, ftmp);
|
||||
|
||||
// at least for the static case we need to change the permissions
|
||||
chmod(tmp_name, 0700);
|
||||
|
||||
DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);
|
||||
|
||||
fclose(ftmp);
|
||||
|
||||
// See comment in "__tgt_rtl_init_device"
|
||||
bool is_dyn = true;
|
||||
if (DeviceInfo.ProcHandles[ID] == NULL) {
|
||||
struct veo_proc_handle *proc_handle;
|
||||
is_dyn = elf_is_dynamic(Image);
|
||||
// If we have a dynamically linked image, we create the process handle, then
|
||||
// the thread, and then load the image.
|
||||
// If we have a statically linked image, we need to create the process
|
||||
// handle and load the image at the same time with veo_proc_create_static().
|
||||
if (is_dyn) {
|
||||
proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
|
||||
if (!proc_handle) {
|
||||
DP("veo_proc_create() failed for device %d\n", ID);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
|
||||
if (!proc_handle) {
|
||||
DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
|
||||
tmp_name);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
DeviceInfo.ProcHandles[ID] = proc_handle;
|
||||
}
|
||||
|
||||
if (DeviceInfo.Contexts[ID] == NULL) {
|
||||
struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);
|
||||
|
||||
if (!ctx) {
|
||||
DP("veo_context_open() failed: %s\n", std::strerror(errno));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
DeviceInfo.Contexts[ID] = ctx;
|
||||
}
|
||||
|
||||
DP("Aurora device successfully initialized with loaded binary: "
|
||||
"proc_handle=%p, ctx=%p\n",
|
||||
DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);
|
||||
|
||||
uint64_t LibHandle = 0UL;
|
||||
if (is_dyn) {
|
||||
LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);
|
||||
|
||||
if (!LibHandle) {
|
||||
DP("veo_load_library() failed: LibHandle=%" PRIu64
|
||||
" Name=%s. Set env VEORUN_BIN for static linked target code.\n",
|
||||
LibHandle, tmp_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
DP("Successfully loaded library dynamically\n");
|
||||
} else {
|
||||
DP("Symbol table is expected to have been created by "
|
||||
"veo_create_proc_static()\n");
|
||||
}
|
||||
|
||||
DynLibTy Lib = {tmp_name, LibHandle};
|
||||
DeviceInfo.DynLibs.push_back(Lib);
|
||||
DeviceInfo.LibraryHandles[ID] = LibHandle;
|
||||
|
||||
DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
|
||||
Image->EntriesEnd);
|
||||
|
||||
return DeviceInfo.getOffloadTable(ID);
|
||||
}
|
||||
|
||||
// Allocate data on the particular target device, of the specified size.
|
||||
// HostPtr is a address of the host data the allocated target data
|
||||
// will be associated with (HostPtr may be NULL if it is not known at
|
||||
// allocation time, like for example it would be for target data that
|
||||
// is allocated by omp_target_alloc() API). Return address of the
|
||||
// allocated data on the target that will be used by libomptarget.so to
|
||||
// initialize the target data mapping structures. These addresses are
|
||||
// used to generate a table of target variables to pass to
|
||||
// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
|
||||
// case an error occurred on the target device.
|
||||
void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) {
|
||||
int ret;
|
||||
uint64_t addr;
|
||||
|
||||
if (DeviceInfo.ProcHandles[ID] == NULL) {
|
||||
struct veo_proc_handle *proc_handle;
|
||||
proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
|
||||
if (!proc_handle) {
|
||||
DP("veo_proc_create() failed for device %d\n", ID);
|
||||
return NULL;
|
||||
}
|
||||
DeviceInfo.ProcHandles[ID] = proc_handle;
|
||||
DP("Aurora device successfully initialized: proc_handle=%p", proc_handle);
|
||||
}
|
||||
|
||||
ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
|
||||
DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n",
|
||||
ID, reinterpret_cast<void *>(addr), Size);
|
||||
if (ret != 0) {
|
||||
DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n",
|
||||
ID, reinterpret_cast<void *>(addr), Size, ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return reinterpret_cast<void *>(addr);
|
||||
}
|
||||
|
||||
// Pass the data content to the target device using the target address.
|
||||
// In case of success, return zero. Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
|
||||
int64_t Size) {
|
||||
int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
|
||||
HostPtr, (size_t)Size);
|
||||
if (ret != 0) {
|
||||
DP("veo_write_mem() failed with error code %d\n", ret);
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
// Retrieve the data content from the target device using its address.
|
||||
// In case of success, return zero. Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
|
||||
int64_t Size) {
|
||||
int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
|
||||
(uint64_t)TargetPtr, Size);
|
||||
if (ret != 0) {
|
||||
DP("veo_read_mem() failed with error code %d\n", ret);
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
// De-allocate the data referenced by target ptr on the device. In case of
|
||||
// success, return zero. Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
|
||||
int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
|
||||
|
||||
if (ret != 0) {
|
||||
DP("veo_free_mem() failed with error code %d\n", ret);
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
// Similar to __tgt_rtl_run_target_region, but additionally specify the
|
||||
// number of teams to be created and a number of threads in each team.
|
||||
int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
|
||||
ptrdiff_t *Offsets, int32_t NumArgs,
|
||||
int32_t NumTeams, int32_t ThreadLimit,
|
||||
uint64_t loop_tripcount) {
|
||||
int ret;
|
||||
|
||||
// ignore team num and thread limit.
|
||||
std::vector<void *> ptrs(NumArgs);
|
||||
|
||||
struct veo_args *TargetArgs;
|
||||
TargetArgs = veo_args_alloc();
|
||||
|
||||
if (TargetArgs == NULL) {
|
||||
DP("Could not allocate VEO args\n");
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
|
||||
for (int i = 0; i < NumArgs; ++i) {
|
||||
ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]);
|
||||
|
||||
if (ret != 0) {
|
||||
DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n",
|
||||
ret, i, Args[i]);
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t RetVal;
|
||||
if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
|
||||
TargetArgs, &RetVal) != OFFLOAD_SUCCESS) {
|
||||
veo_args_free(TargetArgs);
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
veo_args_free(TargetArgs);
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
// Transfer control to the offloaded entry Entry on the target device.
|
||||
// Args and Offsets are arrays of NumArgs size of target addresses and
|
||||
// offsets. An offset should be added to the target address before passing it
|
||||
// to the outlined function on device side. In case of success, return zero.
|
||||
// Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
|
||||
ptrdiff_t *Offsets, int32_t NumArgs) {
|
||||
return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1,
|
||||
1, 0);
|
||||
}
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
// List of all plugins that can support offloading.
|
||||
static const char *RTLNames[] = {
|
||||
/* SX-Aurora VE target */ "libomptarget.rtl.ve.so",
|
||||
/* PowerPC target */ "libomptarget.rtl.ppc64.so",
|
||||
/* x86_64 target */ "libomptarget.rtl.x86_64.so",
|
||||
/* CUDA target */ "libomptarget.rtl.cuda.so",
|
||||
|
|
Loading…
Reference in New Issue