[OpenMP] Changes in the plugin interface

This patch changes the plugin interface so that:
1) future plugins can take advantage of systems with shared CPU/device storage
2) instead of using base addresses, target regions are launched by providing target addresses and base offsets explicitly.

Differential revision: https://reviews.llvm.org/D33028
 

llvm-svn: 302663
This commit is contained in:
George Rokos 2017-05-10 14:12:36 +00:00
parent dc1ed12015
commit 1546d31924
5 changed files with 158 additions and 47 deletions

View File

@ -19,7 +19,7 @@
#include <string>
#include <vector>
#include "omptarget.h"
#include "omptargetplugin.h"
#ifndef TARGET_NAME
#define TARGET_NAME CUDA
@ -473,7 +473,7 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
return DeviceInfo.getOffloadEntriesTable(device_id);
}
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
if (size == 0) {
return NULL;
}
@ -559,8 +559,8 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
}
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit,
uint64_t loop_tripcount) {
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
int32_t thread_limit, uint64_t loop_tripcount) {
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
@ -571,9 +571,12 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
// All args are references.
std::vector<void *> args(arg_num);
std::vector<void *> ptrs(arg_num);
for (int32_t i = 0; i < arg_num; ++i)
args[i] = &tgt_args[i];
for (int32_t i = 0; i < arg_num; ++i) {
ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
args[i] = &ptrs[i];
}
KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
@ -678,12 +681,12 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args, int32_t arg_num) {
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
// use one team and the default number of threads.
const int32_t team_num = 1;
const int32_t thread_limit = 0;
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
arg_num, team_num, thread_limit, 0);
tgt_offsets, arg_num, team_num, thread_limit, 0);
}
#ifdef __cplusplus

View File

@ -22,7 +22,7 @@
#include <list>
#include <vector>
#include "omptarget.h"
#include "omptargetplugin.h"
#ifndef TARGET_NAME
#define TARGET_NAME Generic ELF - 64bit
@ -251,7 +251,7 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
return DeviceInfo.getOffloadEntriesTable(device_id);
}
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
void *ptr = malloc(size);
return ptr;
}
@ -274,8 +274,8 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
}
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit,
uint64_t loop_tripcount /*not used*/) {
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
// ignore team num and thread limit.
// Use libffi to launch execution.
@ -284,9 +284,12 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
// All args are references.
std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
std::vector<void *> args(arg_num);
std::vector<void *> ptrs(arg_num);
for (int32_t i = 0; i < arg_num; ++i)
args[i] = &tgt_args[i];
for (int32_t i = 0; i < arg_num; ++i) {
ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
args[i] = &ptrs[i];
}
ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
&ffi_type_void, &args_types[0]);
@ -303,10 +306,10 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args, int32_t arg_num) {
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
// use one team and one thread.
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
arg_num, 1, 1, 0);
tgt_offsets, arg_num, 1, 1, 0);
}
#ifdef __cplusplus

View File

@ -162,10 +162,11 @@ struct DeviceTy {
int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, int32_t TgtVarsSize);
int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize);
int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount);
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
int32_t ThreadLimit, uint64_t LoopTripCount);
private:
// Call to RTL
@ -181,13 +182,14 @@ struct RTLInfoTy {
typedef int32_t(number_of_devices_ty)();
typedef int32_t(init_device_ty)(int32_t);
typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
typedef void *(data_alloc_ty)(int32_t, int64_t);
typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
typedef int32_t(data_delete_ty)(int32_t, void *);
typedef int32_t(run_region_ty)(int32_t, void *, void **, int32_t);
typedef int32_t(run_team_region_ty)(int32_t, void *, void **, int32_t,
int32_t, int32_t, uint64_t);
typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
int32_t);
typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
int32_t, int32_t, int32_t, uint64_t);
int32_t Idx; // RTL index, index is the number of devices
// of other RTLs that were registered before,
@ -471,7 +473,7 @@ EXTERN void *omp_target_alloc(size_t size, int device_num) {
}
DeviceTy &Device = Devices[device_num];
rc = Device.RTL->data_alloc(Device.RTLDeviceID, size);
rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL);
DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
return rc;
}
@ -861,7 +863,7 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
} else if (Size) {
// If it is not contained and Size > 0 we should create a new entry for it.
IsNew = true;
uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size);
uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
"HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
@ -995,16 +997,17 @@ int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
// Run region on device
int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
int32_t TgtVarsSize) {
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize);
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) {
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
TgtVarsSize);
}
// Run team region on device.
int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount) {
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize,
NumTeams, ThreadLimit, LoopTripCount);
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
int32_t ThreadLimit, uint64_t LoopTripCount) {
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount);
}
////////////////////////////////////////////////////////////////////////////////
@ -2108,6 +2111,7 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
}
std::vector<void *> tgt_args;
std::vector<ptrdiff_t> tgt_offsets;
// List of (first-)private arrays allocated for this target region
std::vector<void *> fpArrays;
@ -2119,16 +2123,18 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
}
void *HstPtrBegin = args[i];
void *HstPtrBase = args_base[i];
void *TgtPtrBase;
void *TgtPtrBegin;
ptrdiff_t TgtBaseOffset;
bool IsLast; // unused.
if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
DP("Forwarding first-private value " DPxMOD " to the target construct\n",
DPxPTR(HstPtrBase));
TgtPtrBase = HstPtrBase;
TgtPtrBegin = HstPtrBase;
TgtBaseOffset = 0;
} else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
// Allocate memory for (first-)private array
void *TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
arg_sizes[i]);
TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
arg_sizes[i], HstPtrBegin);
if (!TgtPtrBegin) {
DP ("Data allocation for %sprivate array " DPxMOD " failed\n",
(arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
@ -2137,8 +2143,8 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
break;
} else {
fpArrays.push_back(TgtPtrBegin);
uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
"%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
arg_sizes[i], DPxPTR(TgtPtrBegin),
@ -2155,24 +2161,29 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
}
}
} else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *),
IsLast, false);
TgtPtrBase = TgtPtrBegin; // no offset for ptrs.
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
false);
TgtBaseOffset = 0; // no offset for ptrs.
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
"object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
DPxPTR(HstPtrBase));
} else {
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i],
IsLast, false);
uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
false);
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
}
tgt_args.push_back(TgtPtrBase);
tgt_args.push_back(TgtPtrBegin);
tgt_offsets.push_back(TgtBaseOffset);
}
// Push omp handle.
tgt_args.push_back((void *)0);
tgt_offsets.push_back(0);
assert(tgt_args.size() == tgt_offsets.size() &&
"Size mismatch in arguments and offsets");
// Pop loop trip count
uint64_t ltc = Device.loopTripCnt;
@ -2185,10 +2196,11 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
if (IsTeamConstruct) {
rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
&tgt_args[0], tgt_args.size(), team_num, thread_limit, ltc);
&tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
thread_limit, ltc);
} else {
rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
&tgt_args[0], tgt_args.size());
&tgt_args[0], &tgt_offsets[0], tgt_args.size());
}
} else {
DP("Errors occurred while obtaining target arguments, skipping kernel "

View File

@ -16,6 +16,7 @@
#define _OMPTARGET_H_
#include <stdint.h>
#include <stddef.h>
#define OFFLOAD_SUCCESS (0)
#define OFFLOAD_FAIL (~0)

View File

@ -0,0 +1,92 @@
//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines an interface between target independent OpenMP offload
// runtime library libomptarget and target dependent plugin.
//
//===----------------------------------------------------------------------===//
#ifndef _OMPTARGETPLUGIN_H_
#define _OMPTARGETPLUGIN_H_
#include <omptarget.h>
#ifdef __cplusplus
extern "C" {
#endif
// Return the number of available devices of the type supported by the
// target RTL.
int32_t __tgt_rtl_number_of_devices(void);
// Return an integer different from zero if the provided device image can be
// supported by the runtime. The functionality is similar to comparing the
// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
// lightweight query to determine if the RTL is suitable for an image without
// having to load the library, which can be expensive.
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);
// Initialize the specified device. In case of success return 0; otherwise
// return an error code.
int32_t __tgt_rtl_init_device(int32_t ID);
// Pass an executable image section described by image to the specified
// device and prepare an address table of target entities. In case of error,
// return NULL. Otherwise, return a pointer to the built address table.
// Individual entries in the table may also be NULL, when the corresponding
// offload region is not supported on the target device.
__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
__tgt_device_image *Image);
// Allocate data on the particular target device, of the specified size.
// HostPtr is the address of the host data the allocated target data
// will be associated with (HostPtr may be NULL if it is not known at
// allocation time, like for example it would be for target data that
// is allocated by omp_target_alloc() API). Return address of the
// allocated data on the target that will be used by libomptarget.so to
// initialize the target data mapping structures. These addresses are
// used to generate a table of target variables to pass to
// __tgt_rtl_run_target_region(). __tgt_rtl_data_alloc() returns NULL in
// case an error occurred on the target device.
void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr);
// Pass the data content to the target device using the target address.
// In case of success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
int64_t Size);
// Retrieve the data content from the target device using its address.
// In case of success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
int64_t Size);
// De-allocate the data referenced by target ptr on the device. In case of
// success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);
// Transfer control to the offloaded entry Entry on the target device.
// Args and Offsets are arrays of NumArgs size of target addresses and
// offsets. An offset should be added to the target address before passing it
// to the outlined function on device side. In case of success, return zero.
// Otherwise, return an error code.
int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
ptrdiff_t *Offsets, int32_t NumArgs);
// Similar to __tgt_rtl_run_target_region, but additionally specify the
// number of teams to be created and a number of threads in each team.
int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
ptrdiff_t *Offsets, int32_t NumArgs,
int32_t NumTeams, int32_t ThreadLimit,
uint64_t loop_tripcount);
#ifdef __cplusplus
}
#endif
#endif // _OMPTARGETPLUGIN_H_