[OpenMP] Completely remove old device runtime

This patch completely removes the old OpenMP device runtime. Previously,
the new runtime had the prefix `libomptarget-new-` and the old runtime
was simply called `libomptarget-`. This patch makes the formerly new
runtime the only runtime available. The old runtime's project has been
deleted entirely, and all references to the `libomptarget-new-` runtime
have been replaced with `libomptarget-`.
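
Concretely, the user-visible effect is which device bitcode library an
offload compile links. A minimal sketch, assuming an NVPTX offload target
and borrowing the sm_35 architecture and driver flags from the tests
touched below (`foo.c` is a placeholder source file):

  clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
      -Xopenmp-target -march=sm_35 foo.c
  # before this patch (new runtime on by default): links libomptarget-new-nvptx-sm_35.bc
  # after this patch:                              links libomptarget-nvptx-sm_35.bc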

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D118934
Joseph Huber 2022-02-03 14:43:40 -05:00
parent 0cc6165d05
commit 034adaf5be
82 changed files with 38 additions and 8084 deletions

View File

@ -1203,8 +1203,7 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
llvm_unreachable("OpenMP can only handle device code.");
llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
-if (CGM.getLangOpts().OpenMPTargetNewRuntime &&
-    !CGM.getLangOpts().OMPHostIRFile.empty()) {
+if (!CGM.getLangOpts().OMPHostIRFile.empty()) {
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
"__omp_rtl_debug_kind");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,

View File

@ -290,11 +290,7 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
return;
std::string BitcodeSuffix;
-if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                       options::OPT_fno_openmp_target_new_runtime, true))
-  BitcodeSuffix = "new-amdgpu-" + GPUArch;
-else
-  BitcodeSuffix = "amdgcn-" + GPUArch;
+BitcodeSuffix = "amdgcn-" + GPUArch;
addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
getTriple());

View File

@ -5936,13 +5936,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_openmp_cuda_mode, /*Default=*/false))
CmdArgs.push_back("-fopenmp-cuda-mode");
-// When in OpenMP offloading mode, enable or disable the new device
-// runtime.
-if (Args.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                 options::OPT_fno_openmp_target_new_runtime,
-                 /*Default=*/true))
-  CmdArgs.push_back("-fopenmp-target-new-runtime");
// When in OpenMP offloading mode, enable debugging on the device.
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_target_debug_EQ);
if (Args.hasFlag(options::OPT_fopenmp_target_debug,
@ -8187,9 +8180,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ);
std::string BitcodeSuffix;
-if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                   options::OPT_fno_openmp_target_new_runtime, true))
-  BitcodeSuffix += "new-";
if (TC->getTriple().isNVPTX())
BitcodeSuffix += "nvptx-";
else if (TC->getTriple().isAMDGPU())

View File

@ -749,11 +749,7 @@ void CudaToolChain::addClangTargetOptions(
return;
std::string BitcodeSuffix;
-if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                       options::OPT_fno_openmp_target_new_runtime, true))
-  BitcodeSuffix = "new-nvptx-" + GpuArch.str();
-else
-  BitcodeSuffix = "nvptx-" + GpuArch.str();
+BitcodeSuffix = "nvptx-" + GpuArch.str();
addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
getTriple());

View File

@ -3484,9 +3484,6 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
GenerateArg(Args, OPT_fopenmp_version_EQ, Twine(Opts.OpenMP), SA);
}
-if (Opts.OpenMPTargetNewRuntime)
-  GenerateArg(Args, OPT_fopenmp_target_new_runtime, SA);
if (Opts.OpenMPThreadSubscription)
GenerateArg(Args, OPT_fopenmp_assume_threads_oversubscription, SA);
@ -3877,9 +3874,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
Opts.OpenMP && Args.hasArg(options::OPT_fopenmp_enable_irbuilder);
bool IsTargetSpecified =
Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ);
-Opts.OpenMPTargetNewRuntime =
-    Opts.OpenMPIsDevice &&
-    Args.hasArg(options::OPT_fopenmp_target_new_runtime);
Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice;
@ -3927,17 +3921,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
// Set either by a specific value or to a default if not specified.
if (Opts.OpenMPIsDevice && (Args.hasArg(OPT_fopenmp_target_debug) ||
Args.hasArg(OPT_fopenmp_target_debug_EQ))) {
-if (Opts.OpenMPTargetNewRuntime) {
-  Opts.OpenMPTargetDebug = getLastArgIntValue(
-      Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
-  if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
-    Opts.OpenMPTargetDebug = 1;
-} else {
-  Diags.Report(diag::err_drv_debug_no_new_runtime);
-}
+Opts.OpenMPTargetDebug = getLastArgIntValue(
+    Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
+if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
+  Opts.OpenMPTargetDebug = 1;
}
-if (Opts.OpenMPIsDevice && Opts.OpenMPTargetNewRuntime) {
+if (Opts.OpenMPIsDevice) {
if (Args.hasArg(OPT_fopenmp_assume_teams_oversubscription))
Opts.OpenMPTeamSubscription = true;
if (Args.hasArg(OPT_fopenmp_assume_threads_oversubscription))

View File

@ -1,6 +1,6 @@
// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
// RUN: | FileCheck %s
// verify the tools invocations
@ -14,7 +14,7 @@
// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc"
// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget"
-// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
+// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PHASES %s
// phases
// CHECK-PHASES: 0: input, "{{.*}}amdgpu-openmp-toolchain.c", c, (host-openmp)
@ -36,13 +36,13 @@
// CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp)
// handling of --libomptarget-amdgcn-bc-path
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
// CHECK-LIBOMPTARGET: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc"{{.*}}
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
// CHECK-NOGPULIB-NOT: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}libomptarget-amdgcn-gfx803.bc"{{.*}}
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"],
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang",{{.*}} output: "[[HOST_BC:.*]]"
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]"], output: "[[HOST_S:.*]]"
@ -56,13 +56,13 @@
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[HOST_O]]", "[[OFFLOAD_O]]"], output:
// verify the llc is invoked for textual assembly output
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-SAVE-ASM
// CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=asm" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.s"
// CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.o"
// check the handling of -c
-// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-C
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang",
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang",{{.*}}output: "[[HOST_BC:.*]]"
@ -72,8 +72,8 @@
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang::as"
// CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler"
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
// CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"
-// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
+// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
// CHECK-LIB-DEVICE: {{.*}}llvm-link{{.*}}ocml.bc"{{.*}}ockl.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"

View File

@ -155,43 +155,24 @@
// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime -save-temps -no-canonical-prefixes %s 2>&1 \
+// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB %s
/// Specify the directory containing the bitcode lib, check clang picks the right one
// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-DIR %s
-/// Check with the new runtime enabled
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \
-// RUN: -save-temps -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW %s
-/// Check with new runtime and specifying the directory
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget -save-temps \
-// RUN: -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW-DIR %s
/// Create a bogus bitcode library and find it with LIBRARY_PATH
// RUN: env LIBRARY_PATH=%S/Inputs/libomptarget/subdir %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s
// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_35.bc
-// CHK-BCLIB-NEW: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-new-nvptx-test.bc
-// CHK-BCLIB-NEW-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-new-nvptx-sm_35.bc
// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_35.bc
// CHK-BCLIB-NOT: {{error:|warning:}}
@ -204,7 +185,7 @@
// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s
-// CHK-BCLIB-WARN: no library 'libomptarget-new-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library
+// CHK-BCLIB-WARN: no library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library
/// ###########################################################################

View File

@ -1,12 +1,12 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex "__omp_rtl_"
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// expected-no-diagnostics
#ifndef HEADER

View File

@ -38,13 +38,11 @@ endif()
# This is a list of all the targets that are supported/tested right now.
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newDriver")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newDriver")
# Once the plugins for the different targets are validated, they will be added to
@ -81,7 +79,6 @@ set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${LIBOMP_LIBRARY_DIR}" CACHE STRING
# Build offloading plugins and device RTLs if they are available.
add_subdirectory(plugins)
-add_subdirectory(deviceRTLs)
add_subdirectory(DeviceRTL)
add_subdirectory(tools)

View File

@ -180,7 +180,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)
list(APPEND bc_files ${outfile})
endforeach()
set(bclib_name "libomptarget-new-${target_name}-${target_cpu}.bc")
set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@ -212,7 +212,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
set(bclib_target_name "omptarget-new-${target_name}-${target_cpu}-bc")
set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})

View File

@ -1,14 +0,0 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ##===----------------------------------------------------------------------===##
#
# Build a device RTL for each available machine.
#
##===----------------------------------------------------------------------===##
add_subdirectory(amdgcn)
add_subdirectory(nvptx)

View File

@ -1,193 +0,0 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# Build the AMDGCN Device RTL bitcode library using clang -ffreestanding
#
##===----------------------------------------------------------------------===##
set(LIBOMPTARGET_BUILD_AMDGCN_BCLIB FALSE CACHE BOOL
"Can be set to true to enable building this library.")
if (NOT LIBOMPTARGET_BUILD_AMDGCN_BCLIB)
libomptarget_say("Not building AMDGCN device RTL: Disabled by LIBOMPTARGET_BUILD_AMDGCN_BCLIB")
return()
endif()
if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
return()
endif()
# Copied from nvptx CMakeLists
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
set(aux_triple x86_64-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(aux_triple powerpc64le-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(aux_triple aarch64-unknown-linux-gnu)
else()
libomptarget_say("Not building AMDGCN device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
return()
endif()
if (LLVM_DIR)
# Builds that use pre-installed LLVM have LLVM_DIR set.
find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
find_program(LINK_TOOL llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
find_program(OPT_TOOL opt PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
if ((NOT CLANG_TOOL) OR (NOT LINK_TOOL) OR (NOT OPT_TOOL))
libomptarget_say("Not building AMDGCN device RTL. Missing clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} or opt: ${OPT_TOOL}")
return()
else()
libomptarget_say("Building AMDGCN device RTL. Using clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} and opt: ${OPT_TOOL}")
endif()
elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
# LLVM in-tree builds may use CMake target names to discover the tools.
set(CLANG_TOOL $<TARGET_FILE:clang>)
set(LINK_TOOL $<TARGET_FILE:llvm-link>)
set(OPT_TOOL $<TARGET_FILE:opt>)
libomptarget_say("Building AMDGCN device RTL. Using clang from in-tree build")
else()
libomptarget_say("Not building AMDGCN device RTL. No appropriate clang found")
return()
endif()
project(omptarget-amdgcn)
add_custom_target(omptarget-amdgcn ALL)
#optimization level
set(optimization_level 2)
# Activate RTL message dumps if requested by the user.
if(LIBOMPTARGET_NVPTX_DEBUG)
set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g)
endif()
get_filename_component(devicertl_base_directory
${CMAKE_CURRENT_SOURCE_DIR}
DIRECTORY)
set(cuda_sources
${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip
${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip
${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip
${devicertl_base_directory}/common/src/cancel.cu
${devicertl_base_directory}/common/src/critical.cu
${devicertl_base_directory}/common/src/data_sharing.cu
${devicertl_base_directory}/common/src/libcall.cu
${devicertl_base_directory}/common/src/loop.cu
${devicertl_base_directory}/common/src/omp_data.cu
${devicertl_base_directory}/common/src/omptarget.cu
${devicertl_base_directory}/common/src/parallel.cu
${devicertl_base_directory}/common/src/reduction.cu
${devicertl_base_directory}/common/src/support.cu
${devicertl_base_directory}/common/src/shuffle.cpp
${devicertl_base_directory}/common/src/sync.cu
${devicertl_base_directory}/common/src/task.cu)
set(h_files
${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h
${devicertl_base_directory}/common/debug.h
${devicertl_base_directory}/common/omptarget.h
${devicertl_base_directory}/common/omptargeti.h
${devicertl_base_directory}/common/state-queue.h
${devicertl_base_directory}/common/state-queuei.h
${devicertl_base_directory}/common/support.h)
# for both in-tree and out-of-tree build
if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR})
else()
set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY})
endif()
# create gfx bitcode libraries
set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx90a gfx1010 gfx1030 gfx1031)
if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
endif()
# Prepend -I to each list element
set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I")
macro(add_cuda_bc_library)
set(cu_cmd ${CLANG_TOOL}
-xc++
-c
-mllvm -openmp-opt-disable
-std=c++14
-ffreestanding
-target amdgcn-amd-amdhsa
-emit-llvm
-Xclang -aux-triple -Xclang ${aux_triple}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-D__AMDGCN__
-Xclang -target-cpu -Xclang ${mcpu}
-fvisibility=hidden
-Wno-unused-value
-nogpulib
-O${optimization_level}
${CUDA_DEBUG}
-I${CMAKE_CURRENT_SOURCE_DIR}/src
-I${devicertl_base_directory}/common/include
-I${devicertl_base_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN})
set(bc1_files)
foreach(file ${ARGN})
get_filename_component(fname ${file} NAME_WE)
set(bc1_filename ${fname}.${mcpu}.bc)
add_custom_command(
OUTPUT ${bc1_filename}
COMMAND ${cu_cmd} ${file} -o ${bc1_filename}
DEPENDS ${file} ${h_files})
list(APPEND bc1_files ${bc1_filename})
endforeach()
add_custom_command(
OUTPUT linkout.cuda.${mcpu}.bc
COMMAND ${LINK_TOOL} ${bc1_files} -o linkout.cuda.${mcpu}.bc
DEPENDS ${bc1_files})
list(APPEND bc_files linkout.cuda.${mcpu}.bc)
endmacro()
set(libname "omptarget-amdgcn")
set(toolchain_deps "")
if(TARGET llvm-link)
list(APPEND toolchain_deps llvm-link)
endif()
if(TARGET opt)
list(APPEND toolchain_deps opt)
endif()
foreach(mcpu ${mcpus})
set(bc_files)
add_cuda_bc_library(${cuda_sources})
set(bc_libname lib${libname}-${mcpu}.bc)
add_custom_command(
OUTPUT ${bc_libname}
COMMAND ${LINK_TOOL} ${bc_files} | ${OPT_TOOL} --always-inline -o ${OUTPUTDIR}/${bc_libname}
DEPENDS ${bc_files} ${toolchain_deps})
add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname})
install(FILES ${OUTPUTDIR}/${bc_libname}
DESTINATION "${OPENMP_INSTALL_LIBDIR}"
)
endforeach()

View File

@ -1,19 +0,0 @@
//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _AMDGCN_INTERFACE_H_
#define _AMDGCN_INTERFACE_H_
#include <stdint.h>
#define EXTERN extern "C"
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads();
#endif

View File

@ -1,34 +0,0 @@
//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock
// cannot be implemented - if one thread gets the lock, it can't continue on to
// the next instruction in order to do anything as the other threads are waiting
// to take the lock.
// These functions will be implemented to provide the documented semantics for
// a SIMD => wavefront mapping once that is implemented.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
static void warn() {
PRINT0(LD_ALL, "Locks are not supported in this thread mapping model");
}
void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
int __kmpc_impl_test_lock(omp_lock_t *lock) {
warn();
return 0;
}
#pragma omp end declare target

View File

@ -1,64 +0,0 @@
//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "target_impl.h"
// Partially derived fom hcc_detail/device_functions.h
// HW_ID Register bit structure
// WAVE_ID 3:0 Wave buffer slot number. 0-9.
// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
// PIPE_ID 7:6 Pipeline from which the wave was dispatched.
// CU_ID 11:8 Compute Unit the wave is assigned to.
// SH_ID 12 Shader Array (within an SE) the wave is assigned to.
// SE_ID 14:13 Shader Engine the wave is assigned to.
// TG_ID 19:16 Thread-group ID
// VM_ID 23:20 Virtual Memory ID
// QUEUE_ID 26:24 Queue from which this wave was dispatched.
// STATE_ID 29:27 State ID (graphics only, not compute).
// ME_ID 31:30 Micro-engine ID.
enum {
HW_ID = 4, // specify that the hardware register to read is HW_ID
HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits
HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register
HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits
HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register
};
// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit
// immediate and returns a 32 bit value.
// The encoding of the immediate parameter is:
// ID 5:0 Which register to read from
// OFFSET 10:6 Range: 0..31
// WIDTH 15:11 Range: 1..32
// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width)
// where hwreg forms a 16 bit immediate encoded by the assembler thus:
// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11);
// }
#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11))
// Note: The results can be changed by a context switch
// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper
// bound on how many compute units are available. Some values in this
// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs.
EXTERN uint32_t __kmpc_impl_smid() {
uint32_t cu_id = __builtin_amdgcn_s_getreg(
ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
uint32_t se_id = __builtin_amdgcn_s_getreg(
ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
#pragma omp end declare target

View File

@ -1,83 +0,0 @@
//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Declarations and definitions of target specific functions and constants
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
#define OMPTARGET_AMDGCN_TARGET_IMPL_H
#ifndef __AMDGCN__
#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
#endif
#include "amdgcn_interface.h"
#include <stddef.h>
#include <stdint.h>
// subset of inttypes.h
#define PRId64 "ld"
#define PRIu64 "lu"
typedef uint64_t __kmpc_impl_lanemask_t;
#define INLINE inline
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
#define PLUGIN_ACCESSIBLE \
__attribute__((used)) /* Don't discard values the plugin reads */ \
__attribute__((weak)) /* We may have multiple definitions */ \
__attribute__((retain)) /* Also needed to keep values alive */ \
__attribute__((visibility("protected"))) /* Access via SHT_HASH */ \
__attribute__((section(".data"))) /* Not .bss, can write before load */
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
INLINE constexpr const llvm::omp::GV &getGridValue() {
return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
}
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#define OMP_STATE_COUNT 32
#define MAX_SM 64
#define OMP_ACTIVE_PARALLEL_LEVEL 128
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {
__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
};
// The return code of printf is not checked in the call sites in this library.
// A call to a function named printf currently hits some special case handling
// for opencl, which translates to calls that do not presently exist for openmp
// Therefore, for now, stub out printf while building this library.
#define printf(...)
#endif

View File

@ -1,226 +0,0 @@
//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
#include "target_interface.h"
// Implementations initially derived from hcc
// Initialized with a 64-bit mask with bits set in positions less than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
uint32_t lane = GetLaneId();
int64_t ballot = __kmpc_impl_activemask();
uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
return mask & ballot;
}
// Initialized with a 64-bit mask with bits set in positions greater than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
uint32_t lane = GetLaneId();
if (lane == (WARPSIZE - 1))
return 0;
uint64_t ballot = __kmpc_impl_activemask();
uint64_t mask = (~((uint64_t)0)) << (lane + 1);
return mask & ballot;
}
EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); }
EXTERN double __kmpc_impl_get_wtime() {
// The intrinsics for measuring time have undocumented frequency
// This will probably need to be found by measurement on a number of
// architectures. Until then, return 0, which is very inaccurate as a
// timer but resolves the undefined symbol at link time.
return 0;
}
// Warp vote function
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
return __builtin_amdgcn_read_exec();
}
static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
__atomic_thread_fence(__ATOMIC_ACQUIRE);
uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;
// Partial barrier implementation for amdgcn.
// Uses two 16 bit unsigned counters. One for the number of waves to have
// reached the barrier, and one to count how many times the barrier has been
// passed. These are packed in a single atomically accessed 32 bit integer.
// Low bits for the number of waves, assumed zero before this call.
// High bits to count the number of times the barrier has been passed.
// precondition: num_waves != 0;
// invariant: num_waves * WARPSIZE == num_threads;
// precondition: num_waves < 0xffffu;
// Increment the low 16 bits once, using the lowest active thread.
uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
bool isLowest = GetLaneId() == lowestActiveThread;
if (isLowest) {
uint32_t load = __atomic_fetch_add(barrier_state, 1,
__ATOMIC_RELAXED); // commutative
// Record the number of times the barrier has been passed
uint32_t generation = load & 0xffff0000u;
if ((load & 0x0000ffffu) == (num_waves - 1)) {
// Reached num_waves in low bits so this is the last wave.
// Set low bits to zero and increment high bits
load += 0x00010000u; // wrap is safe
load &= 0xffff0000u; // because bits zeroed second
// Reset the wave counter and release the waiting waves
__atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
} else {
// more waves still to go, spin until generation counter changes
do {
__builtin_amdgcn_s_sleep(0);
load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);
}
}
__atomic_thread_fence(__ATOMIC_RELEASE);
}
uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)
EXTERN void __kmpc_impl_target_init() {
// Don't have global ctors, and shared memory is not zero init
__atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
}
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
}
namespace {
uint32_t get_grid_dim(uint32_t n, uint16_t d) {
uint32_t q = n / d;
return q + (n > q * d);
}
uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
uint16_t group_size) {
uint32_t r = grid_size - group_id * group_size;
return (r < group_size) ? r : group_size;
}
} // namespace
EXTERN int __kmpc_get_hardware_num_blocks() {
return get_grid_dim(__builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());
}
EXTERN int __kmpc_get_hardware_num_threads_in_block() {
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
__builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());
}
EXTERN unsigned __kmpc_get_warp_size() {
return WARPSIZE;
}
EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
EXTERN unsigned GetLaneId() {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
return __kmpc_get_hardware_num_threads_in_block();
}
// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
}
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
uint32_t R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
(void)__atomic_compare_exchange(Address, &Compare, &Val, false,
__ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
return Compare;
}
unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
unsigned long long Val) {
unsigned long long R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
unsigned long long Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
// Stub implementations
// Weak to allow overriding by local versions while comparing different
// potential implementations
__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
return nullptr;
}
__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}
EXTERN
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
return -1;
}
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
}
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
return (((uint64_t)hi) << 32) | (uint64_t)lo;
}
EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}
EXTERN void __kmpc_impl_threadfence() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}
EXTERN void __kmpc_impl_threadfence_block() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
EXTERN void __kmpc_impl_threadfence_system() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// Calls to the AMDGCN layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); }
EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
#pragma omp end declare target

View File

@ -1,44 +0,0 @@
//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Macros for allocating variables in different address spaces.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_ALLOCATOR_H
#define OMPTARGET_ALLOCATOR_H
#if _OPENMP
// Follows the pattern in interface.h
// Clang sema checks this type carefully, needs to closely match that from omp.h
typedef enum omp_allocator_handle_t {
omp_null_allocator = 0,
omp_default_mem_alloc = 1,
omp_large_cap_mem_alloc = 2,
omp_const_mem_alloc = 3,
omp_high_bw_mem_alloc = 4,
omp_low_lat_mem_alloc = 5,
omp_cgroup_mem_alloc = 6,
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
} omp_allocator_handle_t;
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
#define SHARED(NAME) \
NAME [[clang::loader_uninitialized]]; \
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
#define EXTERN_SHARED(NAME) \
NAME; \
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
#endif
#endif // OMPTARGET_ALLOCATOR_H

View File

@ -1,293 +0,0 @@
//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains debug macros to be used in the application.
//
// Usage guide
//
// PRINT0(flag, str) : if debug flag is on, print (no arguments)
// PRINT(flag, str, args) : if debug flag is on, print (arguments)
// DON(flag) : return true if debug flag is on
//
// ASSERT(flag, cond, str, args): if test flag is on, test the condition
// if the condition is false, print str+args
// and assert.
// CAUTION: cond may be evaluate twice
// AON(flag) : return true if test flag is on
//
// WARNING(flag, str, args) : if warning flag is on, print the warning
// WON(flag) : return true if warning flag is on
//
//===----------------------------------------------------------------------===//
#ifndef _OMPTARGET_NVPTX_DEBUG_H_
#define _OMPTARGET_NVPTX_DEBUG_H_
#include "target_interface.h"
////////////////////////////////////////////////////////////////////////////////
// set desired level of debugging
////////////////////////////////////////////////////////////////////////////////
#define LD_SET_NONE 0ULL /* none */
#define LD_SET_ALL -1ULL /* all */
// pos 1
#define LD_SET_LOOP 0x1ULL /* basic loop */
#define LD_SET_LOOPD 0x2ULL /* basic loop */
#define LD_SET_PAR 0x4ULL /* basic parallel */
#define LD_SET_PARD 0x8ULL /* basic parallel */
// pos 2
#define LD_SET_SYNC 0x10ULL /* sync info */
#define LD_SET_SYNCD 0x20ULL /* sync info */
#define LD_SET_WAIT 0x40ULL /* state when waiting */
#define LD_SET_TASK 0x80ULL /* print task info (high level) */
// pos 3
#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */
#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */
#define LD_SET_ENV 0x400ULL /* env info */
#define LD_SET_CANCEL 0x800ULL /* print cancel info */
// pos 4
#define LD_SET_MEM 0x1000ULL /* malloc / free */
////////////////////////////////////////////////////////////////////////////////
// set the desired flags to print selected output.
// these are some examples of possible definitions that can be used for
// debugging.
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
// on cuda buffer
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)
#ifndef OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
#elif OMPTARGET_NVPTX_DEBUG
#warning debug is used, not good for measurements
#endif
////////////////////////////////////////////////////////////////////////////////
// set desired level of asserts
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// available flags
#define LT_SET_NONE 0x0 /* unsafe */
#define LT_SET_SAFETY \
0x1 /* check malloc type of stuff, input at creation, cheap */
#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */
////////////////////////////////////////////////////////////////////////////////
// set the desired flags
#ifndef OMPTARGET_NVPTX_TEST
#if OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
#else
#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
#endif
#endif
////////////////////////////////////////////////////////////////////////////////
// set desired level of warnings
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// available flags
#define LW_SET_ALL -1
#define LW_SET_NONE 0x0
#define LW_SET_ENV 0x1
#define LW_SET_INPUT 0x2
#define LW_SET_FUSSY 0x4
////////////////////////////////////////////////////////////////////////////////
// set the desired flags
#if OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
#else
#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
#endif
////////////////////////////////////////////////////////////////////////////////
// implementation for debug
////////////////////////////////////////////////////////////////////////////////
#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
#include "common/support.h"
template <typename... Arguments>
NOINLINE static void log(const char *fmt, Arguments... parameters) {
printf(fmt, (int)GetBlockIdInKernel(),
(int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
(int)GetLaneId(), parameters...);
}
#endif
#if OMPTARGET_NVPTX_TEST
template <typename... Arguments>
NOINLINE static void check(bool cond, const char *fmt,
Arguments... parameters) {
if (!cond) {
printf(fmt, (int)GetBlockIdInKernel(),
(int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
(int)GetLaneId(), parameters...);
__builtin_trap();
}
}
NOINLINE static void check(bool cond) {
if (!cond)
__builtin_trap();
}
#endif
// set flags that are tested (inclusion properties)
#define LD_ALL (LD_SET_ALL)
#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD)
#define LD_LOOPD (LD_SET_LOOPD)
#define LD_PAR (LD_SET_PAR | LD_SET_PARD)
#define LD_PARD (LD_SET_PARD)
// pos 2
#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD)
#define LD_SYNCD (LD_SET_SYNCD)
#define LD_WAIT (LD_SET_WAIT)
#define LD_TASK (LD_SET_TASK)
// pos 3
#define LD_IO (LD_SET_IO | LD_SET_IOD)
#define LD_IOD (LD_SET_IOD)
#define LD_ENV (LD_SET_ENV)
#define LD_CANCEL (LD_SET_CANCEL)
// pos 3
#define LD_MEM (LD_SET_MEM)
// implement
#if OMPTARGET_NVPTX_DEBUG
#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag))
#define PRINT0(_flag, _str) \
{ \
if (omptarget_device_environment.debug_level && DON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d>: " _str); \
} \
}
#define PRINT(_flag, _str, _args...) \
{ \
if (omptarget_device_environment.debug_level && DON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d>: " _str, _args); \
} \
}
#else
#define DON(_flag) (0)
#define PRINT0(flag, str)
#define PRINT(flag, str, _args...)
#endif
// for printing without worrying about precision, pointers...
#define P64(_x) ((unsigned long long)(_x))
////////////////////////////////////////////////////////////////////////////////
// early defs for test
////////////////////////////////////////////////////////////////////////////////
#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY)
#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY)
#define LT_FUSSY (LT_SET_FUSSY)
#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY
#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
#define ASSERT0(_flag, _cond, _str) \
{ \
if (TON(_flag)) { \
check(_cond); \
} \
}
#define ASSERT(_flag, _cond, _str, _args...) \
{ \
if (TON(_flag)) { \
check(_cond); \
} \
}
#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT
#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
#define ASSERT0(_flag, _cond, _str) \
{ \
if (TON(_flag)) { \
check((_cond), "<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n"); \
} \
}
#define ASSERT(_flag, _cond, _str, _args...) \
{ \
if (TON(_flag)) { \
check((_cond), "<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", \
_args); \
} \
}
#else
#define TON(_flag) (0)
#define ASSERT0(_flag, _cond, _str)
#define ASSERT(_flag, _cond, _str, _args...)
#endif
////////////////////////////////////////////////////////////////////////////////
// early defs for warning
#define LW_ALL (LW_SET_ALL)
#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV)
#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT)
#define LW_FUSSY (LW_SET_FUSSY)
#if OMPTARGET_NVPTX_WARNING
#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag))
#define WARNING0(_flag, _str) \
{ \
if (WON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str); \
} \
}
#define WARNING(_flag, _str, _args...) \
{ \
if (WON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, _args); \
} \
}
#else
#define WON(_flag) (0)
#define WARNING0(_flag, _str)
#define WARNING(_flag, _str, _args...)
#endif
#endif

View File

@ -1,405 +0,0 @@
case 0:
((void (*)(kmp_int32 *, kmp_int32 *
))fn)(&global_tid, &bound_tid
);
break;
case 1:
((void (*)(kmp_int32 *, kmp_int32 *
, void *))fn)(&global_tid, &bound_tid
, args[0]);
break;
case 2:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1]);
break;
case 3:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2]);
break;
case 4:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
);
break;
case 5:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4]);
break;
case 6:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5]);
break;
case 7:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6]);
break;
case 8:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
);
break;
case 9:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8]);
break;
case 10:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9]);
break;
case 11:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10]);
break;
case 12:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
);
break;
case 13:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12]);
break;
case 14:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13]);
break;
case 15:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14]);
break;
case 16:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
);
break;
case 17:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16]);
break;
case 18:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17]);
break;
case 19:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18]);
break;
case 20:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
);
break;
case 21:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20]);
break;
case 22:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21]);
break;
case 23:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22]);
break;
case 24:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
);
break;
case 25:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24]);
break;
case 26:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25]);
break;
case 27:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26]);
break;
case 28:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
);
break;
case 29:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28]);
break;
case 30:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28], args[29]);
break;
case 31:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28], args[29], args[30]);
break;
case 32:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28], args[29], args[30], args[31]
);
break;
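// Descriptive note (added for clarity, not in the original source): every case
// above casts `fn` to the fixed-arity signature that matches the number of
// captured arguments -- two kmp_int32 * thread-id pointers followed by one
// void * per argument -- and forwards args[0..N-1] explicitly, so the device
// runtime never issues a variadic call; one case exists for each supported
// argument count up to 32.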

View File

@ -1,94 +0,0 @@
//===-- target.h ---------- OpenMP device runtime target implementation ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Target region interfaces are deliberately simple interfaces that allow
// middle-end (=LLVM) passes to analyze and transform the code. Achieving good
// performance may require running the associated passes. However,
// implementations of this interface shall always be correct and as close to
// the code the user expects as possible.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
#define LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
#include <stdint.h>
extern "C" {
/// Forward declaration of the source location identifier "ident".
typedef struct ident ident_t;
/// The target region _kernel_ interface for GPUs
///
/// This deliberately simple interface provides the middle-end (=LLVM) with
/// easier means to reason about the semantics of the code and transform it as
/// well. The runtime calls are therefore also designed to carry sufficient
/// information necessary for optimizations.
///
///
/// Intended usage:
///
/// \code
/// void kernel(...) {
/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
/// /* UseGenericStateMachine */ true,
/// /* RequiresFullRuntime */ ... );
/// if (ThreadKind == -1) {
/// // User defined kernel code.
/// }
/// __kmpc_target_deinit(...);
/// }
/// \endcode
///
/// Which can be transformed to:
///
/// \code
/// void kernel(...) {
/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
/// /* UseGenericStateMachine */ false,
/// /* RequiresFullRuntime */ ... );
/// if (ThreadKind == -1) {
/// // User defined kernel code.
/// } else {
/// assume(ThreadKind == ThreadId);
/// // Custom, kernel-specific state machine code.
/// }
/// __kmpc_target_deinit(...);
/// }
/// \endcode
///
///
///{
/// Initialization
///
/// Must be called by all threads.
///
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
bool UseGenericStateMachine,
bool RequiresFullRuntime);
/// De-Initialization
///
/// Must be called by the main thread in generic mode, can be called by all
/// threads. Must be called by all threads in SPMD mode.
///
/// In non-SPMD, this function releases the workers trapped in a state machine
/// and also any memory dynamically allocated by the runtime.
///
/// \param Ident Source location identification, can be NULL.
///
void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime);
///}
}
#endif

View File

@ -1,102 +0,0 @@
//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Shuffle function implementations for all supported targets.
//
// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
//
//===----------------------------------------------------------------------===//
#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H
#include <stdint.h>
#pragma omp declare target
/// External shuffle API
///
///{
extern "C" {
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
}
///}
/// Forward declarations
///
///{
extern "C" {
unsigned GetLaneId();
unsigned __kmpc_get_warp_size();
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
}
///}
/// Fallback implementations of the shuffle sync idiom.
/// Unavailable at present (would error at link time if used).
///
///{
int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
int32_t Width);
///}
/// AMDGCN implementations of the shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
int32_t SrcLane) {
int Width = __kmpc_get_warp_size();
int Self = GetLaneId();
int Index = SrcLane + (Self & ~(Width - 1));
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
uint32_t LaneDelta, int32_t Width) {
int Self = GetLaneId();
int Index = Self + LaneDelta;
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
#pragma omp end declare variant
///}
/// NVPTX implementations of the shuffle and shuffle sync idiom.
///
///{
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
int32_t SrcLane) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
}
inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
uint32_t Delta, int32_t Width) {
int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
#pragma omp end declare variant
///}
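/// Illustrative sketch (added for clarity, not part of the original header):
/// a full-warp sum reduction built on the external shuffle API declared above.
/// It assumes the warp size is a power of two and that every lane of the warp
/// calls the function; the function name is hypothetical.
///
///{
inline int32_t __sketch_warp_reduce_sum(int32_t Val) {
  int16_t Width = (int16_t)__kmpc_get_warp_size();
  // Halve the number of live partial sums each step: pull the value held
  // `Delta` lanes to the right and accumulate it locally.
  for (int16_t Delta = Width / 2; Delta > 0; Delta /= 2)
    Val += __kmpc_shuffle_int32(Val, Delta, Width);
  return Val; // lane 0 now holds the sum over all lanes of the warp
}
///}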
#pragma omp end declare target
#endif

View File

@ -1,282 +0,0 @@
//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_H
#define OMPTARGET_H
#include "common/allocator.h"
#include "common/debug.h" // debug
#include "common/state-queue.h"
#include "common/support.h"
#include "interface.h" // interfaces with omp, compiler, and user
#include "target_impl.h"
#define OMPTARGET_NVPTX_VERSION 1.1
// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1
// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2
#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1
// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
struct __kmpc_data_sharing_slot {
__kmpc_data_sharing_slot *Next;
__kmpc_data_sharing_slot *Prev;
void *PrevSlotStackPtr;
void *DataEnd;
char Data[DS_Worker_Warp_Slot_Size];
};
////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state
class omptarget_nvptx_TaskDescr {
public:
// methods for flags
INLINE omp_sched_t GetRuntimeSched() const;
INLINE void SetRuntimeSched(omp_sched_t sched);
INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
INLINE int InL2OrHigherParallelRegion() const {
return items.flags & TaskDescr_InParL2P;
}
INLINE int IsParallelConstruct() const {
return items.flags & TaskDescr_IsParConstr;
}
INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
// methods for other fields
INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
prev = taskDescr;
}
// init & copy
INLINE void InitLevelZeroTaskDescr();
INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
uint16_t tid, uint16_t tnum);
INLINE void SaveLoopData();
INLINE void RestoreLoopData() const;
private:
// bits for flags: (6 used, 2 free)
//   3 bits (SchedMask) for the runtime schedule
//   1 bit (InPar) set if this thread has encountered one or more parallel
//     regions
//   1 bit (IsParConstr) set if the ICV is for a parallel region (false =
//     explicit task)
//   1 bit (InParL2P) set if this thread has encountered an L2 or higher
//     parallel region
static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
static const uint8_t TaskDescr_InPar = 0x10;
static const uint8_t TaskDescr_IsParConstr = 0x20;
static const uint8_t TaskDescr_InParL2P = 0x40;
struct SavedLoopDescr_items {
int64_t loopUpperBound;
int64_t nextLowerBound;
int64_t chunk;
int64_t stride;
kmp_sched_t schedule;
} loopData;
struct TaskDescr_items {
uint8_t flags; // 6 bit used (see flag above)
uint8_t unused;
uint16_t threadId; // thread id
uint64_t runtimeChunkSize; // runtime chunk size
} items;
omptarget_nvptx_TaskDescr *prev;
};
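// Illustrative worked example (added for clarity, not in the original header):
// with the flag layout above, the implicit task of an active L1 parallel
// region that was switched to dynamic scheduling (omp_sched_dynamic == 2,
// stored encoded as 2 - 1 == 1 in the low three bits) carries
//   items.flags == 0x1 | TaskDescr_InPar | TaskDescr_IsParConstr == 0x31,
// while CopyForExplicitTask() clears TaskDescr_IsParConstr, yielding 0x11.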
// build on kmp
typedef struct omptarget_nvptx_ExplicitTaskDescr {
omptarget_nvptx_TaskDescr
taskDescr; // omptarget_nvptx task description (must be first)
kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)
class omptarget_nvptx_WorkDescr {
public:
// access to data
INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
private:
omptarget_nvptx_TaskDescr masterTaskICV;
};
////////////////////////////////////////////////////////////////////////////////
class omptarget_nvptx_TeamDescr {
public:
// access to data
INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
return &levelZeroTaskDescr;
}
INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
return workDescrForActiveParallel;
}
// init
INLINE void InitTeamDescr();
INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
worker_rootS[wid].DataEnd =
&worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
// We currently do not have a next slot.
worker_rootS[wid].Next = 0;
worker_rootS[wid].Prev = 0;
worker_rootS[wid].PrevSlotStackPtr = 0;
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
}
private:
omptarget_nvptx_TaskDescr
levelZeroTaskDescr; // icv for team master initial thread
omptarget_nvptx_WorkDescr
workDescrForActiveParallel; // one, ONLY for the active par
ALIGN(16)
__kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number];
};
////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// multiple concurrent kernels are not supported at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
// task
INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
return &levelOneTaskDescr[tid];
}
INLINE void SetTopLevelTaskDescr(int tid,
omptarget_nvptx_TaskDescr *taskICV) {
topTaskDescr[tid] = taskICV;
}
INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
// schedule (for dispatch)
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
INLINE int64_t &Stride(int tid) { return stride[tid]; }
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
INLINE void InitThreadPrivateContext(int tid);
INLINE uint64_t &Cnt() { return cnt; }
private:
// team context for this team
omptarget_nvptx_TeamDescr teamContext;
// task ICV for implicit threads in the only parallel region
omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
// pointer where to find the current task ICV (top of the stack)
omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
// schedule (for dispatch)
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
int64_t chunk[MAX_THREADS_PER_TEAM];
int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
// state for dispatch with dyn/guided OR static (never use both at a time)
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
int64_t stride[MAX_THREADS_PER_TEAM];
uint64_t cnt;
};
/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
struct MemDataTy {
volatile unsigned keys[OMP_STATE_COUNT];
} MemData[MAX_SM] ALIGN(128);
INLINE static uint32_t hash(unsigned key) {
return key & (OMP_STATE_COUNT - 1);
}
public:
INLINE void Release();
INLINE const void *Acquire(const void *buf, size_t size);
};
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////
extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
extern uint32_t EXTERN_SHARED(usedMemIdx);
extern uint32_t EXTERN_SHARED(usedSlotIdx);
#if _OPENMP
extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
#else
extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
#endif
extern uint16_t EXTERN_SHARED(threadLimit);
extern uint16_t EXTERN_SHARED(threadsInTeam);
extern uint16_t EXTERN_SHARED(nThreads);
extern omptarget_nvptx_ThreadPrivateContext *
EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
extern int8_t EXTERN_SHARED(execution_param);
extern void *EXTERN_SHARED(ReductionScratchpadPtr);
////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////
typedef void *omptarget_nvptx_WorkFn;
extern omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
#include "common/omptargeti.h"
#endif

View File

@ -1,223 +0,0 @@
//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//
////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////
INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
// sched starts from 1..4; encode it as 0..3; so add 1 here
uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
return (omp_sched_t)rc;
}
INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
// sched starts from 1..4; encode it as 0..3; so sub 1 here
uint8_t val = ((uint8_t)sched) - 1;
// clear current sched
items.flags &= ~TaskDescr_SchedMask;
// set new sched
items.flags |= val;
}
INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
// slow method
// flag:
// default sched is static,
// dyn is off (unused now anyway, but may need to sample from host ?)
// not in parallel
items.flags = 0;
items.threadId = 0; // is master
items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
}
// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
omptarget_nvptx_TaskDescr *parentTaskDescr) {
// slow method
// flag:
// default sched is static,
// dyn is off (unused now anyway, but may need to sample from host ?)
// in L1 parallel
items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
items.threadId =
__kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
// called for 1st level)
items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
prev = parentTaskDescr;
}
INLINE void omptarget_nvptx_TaskDescr::CopyData(
omptarget_nvptx_TaskDescr *sourceTaskDescr) {
items = sourceTaskDescr->items;
}
INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
CopyData(sourceTaskDescr);
prev = sourceTaskDescr->prev;
}
INLINE void omptarget_nvptx_TaskDescr::CopyParent(
omptarget_nvptx_TaskDescr *parentTaskDescr) {
CopyData(parentTaskDescr);
prev = parentTaskDescr;
}
INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
omptarget_nvptx_TaskDescr *parentTaskDescr) {
CopyParent(parentTaskDescr);
items.flags = items.flags & ~TaskDescr_IsParConstr;
ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
omptarget_nvptx_TaskDescr *masterTaskDescr) {
CopyParent(masterTaskDescr);
// overwrite specific items;
items.flags |=
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
}
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
omptarget_nvptx_TaskDescr *workTaskDescr) {
Copy(workTaskDescr);
//
// overwrite specific items;
//
// The threadID should be __kmpc_get_hardware_thread_id_in_block() %
// GetMasterThreadID(). This is so that the serial master (first lane in the
// master warp) gets a threadId of 0. However, we know that this function is
// always called in a parallel region where only workers are active. The
// serial master thread never enters this region. When a parallel region is
// executed serially, the threadId is set to 0 elsewhere and the
// kmpc_serialized_* functions are called, which never activate this region.
items.threadId =
__kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
// called for 1st level)
}
INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
CopyParent(parentTaskDescr);
items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
items.threadId = tid;
}
INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
loopData.loopUpperBound =
omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
loopData.nextLowerBound =
omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
loopData.schedule =
omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
loopData.stride =
omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
}
INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
loopData.loopUpperBound;
omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
loopData.nextLowerBound;
omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
loopData.stride;
omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
loopData.schedule;
}
////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////
INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
ASSERT0(
LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
"Getting top level, tid is larger than allocated data structure size");
return topTaskDescr[tid];
}
INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
// levelOneTaskDescr is init when starting the parallel region
// top task descr is NULL (team master version will be fixed separately)
topTaskDescr[tid] = NULL;
// the following don't need to be init here; they are init when using dyn
// sched
// current_Event, events_Number, chunk, num_Iterations, schedule
}
////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
levelZeroTaskDescr.InitLevelZeroTaskDescr();
}
////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////
// Utility routines for CUDA threads
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
return omptarget_nvptx_threadPrivateContext->TeamContext();
}
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
return currTeamDescr.WorkDescr();
}
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}
////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////
INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
"SlotIdx is too big or uninitialized.");
ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
"MemIdx is too big or uninitialized.");
MemDataTy &MD = MemData[usedSlotIdx];
__kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
}
INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
size_t size) {
ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
"SlotIdx is too big or uninitialized.");
const unsigned sm = usedSlotIdx;
MemDataTy &MD = MemData[sm];
unsigned i = hash(GetBlockIdInKernel());
while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
i = hash(i + 1);
}
usedSlotIdx = sm;
usedMemIdx = i;
return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}
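// Illustrative note (added for clarity, not in the original source): hash()
// relies on OMP_STATE_COUNT being a power of two so that
// `key & (OMP_STATE_COUNT - 1)` is a cheap modulo. Acquire() probes linearly
// from hash(GetBlockIdInKernel()) until the atomic compare-and-swap flips a
// key from 0 to 1, and Release() returns the slot by storing 0 again. For
// example, with OMP_STATE_COUNT == 32, the block with id 37 starts probing at
// slot 37 & 31 == 5.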

View File

@ -1,31 +0,0 @@
//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to be used in the implementation of OpenMP cancel.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "interface.h"
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal) {
PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
// disabled
return 0;
}
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal) {
PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
// disabled
return 0;
}
#pragma omp end declare target
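// Illustrative note (added for clarity, not in the original source): because
// both entry points above unconditionally return 0, device code such as
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < N; ++i) {
//     if (error_detected(i)) {   // hypothetical user predicate
//       #pragma omp cancel for
//     }
//   }
//
// compiles, but the cancellation request is silently ignored on the device:
// __kmpc_cancel() reports "not cancelled" and the loop runs to completion.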

View File

@ -1,31 +0,0 @@
//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of critical with KMPC interface
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "interface.h"
EXTERN
void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *lck) {
PRINT0(LD_IO, "call to kmpc_critical()\n");
omp_set_lock((omp_lock_t *)lck);
}
EXTERN
void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *lck) {
PRINT0(LD_IO, "call to kmpc_end_critical()\n");
omp_unset_lock((omp_lock_t *)lck);
}
#pragma omp end declare target
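// Illustrative note (added for clarity, not in the original source): a device
// critical section such as
//
//   #pragma omp critical(update)
//   { shared_counter += 1; }   // hypothetical user code
//
// is lowered by the compiler to a __kmpc_critical(...) / __kmpc_end_critical(...)
// pair around the block, so with the implementation above every named critical
// simply serializes through the omp lock stored in its kmp_CriticalName.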

View File

@ -1,194 +0,0 @@
//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of data sharing environments
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
static constexpr unsigned MinBytes = 8;
static constexpr unsigned Alignment = 8;
/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; }
EXTERN void *llvm_omp_get_dynamic_shared() {
return __kmpc_get_dynamic_shared();
}
template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
struct alignas(32) ThreadStackTy {
static constexpr unsigned BytesPerThread = BPerThread;
static constexpr unsigned NumThreads = NThreads;
static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
unsigned char Data[NumThreads][BytesPerThread];
unsigned char Usage[NumThreads];
};
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
MAX_THREADS_PER_TEAM / 4>
WorkerSharedStack;
#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
int TID = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(TID)) {
// Main thread alone, use shared memory if space is available.
if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
MainSharedStack.Usage[0] += AlignedBytes;
return Ptr;
}
} else if (TID < WorkerSharedStack.NumThreads) {
if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
WorkerSharedStack.Usage[TID] += AlignedBytes;
return Ptr;
}
}
// Fallback to malloc
return SafeMalloc(Bytes, "AllocGlobalFallback");
}
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
int TID = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(TID)) {
if (Ptr >= &MainSharedStack.Data[0][0] &&
Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
MainSharedStack.Usage[0] -= AlignedBytes;
return;
}
} else if (TID < WorkerSharedStack.NumThreads) {
if (Ptr >= &WorkerSharedStack.Data[0][0] &&
Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
int TID = __kmpc_get_hardware_thread_id_in_block();
WorkerSharedStack.Usage[TID] -= AlignedBytes;
return;
}
}
SafeFree(Ptr, "FreeGlobalFallback");
}
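// Illustrative sketch (added for clarity, not in the original source): how
// compiler-generated code typically pairs the two entry points above when a
// local variable escapes to the threads of a parallel region. The variable
// name and size are hypothetical.
//
//   void *FramePtr = __kmpc_alloc_shared(sizeof(int)); // globalized `int x`
//   int *X = static_cast<int *>(FramePtr);
//   // ... share &X with the parallel region, use *X ...
//   __kmpc_free_shared(FramePtr, sizeof(int)); // freed in reverse allocation
//                                              // order (stack discipline)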
EXTERN void __kmpc_data_sharing_init_stack() {
for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
MainSharedStack.Usage[i] = 0;
for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
WorkerSharedStack.Usage[i] = 0;
}
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64
[[clang::loader_uninitialized]] static void
*SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
allocator(omp_pteam_mem_alloc)
// Begin a data sharing context. Maintain a list of references to shared
// variables. This list of references to shared variables will be passed
// to one or more threads.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
} else {
SharedMemVariableSharingSpacePtr =
(void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
}
*GlobalArgs = SharedMemVariableSharingSpacePtr;
}
// End a data sharing context. There is no need to have a list of refs
// to shared variables because the context in which those variables were
// shared has now ended. This should clean-up the list of references only
// without affecting the actual global storage of the variables.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
}
// This function will return a list of references to global variables. This
// is how the workers will get a reference to the globalized variable. The
// members of this list will be passed to the outlined parallel function
// preserving the order.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
*GlobalArgs = SharedMemVariableSharingSpacePtr;
}
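// Illustrative sketch (added for clarity, not in the original source): the
// intended hand-off between the three entry points above in generic mode. The
// argument count and the variables `a` and `b` are hypothetical.
//
//   // Main thread, before waking the workers:
//   void **GlobalArgs;
//   __kmpc_begin_sharing_variables(&GlobalArgs, /* nArgs */ 2);
//   GlobalArgs[0] = &a; GlobalArgs[1] = &b;
//
//   // Each worker, inside the outlined parallel function:
//   void **Args;
//   __kmpc_get_shared_variables(&Args);   // Args[0] == &a, Args[1] == &b
//
//   // Main thread, after the parallel region has finished:
//   __kmpc_end_sharing_variables();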
// This function is used to init static memory manager. This manager is used to
// manage statically allocated global memory. This memory is allocated by the
// compiler and used to correctly implement globalization of the variables in
// target, teams and distribute regions.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared,
const void **frame) {
if (is_shared) {
*frame = buf;
return;
}
if (isSPMDExecutionMode) {
if (__kmpc_get_hardware_thread_id_in_block() == 0) {
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
}
__kmpc_impl_syncthreads();
return;
}
ASSERT0(LT_FUSSY,
__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
"Must be called only in the target master thread.");
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
__kmpc_impl_threadfence();
}
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
int16_t is_shared) {
if (is_shared)
return;
if (isSPMDExecutionMode) {
__kmpc_impl_syncthreads();
if (__kmpc_get_hardware_thread_id_in_block() == 0) {
omptarget_nvptx_simpleMemoryManager.Release();
}
return;
}
__kmpc_impl_threadfence();
ASSERT0(LT_FUSSY,
__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
"Must be called only in the target master thread.");
omptarget_nvptx_simpleMemoryManager.Release();
}
#pragma omp end declare target

View File

@ -1,359 +0,0 @@
//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the OpenMP runtime functions that can be
// invoked by the user in an OpenMP region
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
EXTERN double omp_get_wtick(void) {
double rc = __kmpc_impl_get_wtick();
PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc);
return rc;
}
EXTERN double omp_get_wtime(void) {
double rc = __kmpc_impl_get_wtime();
PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
return rc;
}
EXTERN void omp_set_num_threads(int num) {
// Ignore it for SPMD mode.
if (__kmpc_is_spmd_exec_mode())
return;
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
if (num <= 0) {
WARNING0(LW_INPUT, "expected positive num; ignore\n");
} else if (parallelLevel[GetWarpId()] == 0) {
nThreads = num;
}
}
EXTERN int omp_get_num_threads(void) {
int rc = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
return rc;
}
EXTERN int omp_get_max_threads(void) {
if (parallelLevel[GetWarpId()] > 0)
// We're already in parallel region.
return 1; // default is 1 thread avail
// Not currently in a parallel region, return what was set.
int rc = 1;
if (parallelLevel[GetWarpId()] == 0)
rc = nThreads;
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
return rc;
}
EXTERN int omp_get_thread_limit(void) {
if (__kmpc_is_spmd_exec_mode())
return __kmpc_get_hardware_num_threads_in_block();
int rc = threadLimit;
PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
return rc;
}
EXTERN int omp_get_thread_num() {
int rc = GetOmpThreadId();
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
return rc;
}
EXTERN int omp_get_num_procs(void) {
int rc = GetNumberOfProcsInDevice(__kmpc_is_spmd_exec_mode());
PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
return rc;
}
EXTERN int omp_in_parallel(void) {
int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
return rc;
}
EXTERN int omp_in_final(void) {
// Treat all tasks as final. The spec may expect the runtime to track more
// precisely whether a task was explicitly marked final by the user, but this
// is not explicitly specified; we act as if the runtime may actively decide
// to turn a non-final task into a final one.
int rc = 1;
PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_dynamic(int flag) {
PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag);
}
EXTERN int omp_get_dynamic(void) {
int rc = 0;
PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_nested(int flag) {
PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
flag);
}
EXTERN int omp_get_nested(void) {
int rc = 0;
PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_max_active_levels(int level) {
PRINT(LD_IO,
"call omp_set_max_active_levels(%d) is ignored (no nested support)\n",
level);
}
EXTERN int omp_get_max_active_levels(void) {
int rc = 1;
PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
return rc;
}
EXTERN int omp_get_level(void) {
int level = __kmpc_parallel_level();
PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
return level;
}
EXTERN int omp_get_active_level(void) {
int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
return level;
}
EXTERN int omp_get_ancestor_thread_num(int level) {
if (__kmpc_is_spmd_exec_mode())
return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0;
int rc = -1;
// If level is 0 or all parallel regions are not active - return 0.
unsigned parLevel = parallelLevel[GetWarpId()];
if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
int totLevel = omp_get_level();
if (level <= totLevel) {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
int steps = totLevel - level;
PRINT(LD_IO, "backtrack %d steps\n", steps);
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
if (DON(LD_IOD)) {
// print current state
omp_sched_t sched = currTaskDescr->GetRuntimeSched();
PRINT(LD_ALL,
"task descr %s %d: %s, in par %d, rt sched %d,"
" chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
"ancestor", steps,
(currTaskDescr->IsParallelConstruct() ? "par" : "task"),
(int)currTaskDescr->InParallelRegion(), (int)sched,
currTaskDescr->RuntimeChunkSize(),
(int)currTaskDescr->ThreadId(), (int)threadsInTeam,
(int)nThreads);
}
if (currTaskDescr->IsParallelConstruct()) {
// found the level
if (!steps) {
rc = currTaskDescr->ThreadId();
break;
}
steps--;
}
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
} while (currTaskDescr);
ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
}
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 0;
}
PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
rc)
return rc;
}
EXTERN int omp_get_team_size(int level) {
if (__kmpc_is_spmd_exec_mode())
return level == 1 ? __kmpc_get_hardware_num_threads_in_block() : 1;
int rc = -1;
unsigned parLevel = parallelLevel[GetWarpId()];
// If level is 0 or all parallel regions are not active - return 1.
if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
rc = threadsInTeam;
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 1;
}
PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
return rc;
}
EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode only with uninitialized runtime.");
*kind = omp_sched_static;
*modifier = 1;
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
*kind = currTaskDescr->GetRuntimeSched();
*modifier = currTaskDescr->RuntimeChunkSize();
}
PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
(int)*kind, *modifier);
}
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
modifier);
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode only with uninitialized runtime.");
return;
}
if (kind >= omp_sched_static && kind < omp_sched_auto) {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
currTaskDescr->SetRuntimeSched(kind);
currTaskDescr->RuntimeChunkSize() = modifier;
PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",
(int)currTaskDescr->GetRuntimeSched(),
currTaskDescr->RuntimeChunkSize());
}
}
EXTERN omp_proc_bind_t omp_get_proc_bind(void) {
PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n");
return omp_proc_bind_true;
}
EXTERN int omp_get_num_places(void) {
PRINT0(LD_IO, "call omp_get_num_places() returns 0\n");
return 0;
}
EXTERN int omp_get_place_num_procs(int place_num) {
PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n");
return 0;
}
EXTERN void omp_get_place_proc_ids(int place_num, int *ids) {
PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n");
}
EXTERN int omp_get_place_num(void) {
PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n");
return 0;
}
EXTERN int omp_get_partition_num_places(void) {
PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n");
return 0;
}
EXTERN void omp_get_partition_place_nums(int *place_nums) {
PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n");
}
EXTERN int omp_get_cancellation(void) {
int rc = 0;
PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_default_device(int deviceId) {
PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n");
}
EXTERN int omp_get_default_device(void) {
PRINT0(LD_IO,
"call omp_get_default_device() is undef on device, returns 0\n");
return 0;
}
EXTERN int omp_get_num_devices(void) {
PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
return 0;
}
EXTERN int omp_get_num_teams(void) {
int rc = GetNumberOfOmpTeams();
PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
return rc;
}
EXTERN int omp_get_team_num() {
int rc = GetOmpTeamId();
PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
return rc;
}
// Unspecified on the device.
EXTERN int omp_get_initial_device(void) {
PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
return 0;
}
// Unused for now.
EXTERN int omp_get_max_task_priority(void) {
PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n");
return 0;
}
////////////////////////////////////////////////////////////////////////////////
// locks
////////////////////////////////////////////////////////////////////////////////
EXTERN void omp_init_lock(omp_lock_t *lock) {
__kmpc_impl_init_lock(lock);
PRINT0(LD_IO, "call omp_init_lock()\n");
}
EXTERN void omp_destroy_lock(omp_lock_t *lock) {
__kmpc_impl_destroy_lock(lock);
PRINT0(LD_IO, "call omp_destroy_lock()\n");
}
EXTERN void omp_set_lock(omp_lock_t *lock) {
__kmpc_impl_set_lock(lock);
PRINT0(LD_IO, "call omp_set_lock()\n");
}
EXTERN void omp_unset_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
PRINT0(LD_IO, "call omp_unset_lock()\n");
}
EXTERN int omp_test_lock(omp_lock_t *lock) {
int rc = __kmpc_impl_test_lock(lock);
PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
return rc;
}
#pragma omp end declare target
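// Illustrative sketch (added for clarity, not in the original source): a
// target region exercising the lock entry points above. The variables are
// hypothetical; each omp_* call below resolves to the device implementations
// in this file.
//
//   int Counter = 0;
//   #pragma omp target map(tofrom : Counter)
//   {
//     omp_lock_t Lock;
//     omp_init_lock(&Lock);
//     #pragma omp parallel
//     {
//       omp_set_lock(&Lock);    // -> __kmpc_impl_set_lock
//       ++Counter;
//       omp_unset_lock(&Lock);  // -> __kmpc_impl_unset_lock
//     }
//     omp_destroy_lock(&Lock);
//   }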

View File

@ -1,813 +0,0 @@
//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// template class that encapsulate all the helper functions
//
// T is loop iteration type (32 | 64) (unsigned | signed)
// ST is the signed version of T
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
public:
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling with chunk
// Generic implementation of OMP loop scheduling with static policy
/*! \brief Calculate the initial bounds and stride for a statically scheduled
 * loop.
 * @param[in] loc location in code of the call (not used here)
 * @param[in] global_tid global thread id
 * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
 * @param[in] plastiter pointer to the last-iteration flag
 * @param[in,out] plower pointer to the loop lower bound; on return it holds
 * the lower bound of the first chunk
 * @param[in,out] pupper pointer to the loop upper bound; on return it holds
 * the upper bound of the first chunk
 * @param[in,out] pstride pointer to the loop stride; on return it holds the
 * stride between two successive chunks executed by the same thread
 * @param[in] loop increment bump
 * @param[in] chunk chunk size
 */
// helper function for static chunk
INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
ST chunk, T entityId, T numberOfEntities) {
// each thread executes multiple chunks all of the same size, except
// the last one
// distance between two successive chunks
stride = numberOfEntities * chunk;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
// Say ub' is the beginning of the last chunk. Then whoever has a lower
// bound that differs from ub' by a multiple of the stride executes the
// last chunk.
T beginingLastChunk = inputUb - (inputUb % chunk);
last = ((beginingLastChunk - lb) % stride) == 0;
}
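// Illustrative worked example (added for clarity, not in the original
// source): for a loop with lb = 0, ub = 99, chunk = 10 and 4 entities,
// entity 1 gets lb = 10, ub = 19 and stride = 40, i.e. the chunks
// [10,19], [50,59], [90,99]. The beginning of the last chunk is
// 99 - (99 % 10) == 90, and since (90 - 10) % 40 == 0, entity 1 is the one
// flagged as `last`.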
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling without chunk
// helper function for static no chunk
INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
// No chunk size specified. Each thread or warp gets at most one
// chunk; chunks are all almost of equal size
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
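// Illustrative worked example (added for clarity, not in the original
// source): for lb = 0, ub = 99 and 8 entities, loopSize = 100,
// chunk = 100 / 8 = 12 and leftOver = 4, so entities 0-3 receive 13
// iterations each ([0,12], [13,25], [26,38], [39,51]) and entities 4-7
// receive 12 each, starting at lb = 4*12 + 4 = 52 for entity 4; stride is
// set to 100 so no entity picks up a second chunk.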
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
int32_t *plastiter, T *plower, T *pupper,
ST *pstride, ST chunk,
bool IsSPMDExecutionMode) {
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.
// Assume we are in teams region or that we use a single block
// per target region
ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
// All warps that are in excess of the maximum requested, do
// not execute the loop
PRINT(LD_LOOP,
"OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
"%d, num tids %d\n",
(int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
(int)numberOfActiveOMPThreads);
ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
"current thread is not needed here; error");
// copy
int lastiter = 0;
T lb = *plower;
T ub = *pupper;
ST stride = *pstride;
// init
switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
case kmp_sched_static_chunk: {
if (chunk > 0) {
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
} // note: if chunk <=0, use nochunk
case kmp_sched_static_balanced_chunk: {
if (chunk > 0) {
// round up to make sure the chunk is enough to cover all iterations
T tripCount = ub - lb + 1; // +1 because ub is inclusive
T span = (tripCount + numberOfActiveOMPThreads - 1) /
numberOfActiveOMPThreads;
// perform chunk adjustment
chunk = (span + chunk - 1) & ~(chunk - 1);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
T oldUb = ub;
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
if (ub > oldUb)
ub = oldUb;
break;
}
} // note: if chunk <=0, use nochunk
case kmp_sched_static_nochunk: {
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_static_chunk: {
if (chunk > 0) {
ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
GetNumberOfOmpTeams());
break;
} // note: if chunk <=0, use nochunk
}
case kmp_sched_distr_static_nochunk: {
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
GetNumberOfOmpTeams());
break;
}
case kmp_sched_distr_static_chunk_sched_static_chunkone: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
break;
}
default: {
ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
(int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
}
// copy back
*plastiter = lastiter;
*plower = lb;
*pupper = ub;
*pstride = stride;
PRINT(LD_LOOP,
"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
"%d\n",
(int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
(long long)(*plower), (long long)(*pupper), (long long)(*pstride),
(int)lastiter);
}
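// Worked example for the kmp_sched_static_balanced_chunk case above
// (illustration only; the bit-mask rounding is exact when chunk is a power
// of two): with tripCount = 100, 8 active threads and chunk = 4,
// span = (100 + 7) / 8 = 13 and chunk is adjusted to (13 + 3) & ~3 = 16.
// Thread 6 then covers [96, 111], clamped to [96, 99] and flagged as the
// last chunk, while thread 7 gets an empty range.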
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch Init
INLINE static int OrderedSchedule(kmp_sched_t schedule) {
return schedule >= kmp_sched_ordered_first &&
schedule <= kmp_sched_ordered_last;
}
INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st,
ST chunk) {
if (isRuntimeUninitialized()) {
// In SPMD mode no need to check parallelism level - dynamic scheduling
// may appear only in L2 parallel regions with lightweight runtime.
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected SPMD mode.");
return;
}
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum,
"current thread is not needed here; error");
/* Currently just ignore the monotonic and non-monotonic modifiers
* (the compiler isn't producing them yet anyway).
* When it is we'll want to look at them somewhere here and use that
* information to add to our schedule choice. We shouldn't need to pass
* them on, they merely affect which schedule we can legally choose for
* various dynamic cases. (In particular, whether or not a stealing scheme
* is legal).
*/
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
// Process schedule.
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
if (OrderedSchedule(schedule))
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
(long)tnum, (long long)tripCount, (int)schedule);
schedule = kmp_sched_static_chunk;
chunk = tripCount; // one thread gets the whole loop
} else if (schedule == kmp_sched_runtime) {
// process runtime
omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
chunk = currTaskDescr->RuntimeChunkSize();
switch (rtSched) {
case omp_sched_static: {
if (chunk > 0)
schedule = kmp_sched_static_chunk;
else
schedule = kmp_sched_static_nochunk;
break;
}
case omp_sched_auto: {
schedule = kmp_sched_static_chunk;
chunk = 1;
break;
}
case omp_sched_dynamic:
case omp_sched_guided: {
schedule = kmp_sched_dynamic;
break;
}
}
PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
(long long)chunk);
} else if (schedule == kmp_sched_auto) {
schedule = kmp_sched_static_chunk;
chunk = 1;
PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
(long long)chunk);
} else {
PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
(long long)chunk);
ASSERT(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"unknown schedule %d & chunk %lld\n", (int)schedule,
(long long)chunk);
}
// init schedules
if (schedule == kmp_sched_static_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
(int)tnum,
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
} else if (schedule == kmp_sched_static_balanced_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
// round up to make sure the chunk is enough to cover all iterations
T span = (tripCount + tnum - 1) / tnum;
// perform chunk adjustment
chunk = (span + chunk - 1) & ~(chunk - 1);
T oldUb = ub;
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
if (ub > oldUb)
ub = oldUb;
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
(int)tnum,
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
} else if (schedule == kmp_sched_static_nochunk) {
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
(int)tnum,
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
// save data
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
if (chunk < 1)
chunk = 1;
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
__kmpc_barrier(loc, threadId);
if (tid == 0) {
omptarget_nvptx_threadPrivateContext->Cnt() = 0;
__kmpc_impl_threadfence_block();
}
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
", chunk %" PRIu64 "\n",
(int)tnum,
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->Chunk(tid));
}
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val,
int leader) {
uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_sync(active, hi, leader);
lo = __kmpc_impl_shfl_sync(active, lo, leader);
return __kmpc_impl_pack(lo, hi);
}
INLINE static uint64_t NextIter() {
__kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
uint32_t leader = __kmpc_impl_ffs(active) - 1;
uint32_t change = __kmpc_impl_popc(active);
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
uint64_t warp_res;
if (rank == 0) {
warp_res = __kmpc_atomic_add(
(unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
(unsigned long long)change);
}
warp_res = Shuffle(active, warp_res, leader);
return warp_res + rank;
}
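// Illustration of the warp-aggregated increment above (hypothetical values):
// with active mask 0b1011 (lanes 0, 1 and 3), the leader is lane 0 and
// change = 3. If Cnt was 10, the leader's single atomic add returns 10 and,
// after the shuffle, lanes 0, 1 and 3 receive iteration numbers 10, 11 and
// 12 respectively -- one atomic operation instead of three.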
INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
T loopLowerBound, T loopUpperBound) {
T N = NextIter();
lb = loopLowerBound + N * chunkSize;
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
// a. lb and ub < loopUpperBound --> NOT_FINISHED
// b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
// NOT_FINISHED
// c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
// a.
if (lb <= loopUpperBound && ub < loopUpperBound) {
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
(long long)lb, (long long)ub, (long long)loopUpperBound);
return NOT_FINISHED;
}
// b.
if (lb <= loopUpperBound) {
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
(long long)lb, (long long)ub, (long long)loopUpperBound);
ub = loopUpperBound;
return LAST_CHUNK;
}
// c. if we are here, we are in case 'c'
lb = loopUpperBound + 2;
ub = loopUpperBound + 1;
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
(long long)ub, (long long)loopUpperBound);
return FINISHED;
}
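// Illustration of the three outcomes (hypothetical values): with
// loopLowerBound = 0, loopUpperBound = 9 and chunkSize = 4, N = 0 yields
// [0, 3] (NOT_FINISHED), N = 2 yields [8, 11] which is clipped to [8, 9]
// (LAST_CHUNK), and N = 3 yields lb = 12 > 9, so the bounds are set to an
// empty range and FINISHED is returned.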
INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
T *plower, T *pupper, ST *pstride) {
if (isRuntimeUninitialized()) {
// In SPMD mode no need to check parallelism level - dynamic scheduling
// may appear only in L2 parallel regions with lightweight runtime.
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected SPMD mode.");
if (*plast)
return DISPATCH_FINISHED;
*plast = 1;
return DISPATCH_NOTFINISHED;
}
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock();
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
// xxx reduce to one
if (schedule == kmp_sched_static_chunk ||
schedule == kmp_sched_static_nochunk) {
T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
// finished?
if (myLb > ub) {
PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
(long long)myLb, (long long)ub);
return DISPATCH_FINISHED;
}
// not finished, save current bounds
ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
*plower = myLb;
T myUb = myLb + chunk - 1; // Clang uses i <= ub
if (myUb > ub)
myUb = ub;
*pupper = myUb;
*plast = (int32_t)(myUb == ub);
// increment next lower bound by the stride
ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
(long long)*plower, (long long)*pupper);
return DISPATCH_NOTFINISHED;
}
ASSERT0(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"bad sched");
T myLb, myUb;
int finished = DynamicNextChunk(
myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
if (finished == FINISHED)
return DISPATCH_FINISHED;
// not finished (either not finished or last chunk)
*plast = (int32_t)(finished == LAST_CHUNK);
*plower = myLb;
*pupper = myUb;
*pstride = 1;
PRINT(LD_LOOP,
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
"last %d\n",
(int)GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
(int)GetNumberOfWorkersInTeam(), (long long)*plower,
(long long)*pupper, (long long)*pstride, (int)*plast);
return DISPATCH_NOTFINISHED;
}
INLINE static void dispatch_fini() {
// nothing
}
////////////////////////////////////////////////////////////////////////////////
// end of template class that encapsulate all the helper functions
////////////////////////////////////////////////////////////////////////////////
};
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////
// init
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
int32_t schedule, int32_t lb, int32_t ub,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
int32_t schedule, uint32_t lb, uint32_t ub,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
int32_t schedule, int64_t lb, int64_t ub,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
int32_t schedule, uint64_t lb, uint64_t ub,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
// next
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
uint32_t *p_lb, uint32_t *p_ub,
int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
uint64_t *p_lb, uint64_t *p_ub,
int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
// fini
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}
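// For illustration only: a worksharing loop with schedule(dynamic, 4) over
// 0 <= i < N is typically lowered by the compiler into a call sequence of
// roughly the following shape (exact arguments and schedule enumerators are
// compiler-dependent):
//
//   int32_t last, lb, ub, st;
//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, 0, N - 1, 1, 4);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st) !=
//          DISPATCH_FINISHED)
//     for (int32_t i = lb; i <= ub; ++i)
//       body(i);
//   __kmpc_dispatch_fini_4(loc, tid);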
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, int32_t *plower,
int32_t *pupper, int32_t *pstride,
int32_t incr, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, int64_t *plower,
int64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, uint32_t *plower,
uint32_t *pupper, int32_t *pstride,
int32_t incr, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN
void __kmpc_for_static_init_4u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN
void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN
void __kmpc_for_static_init_8u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_distribute_static_fini\n");
}
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}
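// For illustration only: a loop with schedule(static) is typically lowered
// into a single init/fini pair around the strip-mined body (the exact
// schedule constant and arguments are compiler-dependent):
//
//   int32_t last = 0, lb = 0, ub = N - 1, st = 1;
//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_nochunk, &last, &lb,
//                            &ub, &st, /*incr=*/1, /*chunk=*/0);
//   for (int32_t i = lb; i <= ub; ++i)
//     body(i);
//   __kmpc_for_static_fini(loc, tid);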
#pragma omp end declare target

View File

@ -1,65 +0,0 @@
//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the data objects used on the GPU device.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/allocator.h"
#include "common/omptarget.h"
////////////////////////////////////////////////////////////////////////////////
// global device environment
////////////////////////////////////////////////////////////////////////////////
PLUGIN_ACCESSIBLE
DeviceEnvironmentTy omptarget_device_environment;
////////////////////////////////////////////////////////////////////////////////
// global data holding OpenMP state information
////////////////////////////////////////////////////////////////////////////////
// OpenMP will try to call its ctor if we don't add the attribute explicitly
[[clang::loader_uninitialized]] omptarget_nvptx_Queue<
omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
uint32_t SHARED(usedMemIdx);
uint32_t SHARED(usedSlotIdx);
// SHARED doesn't work with arrays so we add the attribute explicitly.
[[clang::loader_uninitialized]] uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
uint16_t SHARED(threadLimit);
uint16_t SHARED(threadsInTeam);
uint16_t SHARED(nThreads);
// Pointer to this team's OpenMP state object
omptarget_nvptx_ThreadPrivateContext *
SHARED(omptarget_nvptx_threadPrivateContext);
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
// copy of these variables for each kernel, instance, and team.
////////////////////////////////////////////////////////////////////////////////
omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// OpenMP kernel execution parameters
////////////////////////////////////////////////////////////////////////////////
int8_t SHARED(execution_param);
////////////////////////////////////////////////////////////////////////////////
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
void *SHARED(ReductionScratchpadPtr);
#pragma omp end declare target

View File

@ -1,259 +0,0 @@
//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the initialization code for the GPU
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "common/support.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////
extern omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext,
OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////
static void __kmpc_generic_kernel_init() {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
if (GetLaneId() == 0)
parallelLevel[GetWarpId()] = 0;
int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
if (threadIdInBlock != GetMasterThreadID())
return;
setExecutionParameters(OMP_TGT_EXEC_MODE_GENERIC, OMP_TGT_RUNTIME_INITIALIZED);
ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
"__kmpc_kernel_init() must be called by team master warp only!");
PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");
// Get a state object from the queue.
int slot = __kmpc_impl_smid() % MAX_SM;
usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[slot].Dequeue();
// init thread private
int threadId = 0;
omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
// init team context
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
currTeamDescr.InitTeamDescr();
// this thread will start execution... has to update its task ICV
// to point to the level zero task ICV. That ICV was init in
// InitTeamDescr()
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTeamDescr.LevelZeroTaskDescr());
// set number of threads and thread limit in team to started value
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
nThreads = GetNumberOfWorkersInTeam();
threadLimit = nThreads;
__kmpc_data_sharing_init_stack();
__kmpc_impl_target_init();
}
static void __kmpc_generic_kernel_deinit() {
PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");
// Enqueue omp state object for use by another team.
int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
// Done with work. Kill the workers.
omptarget_nvptx_workFn = 0;
}
static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) {
PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
setExecutionParameters(OMP_TGT_EXEC_MODE_SPMD,
RequiresFullRuntime ? OMP_TGT_RUNTIME_INITIALIZED
: OMP_TGT_RUNTIME_UNINITIALIZED);
int threadId = __kmpc_get_hardware_thread_id_in_block();
if (threadId == 0) {
usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
}
if (GetLaneId() == 0) {
parallelLevel[GetWarpId()] =
1 + (__kmpc_get_hardware_num_threads_in_block() > 1
? OMP_ACTIVE_PARALLEL_LEVEL
: 0);
}
__kmpc_data_sharing_init_stack();
if (!RequiresFullRuntime)
return;
//
// Team Context Initialization.
//
// In SPMD mode there is no master thread so use any cuda thread for team
// context initialization.
if (threadId == 0) {
// Get a state object from the queue.
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[usedSlotIdx].Dequeue();
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// init team context
currTeamDescr.InitTeamDescr();
}
__kmpc_impl_syncthreads();
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
//
// Initialize task descr for each thread.
//
omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
// init thread private from init value
int ThreadLimit = GetNumberOfProcsInTeam(/* IsSPMD */ true);
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)ThreadLimit);
}
static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.
if (!RequiresFullRuntime)
return;
__kmpc_impl_syncthreads();
int threadId = __kmpc_get_hardware_thread_id_in_block();
if (threadId == 0) {
// Enqueue omp state object for use by another team.
int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
}
}
// Return true if the current target region is executed in SPMD mode.
// NOTE: This function has to return 1 for SPMD mode, and 0 for generic mode.
// That's because `__kmpc_parallel_51` checks if it's already in a parallel
// region by comparing the parallel level with the return value of this
// function.
EXTERN int8_t __kmpc_is_spmd_exec_mode() {
return (execution_param & OMP_TGT_EXEC_MODE_SPMD) == OMP_TGT_EXEC_MODE_SPMD;
}
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
return !__kmpc_is_spmd_exec_mode() && __kmpc_is_generic_main_thread_id(Tid);
}
NOINLINE EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid) {
return GetMasterThreadID() == Tid;
}
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
static void __kmpc_target_region_state_machine(ident_t *Ident) {
int TId = __kmpc_get_hardware_thread_id_in_block();
do {
void* WorkFn = 0;
// Wait for the signal that we have a new work function.
__kmpc_barrier_simple_spmd(Ident, TId);
// Retrieve the work function from the runtime.
bool IsActive = __kmpc_kernel_parallel(&WorkFn);
// If there is nothing more to do, break out of the state machine by
// returning to the caller.
if (!WorkFn)
return;
if (IsActive) {
((void(*)(uint32_t,uint32_t))WorkFn)(0, TId);
__kmpc_kernel_end_parallel();
}
__kmpc_barrier_simple_spmd(Ident, TId);
} while (true);
}
EXTERN
int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
bool UseGenericStateMachine,
bool RequiresFullRuntime) {
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
int TId = __kmpc_get_hardware_thread_id_in_block();
if (IsSPMD)
__kmpc_spmd_kernel_init(RequiresFullRuntime);
else
__kmpc_generic_kernel_init();
if (IsSPMD) {
__kmpc_barrier_simple_spmd(Ident, TId);
return -1;
}
if (TId == GetMasterThreadID())
return -1;
// Enter the generic state machine if enabled and if this thread can possibly
// be an active worker thread.
//
// The latter check is important for NVIDIA Pascal (but not Volta) and AMD
// GPU. In those cases, a single thread can apparently satisfy a barrier on
// behalf of all threads in the same warp. Thus, it would not be safe for
// other threads in the main thread's warp to reach the first
// __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine
// before the main thread reaches its corresponding
// __kmpc_barrier_simple_spmd call: that would permit all active worker
// threads to proceed before the main thread has actually set
// omptarget_nvptx_workFn, and then they would immediately quit without
// doing any work. GetNumberOfWorkersInTeam() does not include any of the
// main thread's warp, so none of its threads can ever be active worker
// threads.
if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam())
__kmpc_target_region_state_machine(Ident);
return TId;
}
EXTERN
void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime) {
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
if (IsSPMD)
__kmpc_spmd_kernel_deinit(RequiresFullRuntime);
else
__kmpc_generic_kernel_deinit();
}
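// For illustration only: a generated target region entry point brackets the
// user code roughly as follows (mode flags and argument values depend on how
// the compiler classified the region):
//
//   int32_t TId = __kmpc_target_init(Ident, Mode,
//                                    /*UseGenericStateMachine=*/true,
//                                    /*RequiresFullRuntime=*/true);
//   if (TId == -1) {
//     // SPMD mode: every thread executes the body. Generic mode: only the
//     // main thread does; workers run the state machine instead.
//     ... target region body ...
//   }
//   __kmpc_target_deinit(Ident, Mode, /*RequiresFullRuntime=*/true);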
#pragma omp end declare target

View File

@ -1,341 +0,0 @@
//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
// while (not finished) {
//
// if (master) {
// sequential code, decide which par loop to do, or if finished
// __kmpc_kernel_prepare_parallel() // exec by master only
// }
// syncthreads // A
// __kmpc_kernel_parallel() // exec by all
// if (this thread is included in the parallel) {
// switch () for all parallel loops
// __kmpc_kernel_end_parallel() // exec only by threads in parallel
// }
//
//
// The reason we don't execute end_parallel for the threads not included
// in the parallel loop is that for each barrier in the parallel
// region, these non-included threads will cycle through the
// syncthread A. Thus they must preserve their current threadId, which
// is larger than the number of threads in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
uint16_t NThreadsICV,
uint16_t ThreadLimit) {
uint16_t ThreadsRequested = NThreadsICV;
if (NumThreadsClause != 0) {
ThreadsRequested = NumThreadsClause;
}
uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
ThreadsAvailable = ThreadLimit;
}
uint16_t NumThreads = ThreadsAvailable;
if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
NumThreads = ThreadsRequested;
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// On Volta and newer architectures we require that all lanes in
// a warp participate in the parallel region. Round down to a
// multiple of WARPSIZE since it is legal to do so in OpenMP.
if (NumThreads < WARPSIZE) {
NumThreads = 1;
} else {
NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
}
#endif
return NumThreads;
}
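// Illustration (hypothetical values): on a Volta-class device (WARPSIZE ==
// 32) with 96 worker threads available, a num_threads(70) clause and no
// thread limit, ThreadsRequested = 70 and the warp-multiple rounding yields
// NumThreads = 64; a request of fewer than 32 threads collapses to a single
// thread.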
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
kmp_int32 NumThreadsClause) {
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
omptarget_nvptx_workFn = WorkFn;
// This routine is only called by the team master. The team master is
// the first thread of the last warp. It always has the logical thread
// id of 0 (since it is a shadow for the first worker thread).
const int threadId = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
"cannot be called in a parallel region.");
if (currTaskDescr->InParallelRegion()) {
PRINT0(LD_PAR, "already in parallel: go seq\n");
return;
}
uint16_t NumThreads =
determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
if (NumThreadsClause != 0) {
// Reset request to avoid propagating to successive #parallel
NumThreadsClause = 0;
}
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
(int)NumThreads);
ASSERT0(LT_FUSSY,
__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
"only team master can create parallel");
// Set number of threads on work descriptor.
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
threadsInTeam = NumThreads;
}
// All workers call this function. Deactivate those not needed.
// Fn - the outlined work function to execute.
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
// Work function and arguments for L1 parallel region.
*WorkFn = omptarget_nvptx_workFn;
// If this is the termination signal from the master, quit early.
if (!*WorkFn) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
return false;
}
// Only the worker threads call this routine and the master warp
// never arrives here. Therefore, use the nvptx thread id.
int threadId = __kmpc_get_hardware_thread_id_in_block();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (threadId < threadsInTeam) {
// init this thread's task descriptor from the work descriptor
omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
// init thread private from init value
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)nThreads);
isActive = true;
}
return isActive;
}
EXTERN void __kmpc_kernel_end_parallel() {
// pop stack
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
// Only the worker threads call this routine and the master warp
// never arrives here. Therefore, use the nvptx thread id.
int threadId = __kmpc_get_hardware_thread_id_in_block();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
}
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////
static void serializedParallel(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to serializedParallel\n");
IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
return;
}
// assume this is only called for nested parallel
int threadId = GetLogicalThreadIdInBlock();
// unlike actual parallel, threads in the same team do not share
// the workTaskDescr in this case and num threads is fixed to 1
// get current task
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
currTaskDescr->SaveLoopData();
// allocate new task descriptor and copy value from current one, set prev to
// it
omptarget_nvptx_TaskDescr *newTaskDescr =
(omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
"new seq parallel task");
newTaskDescr->CopyParent(currTaskDescr);
// tweak values for serialized parallel case:
// - each thread becomes ID 0 in its serialized parallel, and
// - there is only one thread per team
newTaskDescr->ThreadId() = 0;
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
}
static void endSerializedParallel(kmp_Ident *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to endSerializedParallel\n");
DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
return;
}
// pop stack
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
// free
SafeFree(currTaskDescr, "new seq parallel task");
currTaskDescr = getMyTopTaskDescriptor(threadId);
currTaskDescr->RestoreLoopData();
}
NOINLINE EXTERN uint8_t __kmpc_parallel_level() {
return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}
// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
return GetOmpThreadId();
}
////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
int32_t num_teams, int32_t thread_limit) {
PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
}
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) {
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}
////////////////////////////////////////////////////////////////////////////////
// parallel interface
////////////////////////////////////////////////////////////////////////////////
NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
kmp_int32 if_expr,
kmp_int32 num_threads, int proc_bind,
void *fn, void *wrapper_fn, void **args,
size_t nargs) {
// Handle the serialized case first, same for SPMD/non-SPMD except that in
// SPMD mode we already incremented the parallel level counter, account for
// that.
bool InParallelRegion =
(__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode());
if (!if_expr || InParallelRegion) {
serializedParallel(ident, global_tid);
__kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
endSerializedParallel(ident, global_tid);
return;
}
if (__kmpc_is_spmd_exec_mode()) {
__kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
return;
}
__kmpc_kernel_prepare_parallel((void *)wrapper_fn, num_threads);
if (nargs) {
void **GlobalArgs;
__kmpc_begin_sharing_variables(&GlobalArgs, nargs);
// TODO: faster memcpy?
#pragma unroll
for (int I = 0; I < nargs; I++)
GlobalArgs[I] = args[I];
}
// TODO: what if that's a parallel region with a single thread? this is
// considered not active in the existing implementation.
bool IsActiveParallelRegion = threadsInTeam != 1;
int NumWarps =
threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0);
// Increment parallel level for non-SPMD warps.
for (int I = 0; I < NumWarps; ++I)
parallelLevel[I] +=
(1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
// Master signals work to activate workers.
__kmpc_barrier_simple_spmd(ident, 0);
// OpenMP [2.5, Parallel Construct, p.49]
// There is an implied barrier at the end of a parallel region. After the
// end of a parallel region, only the master thread of the team resumes
// execution of the enclosing task region.
//
// The master waits at this barrier until all workers are done.
__kmpc_barrier_simple_spmd(ident, 0);
// Decrement parallel level for non-SPMD warps.
for (int I = 0; I < NumWarps; ++I)
parallelLevel[I] -=
(1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
// TODO: Is synchronization needed since out of parallel execution?
if (nargs)
__kmpc_end_sharing_variables();
// TODO: proc_bind is a noop?
// if (proc_bind != proc_bind_default)
// __kmpc_push_proc_bind(ident, global_tid, proc_bind);
}
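// For illustration only: the compiler outlines the body of a #pragma omp
// parallel into fn/wrapper_fn and emits a call of roughly this shape (the
// shown variables and argument values are hypothetical):
//
//   void *Args[] = {&x, &y};
//   __kmpc_parallel_51(ident, gtid, /*if_expr=*/1, /*num_threads=*/-1,
//                      /*proc_bind=*/0, (void *)fn, (void *)wrapper_fn,
//                      Args, /*nargs=*/2);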
#pragma omp end declare target

View File

@ -1,309 +0,0 @@
//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"
EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shflFct) {
for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
shflFct(reduce_data, /*LaneId - not used= */ 0,
/*Offset = */ mask, /*AlgoVersion=*/0);
}
}
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
uint32_t size, uint32_t tid) {
uint32_t curr_size;
uint32_t mask;
curr_size = size;
mask = curr_size / 2;
while (mask > 0) {
shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
curr_size = (curr_size + 1) / 2;
mask = curr_size / 2;
}
}
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
uint32_t size, remote_id, physical_lane_id;
physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
__kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
do {
Liveness = __kmpc_impl_activemask();
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
size = __kmpc_impl_popc(Liveness);
logical_lane_id /= 2;
shflFct(reduce_data, /*LaneId =*/logical_lane_id,
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
} while (logical_lane_id % 2 == 0 && size > 1);
return (logical_lane_id == 0);
}
#endif
INLINE
static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
if (NumThreads == 1)
return 1;
/*
* This reduce function handles reduction within a team. It handles
* parallel regions in both L1 and L2 parallelism levels. It also
* supports Generic, SPMD, and NoOMP modes.
*
* 1. Reduce within a warp.
* 2. Warp master copies value to warp 0 via shared memory.
* 3. Warp 0 reduces to a single value.
* 4. The reduced value is available in the thread that returns 1.
*/
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
uint32_t WarpId = BlockThreadId / WARPSIZE;
// Volta execution model:
// For the Generic execution mode a parallel region either has 1 thread and
// beyond that, always a multiple of 32. For the SPMD execution mode we may
// have any number of threads.
if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
gpu_irregular_warp_reduce(
reduce_data, shflFct,
/*LaneCount=*/NumThreads % WARPSIZE,
/*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
// When we have more than WARPSIZE threads, a block
// reduction is performed here.
//
// Only an L1 parallel region can enter this if condition.
if (NumThreads > WARPSIZE) {
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
}
return BlockThreadId == 0;
#else
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(
reduce_data, shflFct,
/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
// parallel region may enter here; return
// early.
return gpu_irregular_simd_reduce(reduce_data, shflFct);
// When we have more than WARPSIZE threads, a block
// reduction is performed here.
//
// Only an L1 parallel region can enter this if condition.
if (NumThreads > WARPSIZE) {
uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
uint32_t WarpId = BlockThreadId / WARPSIZE;
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
return BlockThreadId == 0;
} else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
return BlockThreadId == 0;
}
// Get the OMP thread Id. This is different from BlockThreadId in the case of
// an L2 parallel region.
return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
__kmpc_is_spmd_exec_mode(), isRuntimeUninitialized());
}
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId);
}
INLINE static uint32_t roundToWarpsize(uint32_t s) {
if (s < WARPSIZE)
return 1;
return (s & ~(unsigned)(WARPSIZE - 1));
}
INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct) {
// Terminate all threads in non-SPMD mode except for the master thread.
if (!__kmpc_is_spmd_exec_mode() &&
!__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
return 0;
uint32_t ThreadId = GetLogicalThreadIdInBlock();
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
__kmpc_is_spmd_exec_mode() ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = __kmpc_get_hardware_num_blocks();
static unsigned SHARED(Bound);
static unsigned SHARED(ChunkTeamCount);
// Block progress for teams greater than the current upper
// limit. We only allow a number of teams less than or equal
// to the number of slots in the buffer.
bool IsMaster = isMaster(loc, ThreadId);
while (IsMaster) {
// Atomic read
Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
if (TeamId < Bound + num_of_records)
break;
}
if (IsMaster) {
int ModBlockId = TeamId % num_of_records;
if (TeamId < num_of_records)
lgcpyFct(global_buffer, ModBlockId, reduce_data);
else
lgredFct(global_buffer, ModBlockId, reduce_data);
__kmpc_impl_threadfence_system();
// Increment team counter.
// This counter is incremented by all teams in the current
// BUFFER_SIZE chunk.
ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
}
// Synchronize
if (__kmpc_is_spmd_exec_mode())
__kmpc_barrier(loc, global_tid);
// reduce_data is global or shared so before being reduced within the
// warp we need to bring it in local memory:
// local_reduce_data = reduce_data[i]
//
// Example for 3 reduction variables a, b, c (of potentially different
// types):
//
// buffer layout (struct of arrays):
// a, a, ..., a, b, b, ... b, c, c, ... c
// |__________|
// num_of_records
//
// local_data_reduce layout (struct):
// a, b, c
//
// Each thread will have a local struct containing the values to be
// reduced:
// 1. do reduction within each warp.
// 2. do reduction across warps.
// 3. write the final result to the main reduction variable
// by returning 1 in the thread holding the reduction result.
// Check if this is the very last team.
unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
if (ChunkTeamCount == NumTeams - Bound - 1) {
//
// Last team processing.
//
if (ThreadId >= NumRecs)
return 0;
NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
if (ThreadId >= NumThreads)
return 0;
// Load from buffer and reduce.
glcpyFct(global_buffer, ThreadId, reduce_data);
for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
glredFct(global_buffer, i, reduce_data);
// Reduce across warps to the warp master.
if (NumThreads > 1) {
gpu_regular_warp_reduce(reduce_data, shflFct);
// When we have more than WARPSIZE threads, a block
// reduction is performed here.
uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
if (ActiveThreads > WARPSIZE) {
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
uint32_t WarpId = ThreadId / WARPSIZE;
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
ThreadId);
}
}
if (IsMaster) {
Cnt = 0;
IterCnt = 0;
return 1;
}
return 0;
}
if (IsMaster && ChunkTeamCount == num_of_records - 1) {
// Allow SIZE number of teams to proceed writing their
// intermediate results to the global buffer.
__kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
}
return 0;
}
#pragma omp end declare target
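The function above combines two counters (`Cnt`, `IterCnt`) with a fixed-size global buffer so that the last team to contribute performs the final fold. Below is a minimal host-side C++ sketch of that idea, not the runtime code itself: `Buffer`, `Cnt`, and `teamContribute` are illustrative names, the chunking of teams into buffer-sized groups is omitted, and the sketch assumes the number of teams does not exceed the number of slots.

```
#include <atomic>
#include <vector>

constexpr unsigned NumSlots = 4; // stands in for num_of_records

std::vector<double> Buffer(NumSlots, 0.0);
std::atomic<unsigned> Cnt{0};

// Each team adds its partial result to its slot; the last team to bump the
// counter folds all slots into the final value. Returns true for that team
// only. Assumes TeamId < NumSlots (the real code chunks teams to keep this
// invariant).
bool teamContribute(unsigned TeamId, unsigned NumTeams, double Partial) {
  Buffer[TeamId] += Partial;              // lgcpy/lgred step
  unsigned Arrived = Cnt.fetch_add(1) + 1;
  if (Arrived != NumTeams)
    return false;                         // not the last team to arrive
  double Result = 0.0;
  for (unsigned I = 0; I < NumTeams; ++I) // glcpy/glred step
    Result += Buffer[I];
  Buffer[0] = Result;                     // final value lives in slot 0
  return true;
}
```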

View File

@ -1,29 +0,0 @@
//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "target/shuffle.h"
#pragma omp declare target
static constexpr uint64_t AllLanes = -1;
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
return __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size);
}
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_down_sync(AllLanes, hi, delta, size);
lo = __kmpc_impl_shfl_down_sync(AllLanes, lo, delta, size);
return __kmpc_impl_pack(lo, hi);
}
#pragma omp end declare target
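`__kmpc_shuffle_int64` works by splitting the 64-bit value into two 32-bit halves, shuffling each half, and recombining them. The following is a small self-contained sketch of just the pack/unpack arithmetic; the warp shuffle itself is modeled as an identity so the example runs on the host, and the helpers only mirror `__kmpc_impl_unpack`/`__kmpc_impl_pack` locally.

```
#include <cassert>
#include <cstdint>

// Local helpers mirroring __kmpc_impl_unpack/__kmpc_impl_pack; the warp
// shuffle between unpack and pack is modeled as an identity here.
static void unpack(uint64_t Val, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Val);
  Hi = static_cast<uint32_t>(Val >> 32);
}

static uint64_t pack(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t Val = 0x0123456789abcdefULL;
  uint32_t Lo, Hi;
  unpack(Val, Lo, Hi);
  // On the device each 32-bit half would be shuffled down independently.
  assert(pack(Lo, Hi) == Val);
  return 0;
}
```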

View File

@ -1,240 +0,0 @@
//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Wrapper implementation to some functions natively supported by the GPU.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "common/omptarget.h"
#include "common/support.h"
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
void setExecutionParameters(OMPTgtExecModeFlags EMode,
OMPTgtRuntimeModeFlags RMode) {
execution_param = EMode;
execution_param |= RMode;
}
bool isGenericMode() { return execution_param & OMP_TGT_EXEC_MODE_GENERIC; }
bool isRuntimeUninitialized() { return !isRuntimeInitialized(); }
bool isRuntimeInitialized() {
return execution_param & OMP_TGT_RUNTIME_INITIALIZED;
}
////////////////////////////////////////////////////////////////////////////////
// support: get info from machine
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
//
////////////////////////////////////////////////////////////////////////////////
// The master thread id is the first thread (lane) of the last warp.
// Thread id is 0 indexed.
// E.g: If NumThreads is 33, master id is 32.
// If NumThreads is 64, master id is 32.
// If NumThreads is 97, master id is 96.
// If NumThreads is 1024, master id is 992.
//
// Called in Generic Execution Mode only.
int GetMasterThreadID() {
return (__kmpc_get_hardware_num_threads_in_block() - 1) & ~(WARPSIZE - 1);
}
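The examples in the comment can be checked against the formula with a few compile-time assertions; this is an illustrative sketch (WarpSize standing in for WARPSIZE), not part of the runtime.

```
// Illustrative compile-time check, not runtime code; WarpSize stands in for
// WARPSIZE.
constexpr int WarpSize = 32;

constexpr int masterThreadId(int NumThreads) {
  return (NumThreads - 1) & ~(WarpSize - 1);
}

static_assert(masterThreadId(33) == 32, "first lane of the last warp");
static_assert(masterThreadId(64) == 32, "");
static_assert(masterThreadId(97) == 96, "");
static_assert(masterThreadId(1024) == 992, "");
```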
// The last warp is reserved for the master; other warps are workers.
// Called in Generic Execution Mode only.
int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
////////////////////////////////////////////////////////////////////////////////
// get thread id in team
// This function may be called in a parallel region by the workers
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
int GetLogicalThreadIdInBlock() {
// Implemented using control flow (predication) instead of a modulo
// operation.
int tid = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(tid))
return 0;
else
return tid;
}
////////////////////////////////////////////////////////////////////////////////
//
// OpenMP Thread Support Layer
//
////////////////////////////////////////////////////////////////////////////////
int GetOmpThreadId() {
int tid = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(tid))
return 0;
// omp_thread_num
int rc;
if (__kmpc_parallel_level() > 1) {
rc = 0;
} else if (__kmpc_is_spmd_exec_mode()) {
rc = tid;
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadId();
}
return rc;
}
int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
// omp_num_threads
int rc;
int Level = parallelLevel[GetWarpId()];
if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
rc = 1;
} else if (isSPMDExecutionMode) {
rc = __kmpc_get_hardware_num_threads_in_block();
} else {
rc = threadsInTeam;
}
return rc;
}
////////////////////////////////////////////////////////////////////////////////
// Team id linked to OpenMP
int GetOmpTeamId() {
// omp_team_num
return GetBlockIdInKernel(); // assume 1 block per team
}
int GetNumberOfOmpTeams() {
// omp_num_teams
return __kmpc_get_hardware_num_blocks(); // assume 1 block per team
}
////////////////////////////////////////////////////////////////////////////////
// Masters
int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
////////////////////////////////////////////////////////////////////////////////
// Parallel level
void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
__kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__kmpc_impl_threadfence();
}
__kmpc_impl_syncwarp(Mask);
}
void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
__kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__kmpc_impl_threadfence();
}
__kmpc_impl_syncwarp(Mask);
}
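These two functions encode, per warp, both the nesting depth and whether a level is active in a single counter: every level adds 1, and an active level additionally adds OMP_ACTIVE_PARALLEL_LEVEL. The sketch below illustrates that encoding; it is host-only illustrative code, and the value 128 is only an assumed stand-in for OMP_ACTIVE_PARALLEL_LEVEL.

```
#include <cassert>
#include <cstdint>

// Illustrative only: 128 is an assumed stand-in for OMP_ACTIVE_PARALLEL_LEVEL.
constexpr uint8_t ActiveLevelBit = 128;

int main() {
  uint8_t Level = 0;
  Level += 1 + ActiveLevelBit;         // enter an active parallel region
  assert(Level == ActiveLevelBit + 1); // "exactly one level and it is active"
  Level += 1;                          // enter a nested, serialized region
  assert(Level != ActiveLevelBit + 1); // nested regions report 1 thread
  Level -= 1;                          // leave the nested region
  Level -= 1 + ActiveLevelBit;         // leave the active region
  assert(Level == 0);
  return 0;
}
```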
////////////////////////////////////////////////////////////////////////////////
// get OpenMP number of procs
// Get the number of processors in the device.
int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
if (!isSPMDExecutionMode)
return GetNumberOfWorkersInTeam();
return __kmpc_get_hardware_num_threads_in_block();
}
int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
return GetNumberOfProcsInDevice(isSPMDExecutionMode);
}
////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////
unsigned long PadBytes(unsigned long size,
unsigned long alignment) // must be a power of 2
{
// compute the necessary padding to satisfy alignment constraint
ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
"alignment %lu is not a power of 2\n", alignment);
return (~(unsigned long)size + 1) & (alignment - 1);
}
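The padding expression is the two's-complement form of the usual `(alignment - size % alignment) % alignment` for power-of-two alignments. A small compile-time check, illustrative only:

```
// Illustrative compile-time check, not runtime code.
constexpr unsigned long padBytes(unsigned long Size, unsigned long Align) {
  return (~Size + 1) & (Align - 1);
}

static_assert(padBytes(20, 8) == 4, "20 + 4 is the next multiple of 8");
static_assert(padBytes(24, 8) == 0, "already aligned");
static_assert(padBytes(1, 16) == 15, "");
```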
void *SafeMalloc(size_t size, const char *msg) // check if success
{
void *ptr = __kmpc_impl_malloc(size);
PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
(unsigned long long)size, msg, (unsigned long long)ptr);
return ptr;
}
void *SafeFree(void *ptr, const char *msg) {
PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
__kmpc_impl_free(ptr);
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
unsigned int *GetTeamsReductionTimestamp() {
return static_cast<unsigned int *>(ReductionScratchpadPtr);
}
char *GetTeamsReductionScratchpad() {
return static_cast<char *>(ReductionScratchpadPtr) + 256;
}
// Invoke an outlined parallel function unwrapping arguments (up
// to 32).
void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
void **args, size_t nargs) {
switch (nargs) {
#include "common/generated_microtask_cases.gen"
default:
printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
__builtin_trap();
}
}
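The included `generated_microtask_cases.gen` presumably expands to one `case` per argument count, each casting `fn` to the matching signature before calling it. The following is a hypothetical, self-contained reconstruction of that dispatch for up to two arguments; the exact generated code may differ.

```
#include <cstddef>
#include <cstdio>

using kmp_int32 = int;

// Cast the opaque function pointer to the signature matching NArgs and call
// it; the generated file is assumed to contain one such case per arity.
void invokeMicrotaskSketch(kmp_int32 GlobalTid, kmp_int32 BoundTid, void *Fn,
                           void **Args, size_t NArgs) {
  switch (NArgs) {
  case 0:
    ((void (*)(kmp_int32 *, kmp_int32 *))Fn)(&GlobalTid, &BoundTid);
    break;
  case 1:
    ((void (*)(kmp_int32 *, kmp_int32 *, void *))Fn)(&GlobalTid, &BoundTid,
                                                     Args[0]);
    break;
  case 2:
    ((void (*)(kmp_int32 *, kmp_int32 *, void *, void *))Fn)(
        &GlobalTid, &BoundTid, Args[0], Args[1]);
    break;
  default:
    printf("too many arguments in this sketch\n");
  }
}
```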
namespace _OMP {
/// Helper to keep code alive without introducing a performance penalty.
__attribute__((used, retain, weak, optnone, cold)) void keepAlive() {
__kmpc_get_hardware_thread_id_in_block();
__kmpc_get_hardware_num_threads_in_block();
__kmpc_get_warp_size();
__kmpc_barrier_simple_spmd(nullptr, 0);
__kmpc_barrier_simple_generic(nullptr, 0);
}
} // namespace _OMP
#pragma omp end declare target

View File

@ -1,143 +0,0 @@
//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Include all synchronization.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// KMP Ordered calls
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_ordered\n");
}
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_end_ordered\n");
}
////////////////////////////////////////////////////////////////////////////////
// KMP Barriers
////////////////////////////////////////////////////////////////////////////////
// A team is a block: we can use the CUDA-native synchronization mechanism.
// FIXME: what if not all threads (warps) participate in the barrier?
// We may need to implement it differently.
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
__kmpc_barrier(loc_ref, tid);
PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
return 0;
}
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock();
int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
if (numberOfActiveOMPThreads > 1) {
if (__kmpc_is_spmd_exec_mode()) {
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
// The #threads parameter must be rounded up to a multiple of WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
PRINT(LD_SYNC,
"call kmpc_barrier with %d omp threads, sync parameter %d\n",
(int)numberOfActiveOMPThreads, (int)threads);
__kmpc_impl_named_sync(threads);
}
} else {
// Still need to flush the memory per the standard.
__kmpc_flush(loc_ref);
} // numberOfActiveOMPThreads > 1
PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
}
// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
__kmpc_impl_syncthreads();
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
return __kmpc_barrier_simple_spmd(loc_ref, tid);
}
////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_master\n");
return IsTeamMaster(global_tid);
}
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_master\n");
ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
}
////////////////////////////////////////////////////////////////////////////////
// KMP SINGLE
////////////////////////////////////////////////////////////////////////////////
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_single\n");
// We decided to implement single with master; the master gets the single.
return IsTeamMaster(global_tid);
}
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_single\n");
// We decided to implement single with master: the master gets the single.
ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
// The sync barrier is called explicitly afterwards, so that is not a problem.
}
////////////////////////////////////////////////////////////////////////////////
// Flush
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_flush(kmp_Ident *loc) {
PRINT0(LD_IO, "call kmpc_flush\n");
__kmpc_impl_threadfence();
}
////////////////////////////////////////////////////////////////////////////////
// Vote
////////////////////////////////////////////////////////////////////////////////
EXTERN uint64_t __kmpc_warp_active_thread_mask(void) {
PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
return __kmpc_impl_activemask();
}
////////////////////////////////////////////////////////////////////////////////
// Syncwarp
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_syncwarp(uint64_t Mask) {
PRINT0(LD_IO, "call __kmpc_syncwarp\n");
__kmpc_impl_syncwarp(Mask);
}
#pragma omp end declare target

View File

@ -1,219 +0,0 @@
//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Task implementation support.
//
// explicit task structure uses
// omptarget_nvptx task
// kmp_task
//
// where kmp_task is
// - klegacy_TaskDescr <- task pointer
// shared -> X
// routine
// part_id
// descr
// - private (of size given by task_alloc call). Accessed by
// task+sizeof(klegacy_TaskDescr)
// * private data *
// - shared: X. Accessed by shared ptr in klegacy_TaskDescr
// * pointer table to shared variables *
// - end
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
kmp_Ident *loc, // unused
uint32_t global_tid, // unused
int32_t flag, // unused (in our implementation, all tasks are executed immediately)
size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
kmp_TaskFctPtr taskSub) {
PRINT(LD_IO,
"call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
"fct 0x%llx)\n",
(long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable,
(unsigned long long)taskSub);
// want task+priv to be a multiple of 8 bytes
size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
sizeOfTaskInclPrivate += padForTaskInclPriv;
size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
"need task descr of size %d to be a multiple of %d\n",
(int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *));
size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
totSize, "explicit task descriptor");
kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
ASSERT0(LT_FUSSY,
(uint64_t)newKmpTaskDescr ==
(uint64_t)ADD_BYTES(newExplicitTaskDescr,
sizeof(omptarget_nvptx_TaskDescr)),
"bad size assumptions");
// init kmp_TaskDescr
newKmpTaskDescr->sharedPointerTable =
(void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
newKmpTaskDescr->sub = taskSub;
newKmpTaskDescr->destructors = NULL;
PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
(unsigned long long)newKmpTaskDescr,
(unsigned long long)newExplicitTaskDescr);
return newKmpTaskDescr;
}
EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
0);
}
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr,
int32_t depNum, void *depList,
int32_t noAliasDepNum,
void *noAliasDepList) {
PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
P64(newKmpTaskDescr));
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
"bad assumptions");
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
"bad assumptions");
// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
// 3. call sub
PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
(unsigned long long)newKmpTaskDescr->sub,
(unsigned long long)newKmpTaskDescr);
newKmpTaskDescr->sub(0, newKmpTaskDescr);
PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
(unsigned long long)newKmpTaskDescr->sub);
// 4. pop context
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free
SafeFree(newExplicitTaskDescr, "explicit task descriptor");
return 0;
}
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
(unsigned long long)newKmpTaskDescr);
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
"bad assumptions");
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
"bad assumptions");
// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
// 3. nothing to call... the task body is inlined
// 4 & 5. done in __kmpc_omp_task_complete_if0
}
EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
(unsigned long long)newKmpTaskDescr);
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
"bad assumptions");
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
"bad assumptions");
// 2. get parent
omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
// 3. nothing to call... the task body is inlined
// 4. pop context
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free
SafeFree(newExplicitTaskDescr, "explicit task descriptor");
}
EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
int32_t depNum, void *depList,
int32_t noAliasDepNum, void *noAliasDepList) {
PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
// nothing to do as all our tasks are executed as final
}
EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
// nothing to do as all our tasks are executed as final
}
EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
// nothing to do as all our tasks are executed as final
}
EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
int end_part) {
PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
// do nothing: tasks are executed immediately, no yielding allowed
return 0;
}
EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
// nothing to do as all our tasks are executed as final
return 0;
}
EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr, int if_val,
uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
int32_t sched, uint64_t grainsize, void *task_dup) {
// skip task entirely if empty iteration space
if (*lb > *ub)
return;
// the compiler has already stored lb and ub in the kmp_TaskDescr structure
// as we are using a single task to execute the entire loop, we can leave
// the initial task_t untouched
__kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
}
#pragma omp end declare target

View File

@ -1,51 +0,0 @@
//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a queue to hand out OpenMP state objects to teams of
// one or more kernels.
//
// Reference:
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
// Architectures. International Conference on Performance Engineering.
//
//===----------------------------------------------------------------------===//
#ifndef __STATE_QUEUE_H
#define __STATE_QUEUE_H
#include <stdint.h>
#include "target_impl.h"
template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
private:
ElementType elements[SIZE];
volatile ElementType *elementQueue[SIZE];
volatile uint32_t head;
volatile uint32_t ids[SIZE];
volatile uint32_t tail;
static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
INLINE uint32_t ENQUEUE_TICKET();
INLINE uint32_t DEQUEUE_TICKET();
INLINE static uint32_t ID(uint32_t ticket);
INLINE bool IsServing(uint32_t slot, uint32_t id);
INLINE void PushElement(uint32_t slot, ElementType *element);
INLINE ElementType *PopElement(uint32_t slot);
INLINE void DoneServing(uint32_t slot, uint32_t id);
public:
INLINE omptarget_nvptx_Queue() {}
INLINE void Enqueue(ElementType *element);
INLINE ElementType *Dequeue();
};
#include "state-queuei.h"
#endif

View File

@ -1,88 +0,0 @@
//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of a queue to hand out OpenMP state
// objects to teams of one or more kernels.
//
// Reference:
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
// Architectures. International Conference on Performance Engineering.
//
//===----------------------------------------------------------------------===//
#include "state-queue.h"
template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
return __kmpc_atomic_add((unsigned int *)&tail, 1u);
}
template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
return __kmpc_atomic_add((unsigned int *)&head, 1u);
}
template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
return (ticket / SIZE) * 2;
}
template <typename ElementType, uint32_t SIZE>
INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
uint32_t id) {
return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
}
template <typename ElementType, uint32_t SIZE>
INLINE void
omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
ElementType *element) {
__kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
(unsigned long long)element);
}
template <typename ElementType, uint32_t SIZE>
INLINE ElementType *
omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
return (ElementType *)__kmpc_atomic_add(
(unsigned long long *)&elementQueue[slot], (unsigned long long)0);
}
template <typename ElementType, uint32_t SIZE>
INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
uint32_t id) {
__kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
}
template <typename ElementType, uint32_t SIZE>
INLINE void
omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
uint32_t ticket = ENQUEUE_TICKET();
uint32_t slot = ticket % SIZE;
uint32_t id = ID(ticket) + 1;
while (!IsServing(slot, id))
;
PushElement(slot, element);
DoneServing(slot, id);
}
template <typename ElementType, uint32_t SIZE>
INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
uint32_t ticket = DEQUEUE_TICKET();
uint32_t slot = ticket % SIZE;
uint32_t id = ID(ticket);
while (!IsServing(slot, id))
;
ElementType *element = PopElement(slot);
// This is to populate the queue because of the lack of GPU constructors.
if (element == 0)
element = &elements[slot];
DoneServing(slot, id);
return element;
}
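A hedged usage sketch of this queue follows. `TeamState`, `StateQueue`, and `exampleUse` are placeholder names; the real runtime instantiated the queue with its per-team state objects and served them out across kernels.

```
#include "state-queue.h"

// Placeholder element type; the real runtime used its per-team state
// objects here.
struct TeamState {
  int Data;
};

static omptarget_nvptx_Queue<TeamState, /*SIZE=*/64> StateQueue;

void exampleUse() {
  // Take ownership of a state object (spins on its ticket until served).
  TeamState *State = StateQueue.Dequeue();
  State->Data = 42;
  // Hand the object back so another team can be served from this slot.
  StateQueue.Enqueue(State);
}
```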

View File

@ -1,91 +0,0 @@
//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Wrapper to some functions natively supported by the GPU.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_SUPPORT_H
#define OMPTARGET_SUPPORT_H
#include "interface.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
enum OMPTgtExecModeFlags : int8_t {
OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
OMP_TGT_EXEC_MODE_SPMD = 1 << 1
};
enum OMPTgtRuntimeModeFlags : int8_t {
OMP_TGT_RUNTIME_UNINITIALIZED = 0,
OMP_TGT_RUNTIME_INITIALIZED = 1 << 2
};
void setExecutionParameters(OMPTgtExecModeFlags EMode,
OMPTgtRuntimeModeFlags RMode);
bool isGenericMode();
bool isRuntimeUninitialized();
bool isRuntimeInitialized();
////////////////////////////////////////////////////////////////////////////////
// get info from machine
////////////////////////////////////////////////////////////////////////////////
// get global ids to locate thread/team info (constant regardless of OMP)
int GetLogicalThreadIdInBlock();
int GetMasterThreadID();
int GetNumberOfWorkersInTeam();
// get OpenMP thread and team ids
int GetOmpThreadId(); // omp_thread_num
int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team
int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
int GetNumberOfOmpTeams(); // omp_num_teams
// get OpenMP number of procs
int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
// masters
int IsTeamMaster(int ompThreadId);
// Parallel level
void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////
// safe alloc and free
void *SafeMalloc(size_t size, const char *msg); // check if success
void *SafeFree(void *ptr, const char *msg);
// pad to an alignment (power of 2 only)
unsigned long PadBytes(unsigned long size, unsigned long alignment);
#define ADD_BYTES(_addr, _bytes) \
((void *)((char *)((void *)(_addr)) + (_bytes)))
#define SUB_BYTES(_addr, _bytes) \
((void *)((char *)((void *)(_addr)) - (_bytes)))
////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
unsigned int *GetTeamsReductionTimestamp();
char *GetTeamsReductionScratchpad();
// Invoke an outlined parallel function unwrapping global, shared arguments (up
// to 128).
void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
void **args, size_t nargs);
#endif

View File

@ -1,505 +0,0 @@
//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains all the definitions that are relevant to
// the interface. The first section contains the interface as
// declared by OpenMP. The second section includes the compiler
// specific interfaces.
//
//===----------------------------------------------------------------------===//
#ifndef _INTERFACES_H_
#define _INTERFACES_H_
#include <stddef.h>
#include <stdint.h>
#ifdef __AMDGCN__
#include "amdgcn/src/amdgcn_interface.h"
#endif
#ifdef __CUDACC__
#include "nvptx/src/nvptx_interface.h"
#endif
////////////////////////////////////////////////////////////////////////////////
// OpenMP interface
////////////////////////////////////////////////////////////////////////////////
typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */
typedef enum omp_sched_t {
omp_sched_static = 1, /* chunkSize >0 */
omp_sched_dynamic = 2, /* chunkSize >0 */
omp_sched_guided = 3, /* chunkSize >0 */
omp_sched_auto = 4, /* no chunkSize */
} omp_sched_t;
typedef enum omp_proc_bind_t {
omp_proc_bind_false = 0,
omp_proc_bind_true = 1,
omp_proc_bind_master = 2,
omp_proc_bind_close = 3,
omp_proc_bind_spread = 4
} omp_proc_bind_t;
EXTERN double omp_get_wtick(void);
EXTERN double omp_get_wtime(void);
EXTERN void omp_set_num_threads(int num);
EXTERN int omp_get_num_threads(void);
EXTERN int omp_get_max_threads(void);
EXTERN int omp_get_thread_limit(void);
EXTERN int omp_get_thread_num(void);
EXTERN int omp_get_num_procs(void);
EXTERN int omp_in_parallel(void);
EXTERN int omp_in_final(void);
EXTERN void omp_set_dynamic(int flag);
EXTERN int omp_get_dynamic(void);
EXTERN void omp_set_nested(int flag);
EXTERN int omp_get_nested(void);
EXTERN void omp_set_max_active_levels(int level);
EXTERN int omp_get_max_active_levels(void);
EXTERN int omp_get_level(void);
EXTERN int omp_get_active_level(void);
EXTERN int omp_get_ancestor_thread_num(int level);
EXTERN int omp_get_team_size(int level);
EXTERN void omp_init_lock(omp_lock_t *lock);
EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_destroy_lock(omp_lock_t *lock);
EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_set_lock(omp_lock_t *lock);
EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_unset_lock(omp_lock_t *lock);
EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock);
EXTERN int omp_test_lock(omp_lock_t *lock);
EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier);
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier);
EXTERN omp_proc_bind_t omp_get_proc_bind(void);
EXTERN int omp_get_cancellation(void);
EXTERN void omp_set_default_device(int deviceId);
EXTERN int omp_get_default_device(void);
EXTERN int omp_get_num_devices(void);
EXTERN int omp_get_num_teams(void);
EXTERN int omp_get_team_num(void);
EXTERN int omp_get_initial_device(void);
EXTERN int omp_get_max_task_priority(void);
EXTERN void *llvm_omp_get_dynamic_shared();
////////////////////////////////////////////////////////////////////////////////
// file below is swiped from kmpc host interface
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// kmp specific types
////////////////////////////////////////////////////////////////////////////////
typedef enum kmp_sched_t {
kmp_sched_static_chunk = 33,
kmp_sched_static_nochunk = 34,
kmp_sched_dynamic = 35,
kmp_sched_guided = 36,
kmp_sched_runtime = 37,
kmp_sched_auto = 38,
kmp_sched_static_balanced_chunk = 45,
kmp_sched_static_ordered = 65,
kmp_sched_static_nochunk_ordered = 66,
kmp_sched_dynamic_ordered = 67,
kmp_sched_guided_ordered = 68,
kmp_sched_runtime_ordered = 69,
kmp_sched_auto_ordered = 70,
kmp_sched_distr_static_chunk = 91,
kmp_sched_distr_static_nochunk = 92,
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
kmp_sched_default = kmp_sched_static_nochunk,
kmp_sched_unordered_first = kmp_sched_static_chunk,
kmp_sched_unordered_last = kmp_sched_auto,
kmp_sched_ordered_first = kmp_sched_static_ordered,
kmp_sched_ordered_last = kmp_sched_auto_ordered,
kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
kmp_sched_distribute_last =
kmp_sched_distr_static_chunk_sched_static_chunkone,
/* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
* Since we need to distinguish the three possible cases (no modifier,
* monotonic modifier, nonmonotonic modifier), we need separate bits for
* each modifier. The absence of monotonic does not imply nonmonotonic,
* especially since 4.5 says that the behaviour of the "no modifier" case
* is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
*
* Since we're passing a full 32 bit value, we can use a couple of high
* bits for these flags; out of paranoia we avoid the sign bit.
*
* These modifiers can be or-ed into non-static schedules by the compiler
* to pass the additional information. They will be stripped early in the
* processing in __kmp_dispatch_init when setting up schedules, so
* most of the code won't ever see schedules with these bits set.
*/
kmp_sched_modifier_monotonic = (1 << 29),
/**< Set if the monotonic schedule modifier was present */
kmp_sched_modifier_nonmonotonic = (1 << 30),
/**< Set if the nonmonotonic schedule modifier was present */
#define SCHEDULE_WITHOUT_MODIFIERS(s) \
(enum kmp_sched_t)( \
(s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
#define SCHEDULE_HAS_NONMONOTONIC(s) \
(((s)&kmp_sched_modifier_nonmonotonic) != 0)
#define SCHEDULE_HAS_NO_MODIFIERS(s) \
(((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
0)
} kmp_sched_t;
/*!
* Enum for accessing the reserved_2 field of the ident_t struct below.
*/
enum {
/*! Bit set to 1 when in SPMD mode. */
KMP_IDENT_SPMD_MODE = 0x01,
/*! Bit set to 1 when a simplified runtime is used. */
KMP_IDENT_SIMPLE_RT_MODE = 0x02,
};
/*!
* The ident structure that describes a source location.
* The struct is identical to the one in the kmp.h file.
* We maintain the same data structure for compatibility.
*/
typedef short kmp_int16;
typedef int kmp_int32;
typedef struct ident {
kmp_int32 reserved_1; /**< might be used in Fortran; see above */
kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
identifies this union member */
kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
char const *psource; /**< String describing the source location.
The string is composed of semi-colon separated fields
which describe the source file, the function and a pair
of line numbers that delimit the construct. */
} ident_t;
// parallel defs
typedef ident_t kmp_Ident;
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
int16_t lane_offset,
int16_t shortCircuit);
typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);
// task defs
typedef struct kmp_TaskDescr kmp_TaskDescr;
typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr);
typedef struct kmp_TaskDescr {
void *sharedPointerTable; // ptr to a table of shared var ptrs
kmp_TaskFctPtr sub; // task subroutine
int32_t partId; // unused
kmp_TaskFctPtr destructors; // destructor of c++ first private
} kmp_TaskDescr;
// sync defs
typedef int32_t kmp_CriticalName[8];
////////////////////////////////////////////////////////////////////////////////
// external interface
////////////////////////////////////////////////////////////////////////////////
// parallel
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
NOINLINE EXTERN uint8_t __kmpc_parallel_level();
// proc bind
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid,
int proc_bind);
EXTERN int omp_get_num_places(void);
EXTERN int omp_get_place_num_procs(int place_num);
EXTERN void omp_get_place_proc_ids(int place_num, int *ids);
EXTERN int omp_get_place_num(void);
EXTERN int omp_get_partition_num_places(void);
EXTERN void omp_get_partition_place_nums(int *place_nums);
// for static (no chunk or chunk)
EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
// distribute static (no chunk or chunk)
EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_4u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_8u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid);
// for dynamic
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t lower, int32_t upper,
int32_t incr, int32_t chunk);
EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, uint32_t lower,
uint32_t upper, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int64_t lower, int64_t upper,
int64_t incr, int64_t chunk);
EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, uint64_t lower,
uint64_t upper, int64_t incr,
int64_t chunk);
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, int32_t *plower,
int32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, uint32_t *plower,
uint32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, int64_t *plower,
int64_t *pupper, int64_t *pstride);
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride);
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);
// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);
// single
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid);
// sync
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *crit);
EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *crit);
EXTERN void __kmpc_flush(kmp_Ident *loc);
// vote
EXTERN uint64_t __kmpc_warp_active_thread_mask(void);
// syncwarp
EXTERN void __kmpc_syncwarp(uint64_t);
// tasks
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid,
int32_t flag,
size_t sizeOfTaskInclPrivate,
size_t sizeOfSharedTable,
kmp_TaskFctPtr sub);
EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr);
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr,
int32_t depNum, void *depList,
int32_t noAliasDepNum,
void *noAliasDepList);
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
int32_t depNum, void *depList,
int32_t noAliasDepNum, void *noAliasDepList);
EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
int end_part);
EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr, int if_val,
uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
int32_t sched, uint64_t grainsize, void *task_dup);
// cancel
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);
// non standard
EXTERN int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
bool UseGenericStateMachine,
bool RequiresFullRuntime);
EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int32_t NumThreadsClause);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
EXTERN void __kmpc_end_sharing_variables();
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
/// Entry point to start a new parallel region.
///
/// \param ident The source identifier.
/// \param global_tid The global thread ID.
/// \param if_expr The if(expr), or 1 if none given.
/// \param num_threads The num_threads(expr), or -1 if none given.
/// \param proc_bind The proc_bind, or `proc_bind_default` if none given.
/// \param fn The outlined parallel region function.
/// \param wrapper_fn The worker wrapper function of fn.
/// \param args The pointer array of arguments to fn.
/// \param nargs The number of arguments to fn.
NOINLINE EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
kmp_int32 if_expr,
kmp_int32 num_threads, int proc_bind,
void *fn, void *wrapper_fn, void **args,
size_t nargs);
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode outside of a parallel region.
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);
/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode.
EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid);
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared, const void **res);
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
int16_t is_shared);
/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
/// called by any thread, allocation happens per-thread.
EXTERN void *__kmpc_alloc_shared(uint64_t Bytes);
/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
/// a stack (push/pop). Can be called by any thread. \p Ptr must be allocated by
/// __kmpc_alloc_shared by the same thread. \p Bytes contains the size of the
/// paired allocation to make memory management easier.
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes);
/// Get a pointer to the dynamic shared memory buffer in the device.
EXTERN void *__kmpc_get_dynamic_shared();
#endif

View File

@ -1,257 +0,0 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
#
##===----------------------------------------------------------------------===##
# By default we will not build the NVPTX deviceRTL on a CUDA-free system.
set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL
"Whether to build the NVPTX deviceRTL on a CUDA-free system.")
if (NOT LIBOMPTARGET_BUILD_NVPTX_BCLIB)
libomptarget_say("Not building NVPTX deviceRTL: Disabled by LIBOMPTARGET_BUILD_NVPTX_BCLIB")
return()
endif()
if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
return()
endif()
# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
# an LLVM linker.
set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
"Location of a CUDA compiler capable of emitting LLVM bitcode.")
set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
"Location of a linker capable of linking LLVM bitcode objects.")
if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING)
# Compile the deviceRTL with the clang that is built in the project.
set(cuda_compiler "$<TARGET_FILE:clang>")
elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
# Compile the device runtime with the compiler that OpenMP is built with.
# This is the case with LLVM_ENABLE_RUNTIMES=openmp.
# FIXME: This is unreliable; the compiler can be an older version of clang
# that does not support compiling CUDA, or supports only an older version of
# it. The risk is especially high on systems where clang is the default
# compiler (macOS, BSDs). LLVM_ENABLE_RUNTIMES=openmp should itself set
# LIBOMPTARGET_NVPTX_CUDA_COMPILER instead.
set(cuda_compiler ${CMAKE_C_COMPILER})
else()
libomptarget_say("Not building NVPTX deviceRTL: clang not found")
return()
endif()
# Get compiler directory to try to locate a suitable linker.
get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
set(llvm_link "${compiler_dir}/llvm-link")
if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
elseif (EXISTS ${llvm_link})
# Try to use the linker consistent with the CUDA compiler unless explicitly
# set to a different linker.
set(bc_linker ${llvm_link})
elseif (NOT OPENMP_STANDALONE_BUILD AND NOT CMAKE_CROSSCOMPILING)
# Use the linker also built in the same project.
set(bc_linker "$<TARGET_FILE:llvm-link>")
else()
libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
return()
endif()
# TODO: This part needs to be refined once libomptarget supports Windows!
# TODO: This part can also be removed if we can change the clang driver to make
# it support device only compilation.
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
set(aux_triple x86_64-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(aux_triple powerpc64le-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(aux_triple aarch64-unknown-linux-gnu)
else()
libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
return()
endif()
get_filename_component(devicertl_base_directory
${CMAKE_CURRENT_SOURCE_DIR}
DIRECTORY)
set(devicertl_common_directory
${devicertl_base_directory}/common)
set(devicertl_nvptx_directory
${devicertl_base_directory}/nvptx)
set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80 86)
set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING
"List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES)
if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all")
set(nvptx_sm_list ${all_capabilities})
elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto")
if (NOT LIBOMPTARGET_DEP_CUDA_FOUND)
libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.")
endif()
set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH})
else()
string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}")
endif()
# If the user sets LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable
# the build.
if (NOT nvptx_sm_list)
libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
return()
endif()
# Check all SM values
foreach(sm ${nvptx_sm_list})
if (NOT ${sm} IN_LIST all_capabilities)
libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.")
endif()
endforeach()
# Override default MAX_SM in src/target_impl.h if requested
if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
endif()
# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
"Activate NVPTX device RTL debug messages.")
if ("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using in-tree clang.")
else ()
libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using ${cuda_compiler}")
endif ()
set(cuda_src_files
${devicertl_common_directory}/src/cancel.cu
${devicertl_common_directory}/src/critical.cu
${devicertl_common_directory}/src/data_sharing.cu
${devicertl_common_directory}/src/libcall.cu
${devicertl_common_directory}/src/loop.cu
${devicertl_common_directory}/src/omp_data.cu
${devicertl_common_directory}/src/omptarget.cu
${devicertl_common_directory}/src/parallel.cu
${devicertl_common_directory}/src/reduction.cu
${devicertl_common_directory}/src/support.cu
${devicertl_common_directory}/src/sync.cu
${devicertl_common_directory}/src/task.cu
${devicertl_common_directory}/src/shuffle.cpp
src/target_impl.cu
)
# Prepend -I to each list element
set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I")
# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -O1 -std=c++14
-mllvm -openmp-opt-disable
-ffreestanding
-target nvptx64
-fvisibility=hidden
-Xclang -emit-llvm-bc
-Xclang -aux-triple -Xclang ${aux_triple}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-Xclang -target-feature -Xclang +ptx61
-D__CUDACC__
-I${devicertl_base_directory}
-I${devicertl_common_directory}/include
-I${devicertl_nvptx_directory}/src
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX})
if(${LIBOMPTARGET_NVPTX_DEBUG})
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g)
else()
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
endif()
# Create target to build all Bitcode libraries.
add_custom_target(omptarget-nvptx-bc)
# Generate a Bitcode library for all the compute capabilities the user requested
foreach(sm ${nvptx_sm_list})
set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
set(bc_files "")
foreach(src ${cuda_src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
set(outfile "${outfile}-sm_${sm}.bc")
add_custom_command(OUTPUT ${outfile}
COMMAND ${cuda_compiler} ${bc_flags}
${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
COMMENT "Building LLVM bitcode ${outfile}"
VERBATIM
)
if("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
# Add a file-level dependency to ensure that clang is up-to-date.
# By default, add_custom_command only builds clang if the
# executable is missing.
add_custom_command(OUTPUT ${outfile}
DEPENDS clang
APPEND
)
endif()
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
list(APPEND bc_files ${outfile})
endforeach()
set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
COMMAND ${bc_linker}
-o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
DEPENDS ${bc_files}
COMMENT "Linking LLVM bitcode ${bclib_name}"
)
if("${bc_linker}" STREQUAL "$<TARGET_FILE:llvm-link>")
# Add a file-level dependency to ensure that llvm-link is up-to-date.
# By default, add_custom_command only builds llvm-link if the
# executable is missing.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
DEPENDS llvm-link
APPEND
)
endif()
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")
add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
# Copy library to destination.
add_custom_command(TARGET ${bclib_target_name} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
${LIBOMPTARGET_LIBRARY_DIR})
# Install bitcode library under the lib destination folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
endforeach()
# Test will be enabled if the building machine supports CUDA
if (LIBOMPTARGET_DEP_CUDA_FOUND)
add_subdirectory(test)
endif()

View File

@ -1,523 +0,0 @@
**Design document for OpenMP reductions on the GPU**
//Abstract: //In this document we summarize the new design for an OpenMP
implementation of reductions on NVIDIA GPUs. This document comprises
* a succinct background review,
* an introduction to the decoupling of reduction algorithm and
data-structure-specific processing routines,
* detailed illustrations of reduction algorithms used and
* a brief overview of steps we have made beyond the last implementation.
**Problem Review**
Consider a typical OpenMP program with a reduction pragma.
```
double foo, bar;
#pragma omp parallel for reduction(+:foo, bar)
for (int i = 0; i < N; i++) {
foo+=A[i]; bar+=B[i];
}
```
where 'foo' and 'bar' are reduced across all threads in the parallel region.
Our primary goal is to aggregate the values of foo and bar efficiently, in
such a manner that
* keeps the compiler logically concise, and
* reduces efficiently within warps, thread blocks, and across the device.
**Introduction to Decoupling**
In this section we address the problem of making the compiler
//logically concise// by partitioning the task of reduction into two broad
categories: data-structure specific routines and algorithmic routines.
The previous reduction implementation was tightly coupled to the
specifics of the reduction element data structures (e.g., sizes, data
types) and the operators of the reduction (e.g., addition, multiplication). In
our implementation we strive to decouple them. In the final implementation,
we were able to remove all template functions from our runtime system.
The (simplified) pseudo code generated by LLVM is as follows:
```
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn,
interWarpCpyFn)
where:
struct ReduceData {
double *foo;
double *bar;
} reduceData
reduceData.foo = &foo_p
reduceData.bar = &bar_p
shuffleReduceFn and interWarpCpyFn are two auxiliary functions
generated to aid the runtime performing algorithmic steps
while being data-structure agnostic about ReduceData.
In particular, shuffleReduceFn is a function that takes the following
inputs:
a. local copy of ReduceData
b. its lane_id
c. the offset of the lane_id which hosts a remote ReduceData
relative to the current one
d. an algorithm version parameter determining which reduction
algorithm to use.
This shuffleReduceFn retrieves the remote ReduceData through shuffle
intrinsics and reduces, using the algorithm specified by the 4th
parameter, the local ReduceData element-wise with the remote ReduceData,
and places the resultant values into the local ReduceData.
Different reduction algorithms are implemented with different runtime
functions, but they all make calls to this same shuffleReduceFn to
perform the essential reduction step. Therefore, based on the 4th
parameter, this shuffleReduceFn will behave slightly differently to
cooperate with the runtime function to ensure correctness under
different circumstances.
InterWarpCpyFn, as the name suggests, is a function that copies data
across warps. Its purpose is to tunnel all the thread-private
ReduceData that is already reduced within a warp to a lane in the first
warp with minimal shared memory footprint. This is an essential step to
prepare for the last step of a block reduction.
(Warp, block, device level reduction routines that utilize these
auxiliary functions will be discussed in the next section.)
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
```
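To make the interplay concrete, the following is a hedged CUDA sketch of what a
compiler-generated shuffleReduceFn for the foo/bar ReduceData above might look
like. It is illustrative only, the names are hypothetical, and the branching on
the 4th parameter is elided (only the full-warp case, version 0, is shown).
```
// Hypothetical sketch, not the compiler's actual output: a generated
// shuffleReduceFn for ReduceData {foo, bar}, handling only version 0
// (full warp reduction).
struct ReduceData {
  double *foo;
  double *bar;
};

__device__ void shuffleReduceFn(ReduceData *local, short /*lane_id*/,
                                short offset, short /*algo_version*/) {
  // Retrieve the remote lane's elements via warp shuffles; CUDA 9+ can
  // shuffle doubles directly.
  double remote_foo = __shfl_down_sync(0xffffffffu, *local->foo, offset);
  double remote_bar = __shfl_down_sync(0xffffffffu, *local->bar, offset);
  // Combine element-wise into the local copy ('@' is '+' here).
  *local->foo += remote_foo;
  *local->bar += remote_bar;
}
```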
**Reduction Algorithms**
On the warp level, we have three versions of the algorithms:
1. Full Warp Reduction
```
gpu_regular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr ShuffleReduceFn) {
for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
ShuffleReduceFn(reduce_data, 0, offset, 0);
}
```
ShuffleReduceFn is used here with lane_id set to 0 because the lane_id is not
used; we therefore save instructions by not retrieving it from the
corresponding special registers. The 4th parameter, which represents the
version of the algorithm being used here, is set to 0 to signify full warp
reduction.
In this version (=0), the ShuffleReduceFn behaves, per element, as follows:
```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
reduce_elem = reduce_elem @ remote_elem;
```
An illustration of this algorithm operating on a hypothetical 8-lane full-warp
would be:
{F74}
The coloring invariant is that elements with the same color will be
combined and reduced in the next reduction step. As can be observed, no overhead
is present; exactly log(2, N) steps are needed.
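For reference, a minimal standalone CUDA sketch of this full-warp pattern
(assuming one double per lane and all 32 lanes active; this is not the
runtime's code) is:
```
// Minimal sketch: full-warp sum of one double per lane using shuffles,
// mirroring the log2(WARPSIZE) loop above. Lane 0 ends up with the result.
__device__ double warp_sum_full(double val) {
  for (int offset = 32 / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val;
}
```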
2. Contiguous Full Warp Reduction
```
gpu_irregular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
int lane_id) {
int curr_size;
int offset;
curr_size = size;
offset = curr_size/2;
while (offset>0) {
ShuffleReduceFn(reduce_data, lane_id, offset, 1);
curr_size = (curr_size+1)/2;
offset = curr_size/2;
}
}
```
In this version (=1), the ShuffleReduceFn behaves, per element, as
follows:
```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
if (lane_id < offset) {
reduce_elem = reduce_elem @ remote_elem
} else {
reduce_elem = remote_elem
}
```
An important invariant (also a restriction on the starting state of the
reduction) is that this algorithm assumes that all unused ReduceData are
located in a contiguous subset of threads in a warp starting from lane 0.
If there is a trailing active lane with an odd-numbered lane id, its value
will not be aggregated with any other lane. Therefore, in order to preserve
the invariant, such ReduceData is copied to the first lane whose thread-local
ReduceData has already been used in a previous reduction and would therefore
be useless otherwise.
An illustration of this algorithm operating on a hypothetical 8-lane partial
warp would be:
{F75}
As illustrated, this version of the algorithm introduces overhead to copy
data between lanes whenever we have an odd number of participating lanes in
any reduction step.
3. Dispersed Partial Warp Reduction
```
gpu_irregular_simt_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr ShuffleReduceFn) {
int size, remote_id;
int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
do {
remote_id = find_the_next_active_lane_id_right_after_me();
// the above function returns 0 if no active lane
// is present right after the current thread.
size = get_number_of_active_lanes_in_this_warp();
logical_lane_id /= 2;
ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
} while (logical_lane_id % 2 == 0 && size > 1);
}
```
There is no assumption made about the initial state of the reduction.
Any number of lanes (>=1) could be active at any position. The reduction
result is kept in the first active lane.
In this version (=2), the ShuffleReduceFn behaves, per element, as
follows:
```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
if (LaneId % 2 == 0 && Offset > 0) {
reduce_elem = reduce_elem @ remote_elem
} else {
reduce_elem = remote_elem
}
```
We will proceed with a brief explanation of some of the arguments passed in.
Note that this section introduces the concept of a logical_lane_id, which
must be distinguished from the physical lane_id as defined by NVIDIA.
1. //logical_lane_id//: as the name suggests, it refers to the calculated
lane_id (instead of the physical one defined by NVIDIA) that would make
our algorithm logically concise. A thread with logical_lane_id k means
there are (k-1) threads before it.
2. //remote_id-1-threadIdx.x//: remote_id is the NVIDIA-defined lane id of
the remote lane from which we will retrieve the ReduceData. We subtract
(threadIdx.x+1) from it because we would like to maintain only one
underlying shuffle intrinsic (which is used to communicate among lanes in a
warp). The particular shuffle intrinsic we use accepts only offsets, not
absolute lane ids. Therefore the subtraction converts the calculated
absolute lane_id into an offset.
This algorithm is slightly different in two ways, and it is not, conceptually,
a generalization of the above algorithms.
1. It reduces elements close to each other. For instance, values in the 0th
lane are to be combined with those of the 1st lane; values in the 2nd lane are
to be combined with those of the 3rd lane. We did not use the previous
algorithm, where the first half of the (partial) warp is reduced with the
second half of the (partial) warp. This is because the mapping
f(x): logical_lane_id -> physical_lane_id
can be easily calculated, whereas its inverse
f^-1(x): physical_lane_id -> logical_lane_id
cannot, and performing such a reduction requires the inverse to be known.
2. Because this algorithm is agnostic about the positions of the active lanes,
we do not need to perform the copying step as in the second algorithm.
An illustrative run would look like
{F76}
As observed, overhead is high because in each and every step of the reduction,
logical_lane_id is recalculated, and so is remote_id.
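As an aside, this bookkeeping can be expressed compactly with warp vote and
bit-count intrinsics. The following hedged CUDA sketch (a hypothetical helper,
not a runtime entry point, assuming a 1D thread layout) derives the logical
lane id and the next active lane from the current active mask:
```
// Hypothetical sketch of the dispersed-lane bookkeeping: count the active
// lanes before the current one to obtain a logical lane id, and find the
// physical id of the next active lane after the current one (0 if none).
__device__ void dispersed_lane_info(unsigned &logical_lane_id,
                                    int &next_active_lane) {
  unsigned mask = __activemask();               // currently active lanes
  unsigned lane = threadIdx.x % 32;             // physical lane id
  unsigned lower = mask & ((1u << lane) - 1);   // active lanes before me
  logical_lane_id = 2 * __popc(lower);          // as used by the algorithm
  unsigned higher = mask & ~((2u << lane) - 1); // active lanes after me
  next_active_lane = higher ? (__ffs(higher) - 1) : 0;
}
```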
On a block level, we have implemented the following block reduce algorithm:
```
gpu_irregular_block_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shuflReduceFn,
kmp_InterWarpCopyFctPtr interWarpCpyFn,
int size) {
int wid = threadIdx.x/WARPSIZE;
int lane_id = threadIdx.x%WARPSIZE;
int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
unsigned tnum = __ballot(1);
int thread_num = __popc(tnum);
//full warp reduction
if (thread_num == WARPSIZE) {
gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
}
//partial warp reduction
if (thread_num < WARPSIZE) {
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
lane_id);
}
//Gather all the reduced values from each warp
//to the first warp
//named_barrier inside this function to ensure
//correctness. It is effectively a sync_thread
//that won't deadlock.
interWarpCpyFn(reduce_data, warp_needed);
//This is to reduce data gathered from each "warp master".
if (wid==0) {
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
lane_id);
}
return;
}
```
In this function, no ShuffleReduceFn is called directly; instead, it calls
various versions of the warp-reduction functions. It first reduces
ReduceData warp by warp; in the end, we end up with a number of
ReduceData equal to the number of warps present in this thread
block. We then proceed to gather all such ReduceData to the first warp.
As observed, in this algorithm we make use of the function InterWarpCpyFn,
which copies data from each of the "warp masters" (the 0th lane of each warp,
where a warp-reduced ReduceData is held) to the 0th warp. This step reduces
(in a mathematical sense) the problem of reduction across warp masters in a
block to the problem of warp reduction, to which we already have solutions.
We can thus completely avoid the use of atomics to reduce in a threadblock.
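Putting the warp and inter-warp steps together, a simplified, self-contained
CUDA sketch of this structure (assuming a 1D block whose size is a multiple of
32, one double per thread, and summation; this is not the runtime's kmpc entry
point) looks like:
```
// Illustrative sketch of the warp-then-interwarp block reduction described
// above; thread 0 of the block ends up holding the block-wide sum.
__device__ double block_sum(double val) {
  __shared__ double warp_partials[32];  // one slot per "warp master"
  int lane = threadIdx.x % 32;
  int wid = threadIdx.x / 32;
  // Step 1: each warp reduces its own values with shuffles.
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  // Step 2: the warp masters publish their partial sums (the role played
  // by InterWarpCpyFn in the runtime, here via plain shared memory).
  if (lane == 0)
    warp_partials[wid] = val;
  __syncthreads();
  // Step 3: the first warp reduces the per-warp partial sums.
  int num_warps = (blockDim.x + 31) / 32;
  if (wid == 0) {
    val = (lane < num_warps) ? warp_partials[lane] : 0.0;
    for (int offset = 16; offset > 0; offset /= 2)
      val += __shfl_down_sync(0xffffffffu, val, offset);
  }
  return val;
}
```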
**Efficient Cross Block Reduce**
The next challenge is to reduce values across threadblocks. We aim to do this
without atomics or critical sections.
Let a kernel be started with TB threadblocks.
Let the GPU have S SMs.
There can be at most N active threadblocks per SM at any time.
Consider a threadblock tb (tb < TB) running on SM s (s < S). 'tb' is one of
at most 'N' active threadblocks on SM s. Let each threadblock active on an SM
be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id)
uniquely identifies an active threadblock on the GPU.
To implement cross-block reduction efficiently, we first allocate, for each
value to be reduced, an array of size S*N (the maximum number of threadblocks
active at any time on the device).
Each threadblock reduces its value to slot [s][id]. This can be done without
locking since no other threadblock can write to the same slot concurrently.
As a final stage, we reduce the values in the array as follows:
```
// Compiler generated wrapper function for each target region with a reduction
clause.
target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1
thread.
// Use dynamic parallelism to launch M teams, N threads as requested by the
user to execute the target region.
target_function<<M, N>>(map_args)
Reduce values in reduction_array
```
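For illustration, the per-block store into the scratch array might look like
the hedged CUDA sketch below. get_smid() reads the %smid register (as in the
NVPTX target layer), while instance_id stands in for the per-SM instance
bookkeeping, which is not shown here.
```
// Conceptual sketch of the lock-free slot write described above. Each
// active threadblock owns exactly one (SM, instance) slot, so no two
// blocks ever store to the same location concurrently.
__device__ unsigned get_smid() {
  unsigned id;
  asm("mov.u32 %0, %%smid;" : "=r"(id));
  return id;
}

__device__ void publish_block_result(double *scratch, int N, int instance_id,
                                     double block_result) {
  if (threadIdx.x == 0)
    scratch[get_smid() * N + instance_id] = block_result;  // slot [s][id]
}
```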
**Comparison with Last Version**
The (simplified) pseudo code generated by LLVM on the host is as follows:
```
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
where:
struct ReduceData {
double *foo;
double *bar;
} reduceData
reduceData.foo = &foo_p
reduceData.bar = &bar_p
reduceFn is a pointer to a function that takes in two inputs
of type ReduceData, "reduces" them element-wise, and places the
result in the first input:
reduceFn(ReduceData *a, ReduceData *b)
a = a @ b
Every thread in the parallel region calls kmpc_reduce_nowait with
its private copy of reduceData. The runtime reduces across the
threads (using tree reduction on the operator 'reduceFn') and stores
the final result in the master thread if successful.
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
5. else if ret == 2:
In this case kmpc_reduce_nowait() could not use tree reduction,
so use atomics instead:
each thread atomically writes to foo
each thread atomically writes to bar
```
On a GPU, a similar reduction may need to be performed across SIMT threads,
warps, and threadblocks. The challenge is to do so efficiently in a fashion
that is compatible with the LLVM OpenMP implementation.
In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs,
the salient steps of the code generated are as follows:
```
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
status = can_block_reduce()
if status == 1:
reduce efficiently to thread 0 using shuffles and shared memory.
return 1
else
cannot use efficient block reduction, fallback to atomics
return 2
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
5. else if ret == 2:
In this case kmpc_reduce_nowait() could not use tree reduction,
so use atomics instead:
each thread atomically writes to foo
each thread atomically writes to bar
```
The function can_block_reduce() is defined as follows:
```
int32_t can_block_reduce() {
int tid = GetThreadIdInTeam();
int nt = GetNumberOfOmpThreads(tid);
if (nt != blockDim.x)
return 0;
unsigned tnum = __ballot(1);
if (tnum != (~0x0)) {
return 0;
}
return 1;
}
```
This function permits the use of the efficient block reduction algorithm
using shuffles and shared memory (return 1) only if (a) all SIMT threads in
a warp are active (i.e., the number of threads in the parallel region is a
multiple of 32) and (b) the number of threads in the parallel region
(set by the num_threads clause) equals blockDim.x.
If either of these preconditions is not true, each thread in the threadblock
updates the global value using atomics.
Atomics and compare-and-swap operations are expensive on many-threaded
architectures such as GPUs, and we must avoid them completely.
**Appendix: Implementation Details**
```
// Compiler generated function.
reduceFn(ReduceData *a, ReduceData *b)
a->foo = a->foo + b->foo
a->bar = a->bar + b->bar
// Compiler generated function.
swapAndReduceFn(ReduceData *thread_private, int lane)
ReduceData *remote = new ReduceData()
remote->foo = shuffle_double(thread_private->foo, lane)
remote->bar = shuffle_double(thread_private->bar, lane)
reduceFn(thread_private, remote)
// OMP runtime function.
warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn):
offset = 16
while (offset > 0)
swapAndReduceFn(thread_private, offset)
offset /= 2
// OMP runtime function.
warpReduce_irregular():
...
// OMP runtime function.
kmpc_reduce_warp(reduceData, swapAndReduceFn)
if all_lanes_active:
warpReduce_regular(reduceData, swapAndReduceFn)
else:
warpReduce_irregular(reduceData, swapAndReduceFn)
if in_simd_region:
// all done, reduce to global in simd lane 0
return 1
else if in_parallel_region:
// done reducing to one value per warp, now reduce across warps
return 3
// OMP runtime function; one for each basic type.
kmpc_reduce_block_double(double *a)
if lane == 0:
shared[wid] = *a
named_barrier(1, num_threads)
if wid == 0
block_reduce(shared)
if lane == 0
*a = shared[0]
named_barrier(1, num_threads)
if wid == 0 and lane == 0
return 1 // write back reduced result
else
return 0 // don't do anything
```
```
// Compiler generated code.
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn)
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
5. else if ret == 3:
ret = block_reduce_double(reduceData.foo)
if ret == 1:
foo += reduceData.foo
ret = block_reduce_double(reduceData.bar)
if ret == 1:
bar += reduceData.bar
```
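The shuffle_double helper used above hides the fact that warp shuffles
historically operated on 32-bit registers. A hedged CUDA sketch of one way to
implement it (shuffling a double as two 32-bit halves; modern CUDA can also
shuffle doubles directly) is:
```
// Sketch of shuffle_double: split the 64-bit value into two 32-bit halves,
// shuffle each half down by 'offset' lanes, and reassemble the double.
__device__ double shuffle_double_down(double val, int offset) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down_sync(0xffffffffu, lo, offset);
  hi = __shfl_down_sync(0xffffffffu, hi, offset);
  return __hiloint2double(hi, lo);
}
```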
**Notes**
1. This scheme requires that the CUDA OMP runtime can call LLVM-generated
functions. This functionality now works.
2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery
(including calls through function pointers) is optimized away.
3. If we are reducing multiple variables in a parallel region, the reduce
operations are all performed in warpReduce_[ir]regular(). This results in
more instructions in the loop and should result in fewer stalls due to data
dependencies. Unfortunately, we cannot do the same in
kmpc_reduce_block_double() without increasing shared memory usage.

View File

@ -1,17 +0,0 @@
//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _NVPTX_INTERFACE_H_
#define _NVPTX_INTERFACE_H_
#include <stdint.h>
#define EXTERN extern "C"
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
#endif

View File

@ -1,198 +0,0 @@
//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "target_impl.h"
#include "target_interface.h"
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
uint64_t val;
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
return val;
}
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
__kmpc_impl_lanemask_t res;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
return res;
}
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
__kmpc_impl_lanemask_t res;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
return res;
}
EXTERN uint32_t __kmpc_impl_smid() {
uint32_t id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
return id;
}
EXTERN double __kmpc_impl_get_wtick() {
// Timer precision is 1ns
return ((double)1E-9);
}
EXTERN double __kmpc_impl_get_wtime() {
unsigned long long nsecs;
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
return (double)nsecs * __kmpc_impl_get_wtick();
}
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
unsigned int Mask;
asm volatile("activemask.b32 %0;" : "=r"(Mask));
return Mask;
}
EXTERN void __kmpc_impl_syncthreads() {
int barrier = 2;
asm volatile("barrier.sync %0;"
:
: "r"(barrier)
: "memory");
}
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
__nvvm_bar_warp_sync(Mask);
}
// NVPTX specific kernel initialization
EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
}
// Barrier until num_threads arrive.
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
int barrier = 1;
asm volatile("barrier.sync %0, %1;"
:
: "r"(barrier), "r"(num_threads)
: "memory");
}
EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
// Calls to the NVPTX layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block() {
return __nvvm_read_ptx_sreg_tid_x();
}
EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
EXTERN int __kmpc_get_hardware_num_blocks() {
return __nvvm_read_ptx_sreg_nctaid_x();
}
EXTERN int __kmpc_get_hardware_num_threads_in_block() {
return __nvvm_read_ptx_sreg_ntid_x();
}
EXTERN unsigned __kmpc_get_warp_size() { return WARPSIZE; }
EXTERN unsigned GetWarpId() {
return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
}
EXTERN unsigned GetLaneId() {
return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
}
// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
return __nvvm_atom_inc_gen_ui(Address, Val);
}
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
uint32_t R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
(void)__atomic_compare_exchange(Address, &Compare, &Val, false,
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
return Compare;
}
unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
unsigned long long Val) {
unsigned long long R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
unsigned long long Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
#define __OMP_SPIN 1000
#define UNSET 0u
#define SET 1u
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
}
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
}
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
// TODO: not sure spinning is a good idea here..
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
int32_t start = __nvvm_read_ptx_sreg_clock();
int32_t now;
for (;;) {
now = __nvvm_read_ptx_sreg_clock();
int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
break;
}
}
} // wait for 0 to be the read value
}
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
(void)__kmpc_atomic_exchange(lock, UNSET);
}
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
return __kmpc_atomic_add(lock, 0u);
}
extern "C" {
void *malloc(size_t);
void free(void *);
int32_t vprintf(const char *, void *);
}
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
EXTERN void __kmpc_impl_free(void *x) { free(x); }
EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
uint32_t) {
return vprintf(Format, Arguments);
}
#pragma omp end declare target

View File

@ -1,89 +0,0 @@
//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#ifndef _TARGET_IMPL_H_
#define _TARGET_IMPL_H_
#include "nvptx_interface.h"
#include <stddef.h>
#include <stdint.h>
// subset of inttypes.h
#define PRId64 "ld"
#define PRIu64 "lu"
typedef uint32_t __kmpc_impl_lanemask_t;
#define INLINE inline __attribute__((always_inline))
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
#define PLUGIN_ACCESSIBLE /* no annotation needed for cuda plugin */
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
INLINE constexpr const llvm::omp::GV &getGridValue() {
return llvm::omp::NVPTXGridValues;
}
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#else
#define OMP_STATE_COUNT 16
#endif
#if !defined(MAX_SM)
#if __CUDA_ARCH__ >= 900
#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
#elif __CUDA_ARCH__ >= 800
// The GA100 design has a maximum of 128 SMs, but the A100 product only has 108 SMs.
// The GA102 design has a maximum of 84 SMs.
#define MAX_SM 108
#elif __CUDA_ARCH__ >= 700
#define MAX_SM 84
#elif __CUDA_ARCH__ >= 600
#define MAX_SM 56
#else
#define MAX_SM 16
#endif
#endif
#define OMP_ACTIVE_PARALLEL_LEVEL 128
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {
__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
};
#define printf(...)
#endif

View File

@ -1,25 +0,0 @@
if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang")
# Silently return, no need to annoy the user.
return()
endif()
set(deps omptarget omp)
if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB)
set(deps ${deps} omptarget-nvptx-bc)
endif()
# Run with only one thread to launch only one application to the GPU at a time.
add_openmp_testsuite(check-libomptarget-nvptx
"Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR}
EXCLUDE_FROM_CHECK_ALL
DEPENDS ${deps} ARGS -j1)
set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING
"Extra compiler flags to send to the test compiler.")
set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS
"-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING
"OpenMP compiler flags to use for testing libomptarget-nvptx.")
# Configure the lit.site.cfg.in file
set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!")
configure_file(lit.site.cfg.in lit.site.cfg @ONLY)

View File

@ -1,22 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(){
int max_threads = -1;
int num_threads = -1;
#pragma omp target map(tofrom: max_threads)
max_threads = omp_get_max_threads();
#pragma omp target parallel map(tofrom: num_threads)
{
#pragma omp master
num_threads = omp_get_num_threads();
}
// CHECK: Max Threads: 128, Num Threads: 128
printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads);
return 0;
}

View File

@ -1,38 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
const int MaxThreads = 1024;
int main(int argc, char *argv[]) {
int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1;
#pragma omp target map(cancellation, dynamic, nested, maxActiveLevels)
{
// libomptarget-nvptx doesn't support cancellation.
cancellation = omp_get_cancellation();
// No support for dynamic adjustment of the number of threads.
omp_set_dynamic(1);
dynamic = omp_get_dynamic();
// libomptarget-nvptx doesn't support nested parallelism.
omp_set_nested(1);
nested = omp_get_nested();
omp_set_max_active_levels(42);
maxActiveLevels = omp_get_max_active_levels();
}
// CHECK: cancellation = 0
printf("cancellation = %d\n", cancellation);
// CHECK: dynamic = 0
printf("dynamic = %d\n", dynamic);
// CHECK: nested = 0
printf("nested = %d\n", nested);
// CHECK: maxActiveLevels = 1
printf("maxActiveLevels = %d\n", maxActiveLevels);
return 0;
}

View File

@ -1,53 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int MaxThreadsL1 = -1, MaxThreadsL2 = -1;
#pragma omp declare reduction(unique:int \
: omp_out = (omp_in == 1 ? omp_in : omp_out)) \
initializer(omp_priv = -1)
// Non-SPMD mode.
#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \
num_teams(1)
{
MaxThreadsL1 = omp_get_max_threads();
#pragma omp parallel reduction(unique : MaxThreadsL2)
{ MaxThreadsL2 = omp_get_max_threads(); }
}
//FIXME: This Non-SPMD kernel will have 32 active threads due to
// thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of
// threads in block (64 in this case), which translates to worker
// threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD
// kernels. According to the spec, omp_get_max_threads must return the
// max active threads possible between the two kernel types.
// CHECK: Non-SPMD MaxThreadsL1 = 64
printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1);
// CHECK: Non-SPMD MaxThreadsL2 = 1
printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2);
// SPMD mode with full runtime
MaxThreadsL2 = -1;
#pragma omp target parallel reduction(unique : MaxThreadsL2)
{ MaxThreadsL2 = omp_get_max_threads(); }
// CHECK: SPMD with full runtime MaxThreadsL2 = 1
printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
// SPMD mode without runtime
MaxThreadsL2 = -1;
#pragma omp target parallel for reduction(unique : MaxThreadsL2)
for (int I = 0; I < 2; ++I) {
MaxThreadsL2 = omp_get_max_threads();
}
// CHECK: SPMD without runtime MaxThreadsL2 = 1
printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
return 0;
}

View File

@ -1,72 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1;
#pragma omp declare reduction(unique64:int \
: omp_out = (omp_in == 64 ? omp_in : omp_out)) \
initializer(omp_priv = -1)
#pragma omp declare reduction(unique32:int \
: omp_out = (omp_in == 32 ? omp_in : omp_out)) \
initializer(omp_priv = -1)
// Non-SPMD mode.
#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \
thread_limit(64) num_teams(1)
{
ThreadLimitL0 = omp_get_thread_limit();
#pragma omp parallel reduction(unique64 \
: ThreadLimitL1, ThreadLimitL2) num_threads(32)
{
ThreadLimitL1 = omp_get_thread_limit();
#pragma omp parallel reduction(unique64 : ThreadLimitL2)
{ ThreadLimitL2 = omp_get_thread_limit(); }
}
}
// CHECK: Non-SPMD ThreadLimitL0 = 64
printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0);
// CHECK: Non-SPMD ThreadLimitL1 = 64
printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1);
// CHECK: Non-SPMD ThreadLimitL2 = 64
printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2);
// SPMD mode with full runtime
ThreadLimitL1 = -1;
ThreadLimitL2 = -1;
#pragma omp target parallel reduction(unique32 \
: ThreadLimitL1, ThreadLimitL2) \
num_threads(32)
{
ThreadLimitL1 = omp_get_thread_limit();
#pragma omp parallel reduction(unique32 : ThreadLimitL2)
{ ThreadLimitL2 = omp_get_thread_limit(); }
}
// CHECK: SPMD with full runtime ThreadLimitL1 = 32
printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
// CHECK: SPMD with full runtime ThreadLimitL2 = 32
printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
// SPMD mode without runtime
ThreadLimitL1 = -1;
ThreadLimitL2 = -1;
#pragma omp target parallel for reduction(unique32 \
: ThreadLimitL1, ThreadLimitL2) \
num_threads(32)
for (int I = 0; I < 2; ++I) {
ThreadLimitL1 = omp_get_thread_limit();
#pragma omp parallel reduction(unique32 : ThreadLimitL2)
{ ThreadLimitL2 = omp_get_thread_limit(); }
}
// CHECK: SPMD without runtime ThreadLimitL1 = 32
printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
// CHECK: SPMD without runtime ThreadLimitL2 = 32
printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
return 0;
}

View File

@ -1,55 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
#pragma omp declare target
static void putValueInParallel(int *ptr, int value) {
#pragma omp parallel
{
*ptr = value;
}
}
static int getId() {
int id;
putValueInParallel(&id, omp_get_thread_num());
return id;
}
#pragma omp end declare target
const int MaxThreads = 1024;
const int Threads = 64;
int main(int argc, char *argv[]) {
int master;
int check[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check[i] = 0;
}
#pragma omp target map(master, check[:])
{
master = getId();
#pragma omp parallel num_threads(Threads)
{
check[omp_get_thread_num()] = getId();
}
}
// CHECK: master = 0.
printf("master = %d.\n", master);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
if (i < Threads) {
if (check[i] != i) {
printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]);
}
} else if (check[i] != 0) {
printf("invalid: check[%d] should be 0, is %d\n", i, check[i]);
}
}
return 0;
}

View File

@ -1,76 +0,0 @@
# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
# Configuration file for the 'lit' test runner.
import os
import lit.formats
# Tell pylint that we know config and lit_config exist somewhere.
if 'PYLINT_IMPORT' in os.environ:
config = object()
lit_config = object()
def prepend_library_path(name, value, sep):
if name in config.environment:
config.environment[name] = value + sep + config.environment[name]
else:
config.environment[name] = value
# name: The name of this test suite.
config.name = 'libomptarget-nvptx'
# suffixes: A list of file extensions to treat as test files.
config.suffixes = ['.c', '.cpp', '.cc']
# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)
# test_exec_root: The root object directory where output is placed
config.test_exec_root = config.binary_dir
# test format
config.test_format = lit.formats.ShTest()
# compiler flags
config.test_flags = " -I " + config.omp_header_directory + \
" -L " + config.library_dir
if config.omp_host_rtl_directory:
config.test_flags = config.test_flags + \
" -L " + config.omp_host_rtl_directory
config.test_flags = config.test_flags + " " + config.test_extra_flags
# Setup environment to find dynamic library at runtime.
prepend_library_path('LIBRARY_PATH', config.library_dir, ":")
prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":")
if config.cuda_libdir:
prepend_library_path('LD_LIBRARY_PATH', config.cuda_libdir, ":")
# Forbid fallback to host.
config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY"
# substitutions
config.substitutions.append(("%compilexx-run-and-check",
"%compilexx-and-run | " + config.libomptarget_filecheck + " %s"))
config.substitutions.append(("%compile-run-and-check",
"%compile-and-run | " + config.libomptarget_filecheck + " %s"))
config.substitutions.append(("%compilexx-and-run", "%compilexx && %run"))
config.substitutions.append(("%compile-and-run", "%compile && %run"))
config.substitutions.append(("%compilexx",
"%clangxx %openmp_flags %cuda_flags %flags %s -o %t"))
config.substitutions.append(("%compile",
"%clang %openmp_flags %cuda_flags %flags %s -o %t"))
config.substitutions.append(("%clangxx", config.test_cxx_compiler))
config.substitutions.append(("%clang", config.test_c_compiler))
config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
if config.cuda_path:
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
else:
config.substitutions.append(("%cuda_flags", ""))
config.substitutions.append(("%flags", config.test_flags))
config.substitutions.append(("%run", "%t"))
config.substitutions.append(("%not", config.libomptarget_not))

View File

@ -1,17 +0,0 @@
@AUTO_GEN_COMMENT@
config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@"
config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@"
config.cuda_path = "@CUDA_TOOLKIT_ROOT_DIR@"
config.cuda_libdir = "@CUDA_LIBDIR@"
config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@"
config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
# Let the main config do the real work.
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")

View File

@ -1,37 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int data, out, flag = 0;
#pragma omp target teams num_teams(2) map(tofrom \
: out) map(to \
: data, flag) \
thread_limit(1)
#pragma omp parallel num_threads(1)
{
if (omp_get_team_num() == 0) {
/* Write to the data buffer that will be read by thread in team 1 */
data = 42;
/* Flush data to thread in team 1 */
#pragma omp barrier
/* Set flag to release thread in team 1 */
#pragma omp atomic write
flag = 1;
} else if (omp_get_team_num() == 1) {
/* Loop until we see the update to the flag */
int val;
do {
#pragma omp atomic read
val = flag;
} while (val < 1);
out = data;
#pragma omp barrier
}
}
// CHECK: out=42.
/* Value of out will be 42 */
printf("out=%d.\n", out);
return !(out == 42);
}

View File

@ -1,35 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int data, out, flag = 0;
#pragma omp target parallel num_threads(64) map(tofrom \
: out, flag) map(to \
: data)
{
if (omp_get_thread_num() == 0) {
/* Write to the data buffer that will be read by thread */
data = 42;
/* Flush data to thread 32 */
#pragma omp flush(data)
/* Set flag to release thread 32 */
#pragma omp atomic write
flag = 1;
} else if (omp_get_thread_num() == 32) {
/* Loop until we see the update to the flag */
int val;
do {
#pragma omp atomic read
val = flag;
} while (val < 1);
out = data;
#pragma omp flush(out)
}
}
// CHECK: out=42.
/* Value of out will be 42 */
printf("out=%d.\n", out);
return !(out == 42);
}

View File

@ -1,151 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
const int MaxThreads = 1024;
const int NumThreads = 64;
int main(int argc, char *argv[]) {
int level = -1, activeLevel = -1;
// The expected value is -1, initialize to different value.
int ancestorTNumNeg = 1, teamSizeNeg = 1;
int ancestorTNum0 = -1, teamSize0 = -1;
// The expected value is -1, initialize to different value.
int ancestorTNum1 = 1, teamSize1 = 1;
int check1[MaxThreads];
int check2[MaxThreads];
int check3[MaxThreads];
int check4[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = check3[i] = check4[i] = 0;
}
#pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \
map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \
map(check1[:], check2[:], check3[:], check4[:])
{
level = omp_get_level();
activeLevel = omp_get_active_level();
// Expected to return -1.
ancestorTNumNeg = omp_get_ancestor_thread_num(-1);
teamSizeNeg = omp_get_team_size(-1);
// Expected to return 0 and 1.
ancestorTNum0 = omp_get_ancestor_thread_num(0);
teamSize0 = omp_get_team_size(0);
// Expected to return -1 because the requested level is larger than
// the nest level.
ancestorTNum1 = omp_get_ancestor_thread_num(1);
teamSize1 = omp_get_team_size(1);
// Expecting active parallel region.
#pragma omp parallel num_threads(NumThreads)
{
int id = omp_get_thread_num();
// Multiply the return value of omp_get_level by 5 to avoid having this test
// pass if both API calls return wrong values.
check1[id] += omp_get_level() * 5 + omp_get_active_level();
// Expected to return 0 and 1.
check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
// Expected to return the current thread num.
check2[id] += (omp_get_ancestor_thread_num(1) - id);
// Expected to return the current number of threads.
check2[id] += 3 * omp_get_team_size(1);
// Expected to return -1, see above.
check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2);
// Expecting serialized parallel region.
#pragma omp parallel
{
#pragma omp atomic
check3[id] += omp_get_level() * 5 + omp_get_active_level();
// Expected to return 0 and 1.
int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
// Expected to return the parent thread num.
check4Inc += (omp_get_ancestor_thread_num(1) - id);
// Expected to return the number of threads in the active parallel region.
check4Inc += 3 * omp_get_team_size(1);
// Expected to return 0 and 1.
check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2);
// Expected to return -1, see above.
check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3);
#pragma omp atomic
check4[id] += check4Inc;
}
}
}
// CHECK: target: level = 0, activeLevel = 0
printf("target: level = %d, activeLevel = %d\n", level, activeLevel);
// CHECK: level = -1: ancestorTNum = -1, teamSize = -1
printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg);
// CHECK: level = 0: ancestorTNum = 0, teamSize = 1
printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0);
// CHECK: level = 1: ancestorTNum = -1, teamSize = -1
printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
// Check active parallel region:
// omp_get_level() = 1, omp_get_active_level() = 1
const int Expected1 = 6;
if (i < NumThreads) {
if (check1[i] != Expected1) {
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
// 5 * 1 + 3 * 64 - 1 - 1 (see above)
const int Expected2 = 195;
if (i < NumThreads) {
if (check2[i] != Expected2) {
printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
// Check serialized parallel region:
// omp_get_level() = 2, omp_get_active_level() = 1
const int Expected3 = 11;
if (i < NumThreads) {
if (check3[i] != Expected3) {
printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]);
}
} else if (check3[i] != 0) {
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
}
// 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above)
const int Expected4 = 198;
if (i < NumThreads) {
if (check4[i] != Expected4) {
printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]);
}
} else if (check4[i] != 0) {
printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
}
}
// Check the parallel level in non-SPMD kernels.
level = 0;
#pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
for (int i=0; i<5032; i+=32) {
int ub = (i+32 > 5032) ? 5032 : i+32;
#pragma omp parallel for schedule(dynamic)
for (int j=i ; j < ub; j++) ;
level += omp_get_level();
}
// CHECK: Integral level = 0.
printf("Integral level = %d.\n", level);
return 0;
}

View File

@ -1,136 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
const int MaxThreads = 1024;
const int NumThreads = 64;
const int NumThreads1 = 1;
int main(int argc, char *argv[]) {
int inParallel = -1, numThreads = -1, threadNum = -1;
int check1[MaxThreads];
int check2[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = 0;
}
#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
{
inParallel = omp_in_parallel();
numThreads = omp_get_num_threads();
threadNum = omp_get_thread_num();
// Expecting active parallel region.
#pragma omp parallel num_threads(NumThreads)
{
int id = omp_get_thread_num();
check1[id] += omp_get_num_threads() + omp_in_parallel();
// Expecting serialized parallel region.
#pragma omp parallel
{
// Expected to be 1.
int nestedInParallel = omp_in_parallel();
// Expected to be 1.
int nestedNumThreads = omp_get_num_threads();
// Expected to be 0.
int nestedThreadNum = omp_get_thread_num();
#pragma omp atomic
check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
}
}
}
// CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
inParallel, numThreads, threadNum);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
// Check that all threads reported
// omp_get_num_threads() = 64, omp_in_parallel() = 1.
int Expected = NumThreads + 1;
if (i < NumThreads) {
if (check1[i] != Expected) {
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
// Check serialized parallel region.
if (i < NumThreads) {
if (check2[i] != 2) {
printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
}
inParallel = -1;
numThreads = -1;
threadNum = -1;
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = 0;
}
#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
{
inParallel = omp_in_parallel();
numThreads = omp_get_num_threads();
threadNum = omp_get_thread_num();
// Expecting active parallel region.
#pragma omp parallel num_threads(NumThreads1)
{
int id = omp_get_thread_num();
check1[id] += omp_get_num_threads() + omp_in_parallel();
// Expecting serialized parallel region.
#pragma omp parallel
{
// Expected to be 0.
int nestedInParallel = omp_in_parallel();
// Expected to be 1.
int nestedNumThreads = omp_get_num_threads();
// Expected to be 0.
int nestedThreadNum = omp_get_thread_num();
#pragma omp atomic
check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
}
}
}
// CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
inParallel, numThreads, threadNum);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
// Check that all threads reported
// omp_get_num_threads() = 1, omp_in_parallel() = 0.
int Expected = 1;
if (i < NumThreads1) {
if (check1[i] != Expected) {
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
// Check serialized parallel region.
if (i < NumThreads1) {
if (check2[i] != 1) {
printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
}
return 0;
}

View File

@ -1,102 +0,0 @@
// RUN: %compile-run-and-check
#include <stdio.h>
#include <omp.h>
const int WarpSize = 32;
const int NumThreads1 = 1 * WarpSize;
const int NumThreads2 = 2 * WarpSize;
const int NumThreads3 = 3 * WarpSize;
const int MaxThreads = 1024;
int main(int argc, char *argv[]) {
int check1[MaxThreads];
int check2[MaxThreads];
int check3[MaxThreads];
int check4[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = check3[i] = check4[i] = 0;
}
int maxThreads1 = -1;
int maxThreads2 = -1;
int maxThreads3 = -1;
#pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
map(maxThreads1, maxThreads2, maxThreads3)
{
#pragma omp parallel num_threads(NumThreads1)
{
check1[omp_get_thread_num()] += omp_get_num_threads();
}
// API method to set number of threads in parallel regions without
// num_threads() clause.
omp_set_num_threads(NumThreads2);
maxThreads1 = omp_get_max_threads();
#pragma omp parallel
{
check2[omp_get_thread_num()] += omp_get_num_threads();
}
maxThreads2 = omp_get_max_threads();
// num_threads() clause should override nthreads-var ICV.
#pragma omp parallel num_threads(NumThreads3)
{
check3[omp_get_thread_num()] += omp_get_num_threads();
}
maxThreads3 = omp_get_max_threads();
// Effect from omp_set_num_threads() should still be visible.
#pragma omp parallel
{
check4[omp_get_thread_num()] += omp_get_num_threads();
}
}
// CHECK: maxThreads1 = 64
printf("maxThreads1 = %d\n", maxThreads1);
// CHECK: maxThreads2 = 64
printf("maxThreads2 = %d\n", maxThreads2);
// CHECK: maxThreads3 = 64
printf("maxThreads3 = %d\n", maxThreads3);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
if (i < NumThreads1) {
if (check1[i] != NumThreads1) {
printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
if (i < NumThreads2) {
if (check2[i] != NumThreads2) {
printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
if (i < NumThreads3) {
if (check3[i] != NumThreads3) {
printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
}
} else if (check3[i] != 0) {
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
}
if (i < NumThreads2) {
if (check4[i] != NumThreads2) {
printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
}
} else if (check4[i] != 0) {
printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
}
}
return 0;
}

View File

@ -1,51 +0,0 @@
// RUN: %compilexx-run-and-check
#include <stdio.h>
#include <omp.h>
int main(void) {
int isHost = -1;
int ParallelLevel1 = -1, ParallelLevel2 = -1;
int Count = 0;
#pragma omp target parallel for map(tofrom \
: isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
for (int J = 0; J < 10; ++J) {
#pragma omp critical
{
isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
? omp_get_level()
: ParallelLevel1;
}
if (omp_get_thread_num() > 5) {
int L2;
#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
for (int I = 0; I < 10; ++I) {
L2 = omp_get_level();
Count += omp_get_level(); // (10-6)*10*2 = 80
}
#pragma omp critical
ParallelLevel2 =
(ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
} else {
Count += omp_get_level(); // 6 * 1 = 6
}
}
if (isHost < 0) {
printf("Runtime error, isHost=%d\n", isHost);
}
// CHECK: Target region executed on the device
printf("Target region executed on the %s\n", isHost ? "host" : "device");
// CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
ParallelLevel2);
// Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
// level) + 6(num of iterations) * 1(par level)
// CHECK: Expected count = 86
printf("Expected count = %d\n", Count);
return isHost;
}

View File

@ -1,77 +0,0 @@
// RUN: %compile-run-and-check
#include <stdio.h>
#include <omp.h>
const int WarpSize = 32;
const int ThreadLimit = 1 * WarpSize;
const int NumThreads2 = 2 * WarpSize;
const int NumThreads3 = 3 * WarpSize;
const int MaxThreads = 1024;
int main(int argc, char *argv[]) {
int check1[MaxThreads];
int check2[MaxThreads];
int check3[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = check3[i] = 0;
}
int threadLimit = -1;
#pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \
map(check1[:], check2[:], check3[:], threadLimit)
{
threadLimit = omp_get_thread_limit();
// All parallel regions should get as many threads as specified by the
// thread_limit() clause.
#pragma omp parallel
{
check1[omp_get_thread_num()] += omp_get_num_threads();
}
omp_set_num_threads(NumThreads2);
#pragma omp parallel
{
check2[omp_get_thread_num()] += omp_get_num_threads();
}
#pragma omp parallel num_threads(NumThreads3)
{
check3[omp_get_thread_num()] += omp_get_num_threads();
}
}
// CHECK: threadLimit = 32
printf("threadLimit = %d\n", threadLimit);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
if (i < ThreadLimit) {
if (check1[i] != ThreadLimit) {
printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
if (i < ThreadLimit) {
if (check2[i] != ThreadLimit) {
printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
if (i < ThreadLimit) {
if (check3[i] != ThreadLimit) {
printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]);
}
} else if (check3[i] != 0) {
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
}
}
return 0;
}
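
All three check arrays above compare against ThreadLimit because, inside the target teams region, every parallel region is clamped to the thread_limit(32) in effect, whether the request comes from omp_set_num_threads(64) or an explicit num_threads(96) clause. A minimal sketch of that clamping rule; the helper name is ours, not a runtime symbol:

#include <stdio.h>

// Illustrative only: the team size a parallel region receives is the
// requested size clamped to the enclosing thread_limit.
static int clamped_team_size(int requested, int thread_limit) {
  return requested < thread_limit ? requested : thread_limit;
}

int main(void) {
  printf("%d\n", clamped_team_size(32, 32)); // check1: default region fills the limit -> 32
  printf("%d\n", clamped_team_size(64, 32)); // check2: omp_set_num_threads(64) -> 32
  printf("%d\n", clamped_team_size(96, 32)); // check3: num_threads(96) -> 32
  return 0;
}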

View File

@ -1,22 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main() {
int res = 0;
#pragma omp parallel num_threads(2) reduction(+:res)
{
int tid = omp_get_thread_num();
#pragma omp target teams distribute reduction(+:res)
for (int i = tid; i < 2; i++)
++res;
}
// The first thread executes 2 iterations and the second executes 1, so the
// expected result of the reduction res is 3 (see the sketch after this file).
// CHECK: res = 3.
printf("res = %d.\n", res);
return 0;
}
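
The expected value follows directly from the two outer threads' iteration ranges: thread 0 covers i = 0 and 1, thread 1 covers only i = 1. A minimal host-only sketch of that arithmetic, given as an illustration rather than part of the deleted test:

#include <stdio.h>

int main(void) {
  int res = 0;
  for (int tid = 0; tid < 2; ++tid)   // the two threads of the outer region
    for (int i = tid; i < 2; ++i)     // each thread's share of the target loop
      ++res;
  printf("res = %d.\n", res); // prints 3 = 2 + 1
  return 0;
}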

View File

@ -1,78 +0,0 @@
//===------------- target_interface.h - Target interfaces --------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains interfaces that must be implemented by each target.
//
//===----------------------------------------------------------------------===//
#ifndef _OMPTARGET_TARGET_INTERFACE_H_
#define _OMPTARGET_TARGET_INTERFACE_H_
#include <stdint.h>
#include "DeviceEnvironment.h"
#include "target_impl.h"
// Calls to the NVPTX layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block();
EXTERN int GetBlockIdInKernel();
EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
EXTERN unsigned __kmpc_get_warp_size();
EXTERN unsigned GetWarpId();
EXTERN unsigned GetLaneId();
// Atomics
uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned long long __kmpc_atomic_exchange(unsigned long long *,
unsigned long long);
unsigned long long __kmpc_atomic_add(unsigned long long *, unsigned long long);
// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_threadfence();
EXTERN void __kmpc_impl_threadfence_block();
EXTERN void __kmpc_impl_threadfence_system();
EXTERN double __kmpc_impl_get_wtick();
EXTERN double __kmpc_impl_get_wtime();
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
EXTERN uint32_t __kmpc_impl_smid();
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
EXTERN void __kmpc_impl_syncthreads();
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
// Kernel initialization
EXTERN void __kmpc_impl_target_init();
// Memory
EXTERN void *__kmpc_impl_malloc(size_t);
EXTERN void __kmpc_impl_free(void *);
// Barrier until num_threads arrive.
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads);
extern DeviceEnvironmentTy omptarget_device_environment;
#endif // _OMPTARGET_TARGET_INTERFACE_H_
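
The deleted header only declares the interface; each target supplied its own definitions elsewhere. As a rough illustration of the contract, and not the actual NVPTX or AMDGPU code, an atomic add matching one of the declarations above could be written with a compiler builtin:

#include <stdint.h>

// Illustrative only: returns the previous value, as atomic fetch-add does,
// using a GCC/Clang builtin rather than any real target's implementation.
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}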

View File

@ -118,6 +118,6 @@ if (${amdgpu_arch_result})
libomptarget_say("Not generating amdgcn test targets as amdgpu-arch exited with ${amdgpu_arch_result}")
else()
# Report to the parent scope that we are building a plugin for amdgpu
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa-newRTL " PARENT_SCOPE)
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa " PARENT_SCOPE)
endif()

View File

@ -72,7 +72,7 @@ target_link_libraries(omptarget.rtl.cuda
# Otherwise this plugin is being built speculatively and there may be no cuda available
if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
libomptarget_say("Enable tests using CUDA plugin")
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-newRTL nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
else()
libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available")
endif()

View File

@ -1,4 +1,4 @@
// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
// RUN: %libomptarget-compile-nvptx64-nvidia-cuda
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
// REQUIRES: nvptx64-nvidia-cuda

View File

@ -104,17 +104,11 @@ else: # Unices
config.test_flags += " --libomptarget-amdgcn-bc-path=" + config.library_dir
if config.libomptarget_current_target.startswith('nvptx'):
config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir
if config.libomptarget_current_target.endswith('-newRTL'):
config.test_flags += " -fopenmp-target-new-runtime"
elif not config.libomptarget_current_target.endswith('-newDriver'):
config.test_flags += " -fno-openmp-target-new-runtime"
if config.libomptarget_current_target.endswith('-newDriver'):
config.test_flags += " -fopenmp-new-driver"
def remove_newRTL_suffix_if_present(name):
if name.endswith('-newRTL'):
return name[:-7]
elif name.endswith('-newDriver'):
def remove_suffix_if_present(name):
if name.endswith('-newDriver'):
return name[:-10]
else:
return name
@ -183,10 +177,10 @@ for libomptarget_target in config.libomptarget_all_targets:
"%not --crash %t"))
config.substitutions.append(("%clangxx-" + libomptarget_target, \
"%clangxx %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
remove_newRTL_suffix_if_present(libomptarget_target)))
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%clang-" + libomptarget_target, \
"%clang %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
remove_newRTL_suffix_if_present(libomptarget_target)))
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%fcheck-" + libomptarget_target, \
config.libomptarget_filecheck + " %s"))
else:

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <cstdio>
#include <cstdlib>

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <cstdio>
#include <cstdlib>

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <stdint.h>

View File

@ -1,7 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
// Error on the gpu that crashes the host
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa
#include <iostream>

View File

@ -3,7 +3,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <omp.h>
#include <stdio.h>

View File

@ -2,7 +2,6 @@
// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver
#include <iostream>

View File

@ -2,7 +2,6 @@
// Currently hangs on amdgpu
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: x86_64-pc-linux-gnu
#include <cassert>

View File

@ -34,7 +34,6 @@
// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver
#if ADD_REDUCTION

View File

@ -2,7 +2,6 @@
// Fails in DAGToDAG on an address space problem
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <cmath>
#include <cstdio>

View File

@ -9,7 +9,6 @@
// amdgpu does not have a working printf definition
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <omp.h>

View File

@ -1,11 +1,10 @@
// RUN: %libomptarget-compile-run-and-check-generic
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newRTL
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newDriver
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
// XFAIL: amdgcn-amd-amdhsa-newDriver
#include <stdio.h>

View File

@ -5,7 +5,6 @@
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <omp.h>
#include <stdio.h>

View File

@ -5,7 +5,6 @@
// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <omp.h>

View File

@ -4,7 +4,6 @@
// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <omp.h>