[OpenMP] Completely remove old device runtime

This patch completely removes the old OpenMP device runtime. Previously,
the new runtime had the prefix `libomptarget-new-` and the old runtime
was simply called `libomptarget-`. This patch makes the formerly new
runtime the only runtime available. The old runtime's project has been
deleted entirely, and all references to the `libomptarget-new-` runtime
have been replaced with `libomptarget-`.
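
Concretely, the user-visible effect is which device bitcode library an
offload compile links. A minimal sketch, assuming an NVPTX offload target
and borrowing the sm_35 architecture and driver flags from the tests
touched below (`foo.c` is a placeholder source file):

  clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
      -Xopenmp-target -march=sm_35 foo.c
  # before this patch (new runtime on by default): links libomptarget-new-nvptx-sm_35.bc
  # after this patch:                              links libomptarget-nvptx-sm_35.bc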

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D118934
Joseph Huber 2022-02-03 14:43:40 -05:00
parent 0cc6165d05
commit 034adaf5be
82 changed files with 38 additions and 8084 deletions

View File

@ -1203,8 +1203,7 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
llvm_unreachable("OpenMP can only handle device code.");
llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
-if (CGM.getLangOpts().OpenMPTargetNewRuntime &&
-    !CGM.getLangOpts().OMPHostIRFile.empty()) {
+if (!CGM.getLangOpts().OMPHostIRFile.empty()) {
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
"__omp_rtl_debug_kind");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,

View File

@ -290,11 +290,7 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
return;
std::string BitcodeSuffix;
-if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                       options::OPT_fno_openmp_target_new_runtime, true))
-  BitcodeSuffix = "new-amdgpu-" + GPUArch;
-else
-  BitcodeSuffix = "amdgcn-" + GPUArch;
+BitcodeSuffix = "amdgcn-" + GPUArch;
addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
getTriple());

View File

@ -5936,13 +5936,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_openmp_cuda_mode, /*Default=*/false))
CmdArgs.push_back("-fopenmp-cuda-mode");
-// When in OpenMP offloading mode, enable or disable the new device
-// runtime.
-if (Args.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                 options::OPT_fno_openmp_target_new_runtime,
-                 /*Default=*/true))
-  CmdArgs.push_back("-fopenmp-target-new-runtime");
// When in OpenMP offloading mode, enable debugging on the device.
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_target_debug_EQ);
if (Args.hasFlag(options::OPT_fopenmp_target_debug,
@ -8187,9 +8180,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ);
std::string BitcodeSuffix;
-if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                   options::OPT_fno_openmp_target_new_runtime, true))
-  BitcodeSuffix += "new-";
if (TC->getTriple().isNVPTX())
BitcodeSuffix += "nvptx-";
else if (TC->getTriple().isAMDGPU())

View File

@ -749,11 +749,7 @@ void CudaToolChain::addClangTargetOptions(
return;
std::string BitcodeSuffix;
-if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                       options::OPT_fno_openmp_target_new_runtime, true))
-  BitcodeSuffix = "new-nvptx-" + GpuArch.str();
-else
-  BitcodeSuffix = "nvptx-" + GpuArch.str();
+BitcodeSuffix = "nvptx-" + GpuArch.str();
addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
getTriple());

View File

@ -3484,9 +3484,6 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
GenerateArg(Args, OPT_fopenmp_version_EQ, Twine(Opts.OpenMP), SA);
}
-if (Opts.OpenMPTargetNewRuntime)
-  GenerateArg(Args, OPT_fopenmp_target_new_runtime, SA);
if (Opts.OpenMPThreadSubscription)
GenerateArg(Args, OPT_fopenmp_assume_threads_oversubscription, SA);
@ -3877,9 +3874,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
Opts.OpenMP && Args.hasArg(options::OPT_fopenmp_enable_irbuilder);
bool IsTargetSpecified =
Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ);
-Opts.OpenMPTargetNewRuntime =
-    Opts.OpenMPIsDevice &&
-    Args.hasArg(options::OPT_fopenmp_target_new_runtime);
Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice;
@ -3927,17 +3921,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
// Set either by a specific value or to a default if not specified.
if (Opts.OpenMPIsDevice && (Args.hasArg(OPT_fopenmp_target_debug) ||
Args.hasArg(OPT_fopenmp_target_debug_EQ))) {
-if (Opts.OpenMPTargetNewRuntime) {
-  Opts.OpenMPTargetDebug = getLastArgIntValue(
-      Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
-  if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
-    Opts.OpenMPTargetDebug = 1;
-} else {
-  Diags.Report(diag::err_drv_debug_no_new_runtime);
-}
+Opts.OpenMPTargetDebug = getLastArgIntValue(
+    Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
+if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
+  Opts.OpenMPTargetDebug = 1;
}
-if (Opts.OpenMPIsDevice && Opts.OpenMPTargetNewRuntime) {
+if (Opts.OpenMPIsDevice) {
if (Args.hasArg(OPT_fopenmp_assume_teams_oversubscription))
Opts.OpenMPTeamSubscription = true;
if (Args.hasArg(OPT_fopenmp_assume_threads_oversubscription))

View File

@ -1,6 +1,6 @@
// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
// RUN: | FileCheck %s
// verify the tools invocations
@ -14,7 +14,7 @@
// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc"
// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget"
-// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
+// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PHASES %s
// phases
// CHECK-PHASES: 0: input, "{{.*}}amdgpu-openmp-toolchain.c", c, (host-openmp)
@ -36,13 +36,13 @@
// CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp)
// handling of --libomptarget-amdgcn-bc-path
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
// CHECK-LIBOMPTARGET: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc"{{.*}}
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
// CHECK-NOGPULIB-NOT: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}libomptarget-amdgcn-gfx803.bc"{{.*}}
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"],
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang",{{.*}} output: "[[HOST_BC:.*]]"
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]"], output: "[[HOST_S:.*]]"
@ -56,13 +56,13 @@
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[HOST_O]]", "[[OFFLOAD_O]]"], output:
// verify the llc is invoked for textual assembly output
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-SAVE-ASM
// CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=asm" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.s"
// CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.o"
// check the handling of -c
-// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-C
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang",
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang",{{.*}}output: "[[HOST_BC:.*]]"
@ -72,8 +72,8 @@
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang::as"
// CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler"
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
// CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"
-// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
+// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
// CHECK-LIB-DEVICE: {{.*}}llvm-link{{.*}}ocml.bc"{{.*}}ockl.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"

View File

@ -155,43 +155,24 @@
// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime -save-temps -no-canonical-prefixes %s 2>&1 \
+// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB %s
/// Specify the directory containing the bitcode lib, check clang picks the right one
// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-DIR %s
-/// Check with the new runtime enabled
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \
-// RUN: -save-temps -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW %s
-/// Check with new runtime and specifying the directory
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget -save-temps \
-// RUN: -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW-DIR %s
/// Create a bogus bitcode library and find it with LIBRARY_PATH
// RUN: env LIBRARY_PATH=%S/Inputs/libomptarget/subdir %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s
// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_35.bc
-// CHK-BCLIB-NEW: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-new-nvptx-test.bc
-// CHK-BCLIB-NEW-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-new-nvptx-sm_35.bc
// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_35.bc
// CHK-BCLIB-NOT: {{error:|warning:}}
@ -204,7 +185,7 @@
// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s
-// CHK-BCLIB-WARN: no library 'libomptarget-new-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library
+// CHK-BCLIB-WARN: no library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library
/// ###########################################################################

View File

@ -1,12 +1,12 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex "__omp_rtl_"
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// expected-no-diagnostics
#ifndef HEADER

View File

@ -38,13 +38,11 @@ endif()
# This is a list of all the targets that are supported/tested right now.
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newDriver")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newDriver")
# Once the plugins for the different targets are validated, they will be added to
@ -81,7 +79,6 @@ set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${LIBOMP_LIBRARY_DIR}" CACHE STRING
# Build offloading plugins and device RTLs if they are available.
add_subdirectory(plugins)
-add_subdirectory(deviceRTLs)
add_subdirectory(DeviceRTL)
add_subdirectory(tools)

View File

@ -180,7 +180,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)
list(APPEND bc_files ${outfile})
endforeach()
set(bclib_name "libomptarget-new-${target_name}-${target_cpu}.bc")
set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@ -212,7 +212,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
set(bclib_target_name "omptarget-new-${target_name}-${target_cpu}-bc")
set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})

View File

@ -1,14 +0,0 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ##===----------------------------------------------------------------------===##
#
# Build a device RTL for each available machine.
#
##===----------------------------------------------------------------------===##
add_subdirectory(amdgcn)
add_subdirectory(nvptx)

View File

@ -1,193 +0,0 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# Build the AMDGCN Device RTL bitcode library using clang -ffreestanding
#
##===----------------------------------------------------------------------===##
set(LIBOMPTARGET_BUILD_AMDGCN_BCLIB FALSE CACHE BOOL
"Can be set to true to enable building this library.")
if (NOT LIBOMPTARGET_BUILD_AMDGCN_BCLIB)
libomptarget_say("Not building AMDGCN device RTL: Disabled by LIBOMPTARGET_BUILD_AMDGCN_BCLIB")
return()
endif()
if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
return()
endif()
# Copied from nvptx CMakeLists
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
set(aux_triple x86_64-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(aux_triple powerpc64le-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(aux_triple aarch64-unknown-linux-gnu)
else()
libomptarget_say("Not building AMDGCN device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
return()
endif()
if (LLVM_DIR)
# Builds that use pre-installed LLVM have LLVM_DIR set.
find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
find_program(LINK_TOOL llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
find_program(OPT_TOOL opt PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
if ((NOT CLANG_TOOL) OR (NOT LINK_TOOL) OR (NOT OPT_TOOL))
libomptarget_say("Not building AMDGCN device RTL. Missing clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} or opt: ${OPT_TOOL}")
return()
else()
libomptarget_say("Building AMDGCN device RTL. Using clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} and opt: ${OPT_TOOL}")
endif()
elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
# LLVM in-tree builds may use CMake target names to discover the tools.
set(CLANG_TOOL $<TARGET_FILE:clang>)
set(LINK_TOOL $<TARGET_FILE:llvm-link>)
set(OPT_TOOL $<TARGET_FILE:opt>)
libomptarget_say("Building AMDGCN device RTL. Using clang from in-tree build")
else()
libomptarget_say("Not building AMDGCN device RTL. No appropriate clang found")
return()
endif()
project(omptarget-amdgcn)
add_custom_target(omptarget-amdgcn ALL)
#optimization level
set(optimization_level 2)
# Activate RTL message dumps if requested by the user.
if(LIBOMPTARGET_NVPTX_DEBUG)
set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g)
endif()
get_filename_component(devicertl_base_directory
${CMAKE_CURRENT_SOURCE_DIR}
DIRECTORY)
set(cuda_sources
${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip
${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip
${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip
${devicertl_base_directory}/common/src/cancel.cu
${devicertl_base_directory}/common/src/critical.cu
${devicertl_base_directory}/common/src/data_sharing.cu
${devicertl_base_directory}/common/src/libcall.cu
${devicertl_base_directory}/common/src/loop.cu
${devicertl_base_directory}/common/src/omp_data.cu
${devicertl_base_directory}/common/src/omptarget.cu
${devicertl_base_directory}/common/src/parallel.cu
${devicertl_base_directory}/common/src/reduction.cu
${devicertl_base_directory}/common/src/support.cu
${devicertl_base_directory}/common/src/shuffle.cpp
${devicertl_base_directory}/common/src/sync.cu
${devicertl_base_directory}/common/src/task.cu)
set(h_files
${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h
${devicertl_base_directory}/common/debug.h
${devicertl_base_directory}/common/omptarget.h
${devicertl_base_directory}/common/omptargeti.h
${devicertl_base_directory}/common/state-queue.h
${devicertl_base_directory}/common/state-queuei.h
${devicertl_base_directory}/common/support.h)
# for both in-tree and out-of-tree build
if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR})
else()
set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY})
endif()
# create gfx bitcode libraries
set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx90a gfx1010 gfx1030 gfx1031)
if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
endif()
# Prepend -I to each list element
set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I")
macro(add_cuda_bc_library)
set(cu_cmd ${CLANG_TOOL}
-xc++
-c
-mllvm -openmp-opt-disable
-std=c++14
-ffreestanding
-target amdgcn-amd-amdhsa
-emit-llvm
-Xclang -aux-triple -Xclang ${aux_triple}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-D__AMDGCN__
-Xclang -target-cpu -Xclang ${mcpu}
-fvisibility=hidden
-Wno-unused-value
-nogpulib
-O${optimization_level}
${CUDA_DEBUG}
-I${CMAKE_CURRENT_SOURCE_DIR}/src
-I${devicertl_base_directory}/common/include
-I${devicertl_base_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN})
set(bc1_files)
foreach(file ${ARGN})
get_filename_component(fname ${file} NAME_WE)
set(bc1_filename ${fname}.${mcpu}.bc)
add_custom_command(
OUTPUT ${bc1_filename}
COMMAND ${cu_cmd} ${file} -o ${bc1_filename}
DEPENDS ${file} ${h_files})
list(APPEND bc1_files ${bc1_filename})
endforeach()
add_custom_command(
OUTPUT linkout.cuda.${mcpu}.bc
COMMAND ${LINK_TOOL} ${bc1_files} -o linkout.cuda.${mcpu}.bc
DEPENDS ${bc1_files})
list(APPEND bc_files linkout.cuda.${mcpu}.bc)
endmacro()
set(libname "omptarget-amdgcn")
set(toolchain_deps "")
if(TARGET llvm-link)
list(APPEND toolchain_deps llvm-link)
endif()
if(TARGET opt)
list(APPEND toolchain_deps opt)
endif()
foreach(mcpu ${mcpus})
set(bc_files)
add_cuda_bc_library(${cuda_sources})
set(bc_libname lib${libname}-${mcpu}.bc)
add_custom_command(
OUTPUT ${bc_libname}
COMMAND ${LINK_TOOL} ${bc_files} | ${OPT_TOOL} --always-inline -o ${OUTPUTDIR}/${bc_libname}
DEPENDS ${bc_files} ${toolchain_deps})
add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname})
install(FILES ${OUTPUTDIR}/${bc_libname}
DESTINATION "${OPENMP_INSTALL_LIBDIR}"
)
endforeach()

View File

@ -1,19 +0,0 @@
//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _AMDGCN_INTERFACE_H_
#define _AMDGCN_INTERFACE_H_
#include <stdint.h>
#define EXTERN extern "C"
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads();
#endif

View File

@ -1,34 +0,0 @@
//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock
// cannot be implemented - if one thread gets the lock, it can't continue on to
// the next instruction in order to do anything as the other threads are waiting
// to take the lock.
// These functions will be implemented to provide the documented semantics for
// a SIMD => wavefront mapping once that is implemented.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
static void warn() {
PRINT0(LD_ALL, "Locks are not supported in this thread mapping model");
}
void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
int __kmpc_impl_test_lock(omp_lock_t *lock) {
warn();
return 0;
}
#pragma omp end declare target

View File

@ -1,64 +0,0 @@
//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "target_impl.h"
// Partially derived fom hcc_detail/device_functions.h
// HW_ID Register bit structure
// WAVE_ID 3:0 Wave buffer slot number. 0-9.
// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
// PIPE_ID 7:6 Pipeline from which the wave was dispatched.
// CU_ID 11:8 Compute Unit the wave is assigned to.
// SH_ID 12 Shader Array (within an SE) the wave is assigned to.
// SE_ID 14:13 Shader Engine the wave is assigned to.
// TG_ID 19:16 Thread-group ID
// VM_ID 23:20 Virtual Memory ID
// QUEUE_ID 26:24 Queue from which this wave was dispatched.
// STATE_ID 29:27 State ID (graphics only, not compute).
// ME_ID 31:30 Micro-engine ID.
enum {
HW_ID = 4, // specify that the hardware register to read is HW_ID
HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits
HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register
HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits
HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register
};
// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit
// immediate and returns a 32 bit value.
// The encoding of the immediate parameter is:
// ID 5:0 Which register to read from
// OFFSET 10:6 Range: 0..31
// WIDTH 15:11 Range: 1..32
// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width)
// where hwreg forms a 16 bit immediate encoded by the assembler thus:
// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11);
// }
#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11))
// Note: The results can be changed by a context switch
// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper
// bound on how many compute units are available. Some values in this
// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs.
EXTERN uint32_t __kmpc_impl_smid() {
uint32_t cu_id = __builtin_amdgcn_s_getreg(
ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
uint32_t se_id = __builtin_amdgcn_s_getreg(
ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
#pragma omp end declare target

View File

@ -1,83 +0,0 @@
//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Declarations and definitions of target specific functions and constants
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
#define OMPTARGET_AMDGCN_TARGET_IMPL_H
#ifndef __AMDGCN__
#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
#endif
#include "amdgcn_interface.h"
#include <stddef.h>
#include <stdint.h>
// subset of inttypes.h
#define PRId64 "ld"
#define PRIu64 "lu"
typedef uint64_t __kmpc_impl_lanemask_t;
#define INLINE inline
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
#define PLUGIN_ACCESSIBLE \
__attribute__((used)) /* Don't discard values the plugin reads */ \
__attribute__((weak)) /* We may have multiple definitions */ \
__attribute__((retain)) /* Also needed to keep values alive */ \
__attribute__((visibility("protected"))) /* Access via SHT_HASH */ \
__attribute__((section(".data"))) /* Not .bss, can write before load */
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
INLINE constexpr const llvm::omp::GV &getGridValue() {
return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
}
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#define OMP_STATE_COUNT 32
#define MAX_SM 64
#define OMP_ACTIVE_PARALLEL_LEVEL 128
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {
__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
};
// The return code of printf is not checked in the call sites in this library.
// A call to a function named printf currently hits some special case handling
// for opencl, which translates to calls that do not presently exist for openmp
// Therefore, for now, stub out printf while building this library.
#define printf(...)
#endif

View File

@ -1,226 +0,0 @@
//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
#include "target_interface.h"
// Implementations initially derived from hcc
// Initialized with a 64-bit mask with bits set in positions less than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
uint32_t lane = GetLaneId();
int64_t ballot = __kmpc_impl_activemask();
uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
return mask & ballot;
}
// Initialized with a 64-bit mask with bits set in positions greater than the
// thread's lane number in the warp
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
uint32_t lane = GetLaneId();
if (lane == (WARPSIZE - 1))
return 0;
uint64_t ballot = __kmpc_impl_activemask();
uint64_t mask = (~((uint64_t)0)) << (lane + 1);
return mask & ballot;
}
EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); }
EXTERN double __kmpc_impl_get_wtime() {
// The intrinsics for measuring time have undocumented frequency
// This will probably need to be found by measurement on a number of
// architectures. Until then, return 0, which is very inaccurate as a
// timer but resolves the undefined symbol at link time.
return 0;
}
// Warp vote function
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
return __builtin_amdgcn_read_exec();
}
static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
__atomic_thread_fence(__ATOMIC_ACQUIRE);
uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;
// Partial barrier implementation for amdgcn.
// Uses two 16 bit unsigned counters. One for the number of waves to have
// reached the barrier, and one to count how many times the barrier has been
// passed. These are packed in a single atomically accessed 32 bit integer.
// Low bits for the number of waves, assumed zero before this call.
// High bits to count the number of times the barrier has been passed.
// precondition: num_waves != 0;
// invariant: num_waves * WARPSIZE == num_threads;
// precondition: num_waves < 0xffffu;
// Increment the low 16 bits once, using the lowest active thread.
uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
bool isLowest = GetLaneId() == lowestActiveThread;
if (isLowest) {
uint32_t load = __atomic_fetch_add(barrier_state, 1,
__ATOMIC_RELAXED); // commutative
// Record the number of times the barrier has been passed
uint32_t generation = load & 0xffff0000u;
if ((load & 0x0000ffffu) == (num_waves - 1)) {
// Reached num_waves in low bits so this is the last wave.
// Set low bits to zero and increment high bits
load += 0x00010000u; // wrap is safe
load &= 0xffff0000u; // because bits zeroed second
// Reset the wave counter and release the waiting waves
__atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
} else {
// more waves still to go, spin until generation counter changes
do {
__builtin_amdgcn_s_sleep(0);
load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);
}
}
__atomic_thread_fence(__ATOMIC_RELEASE);
}
uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)
EXTERN void __kmpc_impl_target_init() {
// Don't have global ctors, and shared memory is not zero init
__atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
}
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
}
namespace {
uint32_t get_grid_dim(uint32_t n, uint16_t d) {
uint32_t q = n / d;
return q + (n > q * d);
}
uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
uint16_t group_size) {
uint32_t r = grid_size - group_id * group_size;
return (r < group_size) ? r : group_size;
}
} // namespace
EXTERN int __kmpc_get_hardware_num_blocks() {
return get_grid_dim(__builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());
}
EXTERN int __kmpc_get_hardware_num_threads_in_block() {
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
__builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());
}
EXTERN unsigned __kmpc_get_warp_size() {
return WARPSIZE;
}
EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
EXTERN unsigned GetLaneId() {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
return __kmpc_get_hardware_num_threads_in_block();
}
// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
}
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
uint32_t R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
(void)__atomic_compare_exchange(Address, &Compare, &Val, false,
__ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
return Compare;
}
unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
unsigned long long Val) {
unsigned long long R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
unsigned long long Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
// Stub implementations
// Weak to allow overriding by local versions while comparing different
// potential implementations
__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
return nullptr;
}
__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}
EXTERN
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
return -1;
}
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
}
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
return (((uint64_t)hi) << 32) | (uint64_t)lo;
}
EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}
EXTERN void __kmpc_impl_threadfence() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}
EXTERN void __kmpc_impl_threadfence_block() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
EXTERN void __kmpc_impl_threadfence_system() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// Calls to the AMDGCN layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); }
EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
#pragma omp end declare target

View File

@ -1,44 +0,0 @@
//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Macros for allocating variables in different address spaces.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_ALLOCATOR_H
#define OMPTARGET_ALLOCATOR_H
#if _OPENMP
// Follows the pattern in interface.h
// Clang sema checks this type carefully, needs to closely match that from omp.h
typedef enum omp_allocator_handle_t {
omp_null_allocator = 0,
omp_default_mem_alloc = 1,
omp_large_cap_mem_alloc = 2,
omp_const_mem_alloc = 3,
omp_high_bw_mem_alloc = 4,
omp_low_lat_mem_alloc = 5,
omp_cgroup_mem_alloc = 6,
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
} omp_allocator_handle_t;
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
#define SHARED(NAME) \
NAME [[clang::loader_uninitialized]]; \
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
#define EXTERN_SHARED(NAME) \
NAME; \
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
#endif
#endif // OMPTARGET_ALLOCATOR_H

View File

@ -1,293 +0,0 @@
//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains debug macros to be used in the application.
//
// Usage guide
//
// PRINT0(flag, str) : if debug flag is on, print (no arguments)
// PRINT(flag, str, args) : if debug flag is on, print (arguments)
// DON(flag) : return true if debug flag is on
//
// ASSERT(flag, cond, str, args): if test flag is on, test the condition
// if the condition is false, print str+args
// and assert.
// CAUTION: cond may be evaluate twice
// AON(flag) : return true if test flag is on
//
// WARNING(flag, str, args) : if warning flag is on, print the warning
// WON(flag) : return true if warning flag is on
//
//===----------------------------------------------------------------------===//
#ifndef _OMPTARGET_NVPTX_DEBUG_H_
#define _OMPTARGET_NVPTX_DEBUG_H_
#include "target_interface.h"
////////////////////////////////////////////////////////////////////////////////
// set desired level of debugging
////////////////////////////////////////////////////////////////////////////////
#define LD_SET_NONE 0ULL /* none */
#define LD_SET_ALL -1ULL /* all */
// pos 1
#define LD_SET_LOOP 0x1ULL /* basic loop */
#define LD_SET_LOOPD 0x2ULL /* basic loop */
#define LD_SET_PAR 0x4ULL /* basic parallel */
#define LD_SET_PARD 0x8ULL /* basic parallel */
// pos 2
#define LD_SET_SYNC 0x10ULL /* sync info */
#define LD_SET_SYNCD 0x20ULL /* sync info */
#define LD_SET_WAIT 0x40ULL /* state when waiting */
#define LD_SET_TASK 0x80ULL /* print task info (high level) */
// pos 3
#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */
#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */
#define LD_SET_ENV 0x400ULL /* env info */
#define LD_SET_CANCEL 0x800ULL /* print cancel info */
// pos 4
#define LD_SET_MEM 0x1000ULL /* malloc / free */
////////////////////////////////////////////////////////////////////////////////
// set the desired flags to print selected output.
// these are some examples of possible definitions that can be used for
// debugging.
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
// on cuda buffer
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)
#ifndef OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
#elif OMPTARGET_NVPTX_DEBUG
#warning debug is used, not good for measurements
#endif
////////////////////////////////////////////////////////////////////////////////
// set desired level of asserts
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// available flags
#define LT_SET_NONE 0x0 /* unsafe */
#define LT_SET_SAFETY \
0x1 /* check malloc type of stuff, input at creation, cheap */
#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */
////////////////////////////////////////////////////////////////////////////////
// set the desired flags
#ifndef OMPTARGET_NVPTX_TEST
#if OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
#else
#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
#endif
#endif
////////////////////////////////////////////////////////////////////////////////
// set desired level of warnings
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// available flags
#define LW_SET_ALL -1
#define LW_SET_NONE 0x0
#define LW_SET_ENV 0x1
#define LW_SET_INPUT 0x2
#define LW_SET_FUSSY 0x4
////////////////////////////////////////////////////////////////////////////////
// set the desired flags
#if OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
#else
#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
#endif
////////////////////////////////////////////////////////////////////////////////
// implementation for debug
////////////////////////////////////////////////////////////////////////////////
#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
#include "common/support.h"
template <typename... Arguments>
NOINLINE static void log(const char *fmt, Arguments... parameters) {
printf(fmt, (int)GetBlockIdInKernel(),
(int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
(int)GetLaneId(), parameters...);
}
#endif
#if OMPTARGET_NVPTX_TEST
template <typename... Arguments>
NOINLINE static void check(bool cond, const char *fmt,
Arguments... parameters) {
if (!cond) {
printf(fmt, (int)GetBlockIdInKernel(),
(int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
(int)GetLaneId(), parameters...);
__builtin_trap();
}
}
NOINLINE static void check(bool cond) {
if (!cond)
__builtin_trap();
}
#endif
// set flags that are tested (inclusion properties)
#define LD_ALL (LD_SET_ALL)
#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD)
#define LD_LOOPD (LD_SET_LOOPD)
#define LD_PAR (LD_SET_PAR | LD_SET_PARD)
#define LD_PARD (LD_SET_PARD)
// pos 2
#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD)
#define LD_SYNCD (LD_SET_SYNCD)
#define LD_WAIT (LD_SET_WAIT)
#define LD_TASK (LD_SET_TASK)
// pos 3
#define LD_IO (LD_SET_IO | LD_SET_IOD)
#define LD_IOD (LD_SET_IOD)
#define LD_ENV (LD_SET_ENV)
#define LD_CANCEL (LD_SET_CANCEL)
// pos 3
#define LD_MEM (LD_SET_MEM)
// implement
#if OMPTARGET_NVPTX_DEBUG
#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag))
#define PRINT0(_flag, _str) \
{ \
if (omptarget_device_environment.debug_level && DON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d>: " _str); \
} \
}
#define PRINT(_flag, _str, _args...) \
{ \
if (omptarget_device_environment.debug_level && DON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d>: " _str, _args); \
} \
}
#else
#define DON(_flag) (0)
#define PRINT0(flag, str)
#define PRINT(flag, str, _args...)
#endif
// for printing without worrying about precision, pointers...
#define P64(_x) ((unsigned long long)(_x))
////////////////////////////////////////////////////////////////////////////////
// early defs for test
////////////////////////////////////////////////////////////////////////////////
#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY)
#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY)
#define LT_FUSSY (LT_SET_FUSSY)
#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY
#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
#define ASSERT0(_flag, _cond, _str) \
{ \
if (TON(_flag)) { \
check(_cond); \
} \
}
#define ASSERT(_flag, _cond, _str, _args...) \
{ \
if (TON(_flag)) { \
check(_cond); \
} \
}
#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT
#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
#define ASSERT0(_flag, _cond, _str) \
{ \
if (TON(_flag)) { \
check((_cond), "<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n"); \
} \
}
#define ASSERT(_flag, _cond, _str, _args...) \
{ \
if (TON(_flag)) { \
check((_cond), "<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", \
_args); \
} \
}
#else
#define TON(_flag) (0)
#define ASSERT0(_flag, _cond, _str)
#define ASSERT(_flag, _cond, _str, _args...)
#endif
////////////////////////////////////////////////////////////////////////////////
// early defs for warning
#define LW_ALL (LW_SET_ALL)
#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV)
#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT)
#define LW_FUSSY (LW_SET_FUSSY)
#if OMPTARGET_NVPTX_WARNING
#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag))
#define WARNING0(_flag, _str) \
{ \
if (WON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str); \
} \
}
#define WARNING(_flag, _str, _args...) \
{ \
if (WON(_flag)) { \
log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, _args); \
} \
}
#else
#define WON(_flag) (0)
#define WARNING0(_flag, _str)
#define WARNING(_flag, _str, _args...)
#endif
#endif

View File

@ -1,405 +0,0 @@
case 0:
((void (*)(kmp_int32 *, kmp_int32 *
))fn)(&global_tid, &bound_tid
);
break;
case 1:
((void (*)(kmp_int32 *, kmp_int32 *
, void *))fn)(&global_tid, &bound_tid
, args[0]);
break;
case 2:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1]);
break;
case 3:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2]);
break;
case 4:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
);
break;
case 5:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4]);
break;
case 6:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5]);
break;
case 7:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6]);
break;
case 8:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
);
break;
case 9:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8]);
break;
case 10:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9]);
break;
case 11:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10]);
break;
case 12:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
);
break;
case 13:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12]);
break;
case 14:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13]);
break;
case 15:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14]);
break;
case 16:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
);
break;
case 17:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16]);
break;
case 18:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17]);
break;
case 19:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18]);
break;
case 20:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
);
break;
case 21:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20]);
break;
case 22:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21]);
break;
case 23:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22]);
break;
case 24:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
);
break;
case 25:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24]);
break;
case 26:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25]);
break;
case 27:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26]);
break;
case 28:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
);
break;
case 29:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28]);
break;
case 30:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28], args[29]);
break;
case 31:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28], args[29], args[30]);
break;
case 32:
((void (*)(kmp_int32 *, kmp_int32 *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
, void *, void *, void *, void *
))fn)(&global_tid, &bound_tid
, args[0], args[1], args[2], args[3]
, args[4], args[5], args[6], args[7]
, args[8], args[9], args[10], args[11]
, args[12], args[13], args[14], args[15]
, args[16], args[17], args[18], args[19]
, args[20], args[21], args[22], args[23]
, args[24], args[25], args[26], args[27]
, args[28], args[29], args[30], args[31]
);
break;
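// Descriptive note (added for clarity, not in the original source): every case
// above casts `fn` to the fixed-arity signature that matches the number of
// captured arguments -- two kmp_int32 * thread-id pointers followed by one
// void * per argument -- and forwards args[0..N-1] explicitly, so the device
// runtime never issues a variadic call; one case exists for each supported
// argument count up to 32.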

View File

@ -1,94 +0,0 @@
//===-- target.h ---------- OpenMP device runtime target implementation ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Target region interfaces are deliberately simple interfaces that allow
// middle-end (=LLVM) passes to analyze and transform the code. Achieving good
// performance may require running the associated passes. However,
// implementations of this interface shall always be correct and as close to
// the code the user expects as possible.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
#define LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
#include <stdint.h>
extern "C" {
/// Forward declaration of the source location identifier "ident".
typedef struct ident ident_t;
/// The target region _kernel_ interface for GPUs
///
/// This deliberately simple interface provides the middle-end (=LLVM) with
/// easier means to reason about the semantics of the code and transform it as
/// well. The runtime calls are therefore also designed to carry sufficient
/// information necessary for optimizations.
///
///
/// Intended usage:
///
/// \code
/// void kernel(...) {
/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
/// /* UseGenericStateMachine */ true,
/// /* RequiresFullRuntime */ ... );
/// if (ThreadKind == -1) {
/// // User defined kernel code.
/// }
/// __kmpc_target_deinit(...);
/// }
/// \endcode
///
/// Which can be transformed to:
///
/// \code
/// void kernel(...) {
/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
/// /* UseGenericStateMachine */ false,
/// /* RequiresFullRuntime */ ... );
/// if (ThreadKind == -1) {
/// // User defined kernel code.
/// } else {
/// assume(ThreadKind == ThreadId);
/// // Custom, kernel-specific state machine code.
/// }
/// __kmpc_target_deinit(...);
/// }
/// \endcode
///
///
///{
/// Initialization
///
/// Must be called by all threads.
///
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
bool UseGenericStateMachine,
bool RequiresFullRuntime);
/// De-Initialization
///
/// Must be called by the main thread in generic mode, can be called by all
/// threads. Must be called by all threads in SPMD mode.
///
/// In non-SPMD, this function releases the workers trapped in a state machine
/// and also any memory dynamically allocated by the runtime.
///
/// \param Ident Source location identification, can be NULL.
///
void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime);
///}
}
#endif

View File

@ -1,102 +0,0 @@
//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Shuffle function implementations for all supported targets.
//
// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
//
//===----------------------------------------------------------------------===//
#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H
#include <stdint.h>
#pragma omp declare target
/// External shuffle API
///
///{
extern "C" {
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
}
///}
/// Forward declarations
///
///{
extern "C" {
unsigned GetLaneId();
unsigned __kmpc_get_warp_size();
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
}
///}
/// Fallback implementations of the shuffle sync idiom.
/// Unavailable at present (would error at link time if used).
///
///{
int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
int32_t Width);
///}
/// AMDGCN implementations of the shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
int32_t SrcLane) {
int Width = __kmpc_get_warp_size();
int Self = GetLaneId();
int Index = SrcLane + (Self & ~(Width - 1));
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
uint32_t LaneDelta, int32_t Width) {
int Self = GetLaneId();
int Index = Self + LaneDelta;
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
#pragma omp end declare variant
///}
/// NVPTX implementations of the shuffle and shuffle sync idiom.
///
///{
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
int32_t SrcLane) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
}
inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
uint32_t Delta, int32_t Width) {
int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
#pragma omp end declare variant
///}
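/// Illustrative sketch (added for clarity, not part of the original header):
/// a full-warp sum reduction built on the external shuffle API declared above.
/// It assumes the warp size is a power of two and that every lane of the warp
/// calls the function; the function name is hypothetical.
///
///{
inline int32_t __sketch_warp_reduce_sum(int32_t Val) {
  int16_t Width = (int16_t)__kmpc_get_warp_size();
  // Halve the number of live partial sums each step: pull the value held
  // `Delta` lanes to the right and accumulate it locally.
  for (int16_t Delta = Width / 2; Delta > 0; Delta /= 2)
    Val += __kmpc_shuffle_int32(Val, Delta, Width);
  return Val; // lane 0 now holds the sum over all lanes of the warp
}
///}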
#pragma omp end declare target
#endif

View File

@ -1,282 +0,0 @@
//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_H
#define OMPTARGET_H
#include "common/allocator.h"
#include "common/debug.h" // debug
#include "common/state-queue.h"
#include "common/support.h"
#include "interface.h" // interfaces with omp, compiler, and user
#include "target_impl.h"
#define OMPTARGET_NVPTX_VERSION 1.1
// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1
// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2
#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1
// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
struct __kmpc_data_sharing_slot {
__kmpc_data_sharing_slot *Next;
__kmpc_data_sharing_slot *Prev;
void *PrevSlotStackPtr;
void *DataEnd;
char Data[DS_Worker_Warp_Slot_Size];
};
////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state
class omptarget_nvptx_TaskDescr {
public:
// methods for flags
INLINE omp_sched_t GetRuntimeSched() const;
INLINE void SetRuntimeSched(omp_sched_t sched);
INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
INLINE int InL2OrHigherParallelRegion() const {
return items.flags & TaskDescr_InParL2P;
}
INLINE int IsParallelConstruct() const {
return items.flags & TaskDescr_IsParConstr;
}
INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
// methods for other fields
INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
prev = taskDescr;
}
// init & copy
INLINE void InitLevelZeroTaskDescr();
INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
uint16_t tid, uint16_t tnum);
INLINE void SaveLoopData();
INLINE void RestoreLoopData() const;
private:
// bits for flags: (6 used, 2 free)
//   3 bits (SchedMask) for the runtime schedule
//   1 bit (InPar) set if this thread has encountered one or more parallel
//     regions
//   1 bit (IsParConstr) set if the ICV is for a parallel region (false =
//     explicit task)
//   1 bit (InParL2P) set if this thread has encountered an L2 or higher
//     parallel region
static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
static const uint8_t TaskDescr_InPar = 0x10;
static const uint8_t TaskDescr_IsParConstr = 0x20;
static const uint8_t TaskDescr_InParL2P = 0x40;
struct SavedLoopDescr_items {
int64_t loopUpperBound;
int64_t nextLowerBound;
int64_t chunk;
int64_t stride;
kmp_sched_t schedule;
} loopData;
struct TaskDescr_items {
uint8_t flags; // 6 bit used (see flag above)
uint8_t unused;
uint16_t threadId; // thread id
uint64_t runtimeChunkSize; // runtime chunk size
} items;
omptarget_nvptx_TaskDescr *prev;
};
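// Illustrative worked example (added for clarity, not in the original header):
// with the flag layout above, the implicit task of an active L1 parallel
// region that was switched to dynamic scheduling (omp_sched_dynamic == 2,
// stored encoded as 2 - 1 == 1 in the low three bits) carries
//   items.flags == 0x1 | TaskDescr_InPar | TaskDescr_IsParConstr == 0x31,
// while CopyForExplicitTask() clears TaskDescr_IsParConstr, yielding 0x11.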
// build on kmp
typedef struct omptarget_nvptx_ExplicitTaskDescr {
omptarget_nvptx_TaskDescr
taskDescr; // omptarget_nvptx task description (must be first)
kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)
class omptarget_nvptx_WorkDescr {
public:
// access to data
INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
private:
omptarget_nvptx_TaskDescr masterTaskICV;
};
////////////////////////////////////////////////////////////////////////////////
class omptarget_nvptx_TeamDescr {
public:
// access to data
INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
return &levelZeroTaskDescr;
}
INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
return workDescrForActiveParallel;
}
// init
INLINE void InitTeamDescr();
INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
worker_rootS[wid].DataEnd =
&worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
// We currently do not have a next slot.
worker_rootS[wid].Next = 0;
worker_rootS[wid].Prev = 0;
worker_rootS[wid].PrevSlotStackPtr = 0;
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
}
private:
omptarget_nvptx_TaskDescr
levelZeroTaskDescr; // icv for team master initial thread
omptarget_nvptx_WorkDescr
workDescrForActiveParallel; // one, ONLY for the active par
ALIGN(16)
__kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number];
};
////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// multiple concurrent kernels are not supported at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
// task
INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
return &levelOneTaskDescr[tid];
}
INLINE void SetTopLevelTaskDescr(int tid,
omptarget_nvptx_TaskDescr *taskICV) {
topTaskDescr[tid] = taskICV;
}
INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
// schedule (for dispatch)
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
INLINE int64_t &Stride(int tid) { return stride[tid]; }
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
INLINE void InitThreadPrivateContext(int tid);
INLINE uint64_t &Cnt() { return cnt; }
private:
// team context for this team
omptarget_nvptx_TeamDescr teamContext;
// task ICV for implicit threads in the only parallel region
omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
// pointer where to find the current task ICV (top of the stack)
omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
// schedule (for dispatch)
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
int64_t chunk[MAX_THREADS_PER_TEAM];
int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
// state for dispatch with dyn/guided OR static (never use both at a time)
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
int64_t stride[MAX_THREADS_PER_TEAM];
uint64_t cnt;
};
/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
struct MemDataTy {
volatile unsigned keys[OMP_STATE_COUNT];
} MemData[MAX_SM] ALIGN(128);
INLINE static uint32_t hash(unsigned key) {
return key & (OMP_STATE_COUNT - 1);
}
public:
INLINE void Release();
INLINE const void *Acquire(const void *buf, size_t size);
};
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////
extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
extern uint32_t EXTERN_SHARED(usedMemIdx);
extern uint32_t EXTERN_SHARED(usedSlotIdx);
#if _OPENMP
extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
#else
extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
#endif
extern uint16_t EXTERN_SHARED(threadLimit);
extern uint16_t EXTERN_SHARED(threadsInTeam);
extern uint16_t EXTERN_SHARED(nThreads);
extern omptarget_nvptx_ThreadPrivateContext *
EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
extern int8_t EXTERN_SHARED(execution_param);
extern void *EXTERN_SHARED(ReductionScratchpadPtr);
////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////
typedef void *omptarget_nvptx_WorkFn;
extern omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
#include "common/omptargeti.h"
#endif

View File

@ -1,223 +0,0 @@
//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//
////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////
INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
// sched starts from 1..4; encode it as 0..3; so add 1 here
uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
return (omp_sched_t)rc;
}
INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
// sched starts from 1..4; encode it as 0..3; so sub 1 here
uint8_t val = ((uint8_t)sched) - 1;
// clear current sched
items.flags &= ~TaskDescr_SchedMask;
// set new sched
items.flags |= val;
}
INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
// slow method
// flag:
// default sched is static,
// dyn is off (unused now anyway, but may need to sample from host ?)
// not in parallel
items.flags = 0;
items.threadId = 0; // is master
items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
}
// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
omptarget_nvptx_TaskDescr *parentTaskDescr) {
// slow method
// flag:
// default sched is static,
// dyn is off (unused now anyway, but may need to sample from host ?)
// in L1 parallel
items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
items.threadId =
__kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
// called for 1st level)
items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
prev = parentTaskDescr;
}
INLINE void omptarget_nvptx_TaskDescr::CopyData(
omptarget_nvptx_TaskDescr *sourceTaskDescr) {
items = sourceTaskDescr->items;
}
INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
CopyData(sourceTaskDescr);
prev = sourceTaskDescr->prev;
}
INLINE void omptarget_nvptx_TaskDescr::CopyParent(
omptarget_nvptx_TaskDescr *parentTaskDescr) {
CopyData(parentTaskDescr);
prev = parentTaskDescr;
}
INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
omptarget_nvptx_TaskDescr *parentTaskDescr) {
CopyParent(parentTaskDescr);
items.flags = items.flags & ~TaskDescr_IsParConstr;
ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
omptarget_nvptx_TaskDescr *masterTaskDescr) {
CopyParent(masterTaskDescr);
// overwrite specific items;
items.flags |=
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
}
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
omptarget_nvptx_TaskDescr *workTaskDescr) {
Copy(workTaskDescr);
//
// overwrite specific items;
//
// The threadID should be __kmpc_get_hardware_thread_id_in_block() %
// GetMasterThreadID(). This is so that the serial master (first lane in the
// master warp) gets a threadId of 0. However, we know that this function is
// always called in a parallel region where only workers are active. The
// serial master thread never enters this region. When a parallel region is
// executed serially, the threadId is set to 0 elsewhere and the
// kmpc_serialized_* functions are called, which never activate this region.
items.threadId =
__kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
// called for 1st level)
}
INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
CopyParent(parentTaskDescr);
items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
items.threadId = tid;
}
INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
loopData.loopUpperBound =
omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
loopData.nextLowerBound =
omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
loopData.schedule =
omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
loopData.stride =
omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
}
INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
loopData.loopUpperBound;
omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
loopData.nextLowerBound;
omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
loopData.stride;
omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
loopData.schedule;
}
////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////
INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
ASSERT0(
LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
"Getting top level, tid is larger than allocated data structure size");
return topTaskDescr[tid];
}
INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
// levelOneTaskDescr is init when starting the parallel region
// top task descr is NULL (team master version will be fixed separately)
topTaskDescr[tid] = NULL;
// the following don't need to be init here; they are init when using dyn
// sched
// current_Event, events_Number, chunk, num_Iterations, schedule
}
////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
levelZeroTaskDescr.InitLevelZeroTaskDescr();
}
////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////
// Utility routines for CUDA threads
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
return omptarget_nvptx_threadPrivateContext->TeamContext();
}
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
return currTeamDescr.WorkDescr();
}
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}
////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////
INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
"SlotIdx is too big or uninitialized.");
ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
"MemIdx is too big or uninitialized.");
MemDataTy &MD = MemData[usedSlotIdx];
__kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
}
INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
size_t size) {
ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
"SlotIdx is too big or uninitialized.");
const unsigned sm = usedSlotIdx;
MemDataTy &MD = MemData[sm];
unsigned i = hash(GetBlockIdInKernel());
while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
i = hash(i + 1);
}
usedSlotIdx = sm;
usedMemIdx = i;
return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}
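// Illustrative note (added for clarity, not in the original source): hash()
// relies on OMP_STATE_COUNT being a power of two so that
// `key & (OMP_STATE_COUNT - 1)` is a cheap modulo. Acquire() probes linearly
// from hash(GetBlockIdInKernel()) until the atomic compare-and-swap flips a
// key from 0 to 1, and Release() returns the slot by storing 0 again. For
// example, with OMP_STATE_COUNT == 32, the block with id 37 starts probing at
// slot 37 & 31 == 5.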

View File

@ -1,31 +0,0 @@
//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to be used in the implementation of OpenMP cancel.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "interface.h"
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal) {
PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
// disabled
return 0;
}
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal) {
PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
// disabled
return 0;
}
#pragma omp end declare target
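// Illustrative note (added for clarity, not in the original source): because
// both entry points above unconditionally return 0, device code such as
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < N; ++i) {
//     if (error_detected(i)) {   // hypothetical user predicate
//       #pragma omp cancel for
//     }
//   }
//
// compiles, but the cancellation request is silently ignored on the device:
// __kmpc_cancel() reports "not cancelled" and the loop runs to completion.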

View File

@ -1,31 +0,0 @@
//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of critical with KMPC interface
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "interface.h"
EXTERN
void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *lck) {
PRINT0(LD_IO, "call to kmpc_critical()\n");
omp_set_lock((omp_lock_t *)lck);
}
EXTERN
void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *lck) {
PRINT0(LD_IO, "call to kmpc_end_critical()\n");
omp_unset_lock((omp_lock_t *)lck);
}
#pragma omp end declare target
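// Illustrative note (added for clarity, not in the original source): a device
// critical section such as
//
//   #pragma omp critical(update)
//   { shared_counter += 1; }   // hypothetical user code
//
// is lowered by the compiler to a __kmpc_critical(...) / __kmpc_end_critical(...)
// pair around the block, so with the implementation above every named critical
// simply serializes through the omp lock stored in its kmp_CriticalName.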

View File

@ -1,194 +0,0 @@
//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of data sharing environments
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
static constexpr unsigned MinBytes = 8;
static constexpr unsigned Alignment = 8;
/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; }
EXTERN void *llvm_omp_get_dynamic_shared() {
return __kmpc_get_dynamic_shared();
}
template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
struct alignas(32) ThreadStackTy {
static constexpr unsigned BytesPerThread = BPerThread;
static constexpr unsigned NumThreads = NThreads;
static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
unsigned char Data[NumThreads][BytesPerThread];
unsigned char Usage[NumThreads];
};
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
MAX_THREADS_PER_TEAM / 4>
WorkerSharedStack;
#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
int TID = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(TID)) {
// Main thread alone, use shared memory if space is available.
if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
MainSharedStack.Usage[0] += AlignedBytes;
return Ptr;
}
} else if (TID < WorkerSharedStack.NumThreads) {
if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
WorkerSharedStack.Usage[TID] += AlignedBytes;
return Ptr;
}
}
// Fallback to malloc
return SafeMalloc(Bytes, "AllocGlobalFallback");
}
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
int TID = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(TID)) {
if (Ptr >= &MainSharedStack.Data[0][0] &&
Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
MainSharedStack.Usage[0] -= AlignedBytes;
return;
}
} else if (TID < WorkerSharedStack.NumThreads) {
if (Ptr >= &WorkerSharedStack.Data[0][0] &&
Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
int TID = __kmpc_get_hardware_thread_id_in_block();
WorkerSharedStack.Usage[TID] -= AlignedBytes;
return;
}
}
SafeFree(Ptr, "FreeGlobalFallback");
}
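// Illustrative sketch (added for clarity, not in the original source): how
// compiler-generated code typically pairs the two entry points above when a
// local variable escapes to the threads of a parallel region. The variable
// name and size are hypothetical.
//
//   void *FramePtr = __kmpc_alloc_shared(sizeof(int)); // globalized `int x`
//   int *X = static_cast<int *>(FramePtr);
//   // ... share &X with the parallel region, use *X ...
//   __kmpc_free_shared(FramePtr, sizeof(int)); // freed in reverse allocation
//                                              // order (stack discipline)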
EXTERN void __kmpc_data_sharing_init_stack() {
for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
MainSharedStack.Usage[i] = 0;
for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
WorkerSharedStack.Usage[i] = 0;
}
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64
[[clang::loader_uninitialized]] static void
*SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
allocator(omp_pteam_mem_alloc)
// Begin a data sharing context. Maintain a list of references to shared
// variables. This list of references to shared variables will be passed
// to one or more threads.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
} else {
SharedMemVariableSharingSpacePtr =
(void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
}
*GlobalArgs = SharedMemVariableSharingSpacePtr;
}
// End a data sharing context. There is no need to have a list of refs
// to shared variables because the context in which those variables were
// shared has now ended. This should clean-up the list of references only
// without affecting the actual global storage of the variables.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
}
// This function will return a list of references to global variables. This
// is how the workers will get a reference to the globalized variable. The
// members of this list will be passed to the outlined parallel function
// preserving the order.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
*GlobalArgs = SharedMemVariableSharingSpacePtr;
}
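// Illustrative sketch (added for clarity, not in the original source): the
// intended hand-off between the three entry points above in generic mode. The
// argument count and the variables `a` and `b` are hypothetical.
//
//   // Main thread, before waking the workers:
//   void **GlobalArgs;
//   __kmpc_begin_sharing_variables(&GlobalArgs, /* nArgs */ 2);
//   GlobalArgs[0] = &a; GlobalArgs[1] = &b;
//
//   // Each worker, inside the outlined parallel function:
//   void **Args;
//   __kmpc_get_shared_variables(&Args);   // Args[0] == &a, Args[1] == &b
//
//   // Main thread, after the parallel region has finished:
//   __kmpc_end_sharing_variables();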
// This function is used to init static memory manager. This manager is used to
// manage statically allocated global memory. This memory is allocated by the
// compiler and used to correctly implement globalization of the variables in
// target, teams and distribute regions.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared,
const void **frame) {
if (is_shared) {
*frame = buf;
return;
}
if (isSPMDExecutionMode) {
if (__kmpc_get_hardware_thread_id_in_block() == 0) {
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
}
__kmpc_impl_syncthreads();
return;
}
ASSERT0(LT_FUSSY,
__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
"Must be called only in the target master thread.");
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
__kmpc_impl_threadfence();
}
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
int16_t is_shared) {
if (is_shared)
return;
if (isSPMDExecutionMode) {
__kmpc_impl_syncthreads();
if (__kmpc_get_hardware_thread_id_in_block() == 0) {
omptarget_nvptx_simpleMemoryManager.Release();
}
return;
}
__kmpc_impl_threadfence();
ASSERT0(LT_FUSSY,
__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
"Must be called only in the target master thread.");
omptarget_nvptx_simpleMemoryManager.Release();
}
#pragma omp end declare target

View File

@ -1,359 +0,0 @@
//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the OpenMP runtime functions that can be
// invoked by the user in an OpenMP region
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
EXTERN double omp_get_wtick(void) {
double rc = __kmpc_impl_get_wtick();
PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc);
return rc;
}
EXTERN double omp_get_wtime(void) {
double rc = __kmpc_impl_get_wtime();
PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
return rc;
}
EXTERN void omp_set_num_threads(int num) {
// Ignore it for SPMD mode.
if (__kmpc_is_spmd_exec_mode())
return;
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
if (num <= 0) {
WARNING0(LW_INPUT, "expected positive num; ignore\n");
} else if (parallelLevel[GetWarpId()] == 0) {
nThreads = num;
}
}
EXTERN int omp_get_num_threads(void) {
int rc = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
return rc;
}
EXTERN int omp_get_max_threads(void) {
if (parallelLevel[GetWarpId()] > 0)
// We're already in parallel region.
return 1; // default is 1 thread avail
// Not currently in a parallel region, return what was set.
int rc = 1;
if (parallelLevel[GetWarpId()] == 0)
rc = nThreads;
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
return rc;
}
EXTERN int omp_get_thread_limit(void) {
if (__kmpc_is_spmd_exec_mode())
return __kmpc_get_hardware_num_threads_in_block();
int rc = threadLimit;
PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
return rc;
}
EXTERN int omp_get_thread_num() {
int rc = GetOmpThreadId();
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
return rc;
}
EXTERN int omp_get_num_procs(void) {
int rc = GetNumberOfProcsInDevice(__kmpc_is_spmd_exec_mode());
PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
return rc;
}
EXTERN int omp_in_parallel(void) {
int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
return rc;
}
EXTERN int omp_in_final(void) {
// Treat all tasks as final. The spec may expect the runtime to track more
// precisely whether a task was explicitly marked final by the user, but this
// is not explicitly specified; we act as if the runtime may actively decide
// to turn a non-final task into a final one.
int rc = 1;
PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_dynamic(int flag) {
PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag);
}
EXTERN int omp_get_dynamic(void) {
int rc = 0;
PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_nested(int flag) {
PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
flag);
}
EXTERN int omp_get_nested(void) {
int rc = 0;
PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_max_active_levels(int level) {
PRINT(LD_IO,
"call omp_set_max_active_levels(%d) is ignored (no nested support)\n",
level);
}
EXTERN int omp_get_max_active_levels(void) {
int rc = 1;
PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
return rc;
}
EXTERN int omp_get_level(void) {
int level = __kmpc_parallel_level();
PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
return level;
}
EXTERN int omp_get_active_level(void) {
int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
return level;
}
EXTERN int omp_get_ancestor_thread_num(int level) {
if (__kmpc_is_spmd_exec_mode())
return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0;
int rc = -1;
// If level is 0 or all parallel regions are not active - return 0.
unsigned parLevel = parallelLevel[GetWarpId()];
if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
int totLevel = omp_get_level();
if (level <= totLevel) {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
int steps = totLevel - level;
PRINT(LD_IO, "backtrack %d steps\n", steps);
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
if (DON(LD_IOD)) {
// print current state
omp_sched_t sched = currTaskDescr->GetRuntimeSched();
PRINT(LD_ALL,
"task descr %s %d: %s, in par %d, rt sched %d,"
" chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
"ancestor", steps,
(currTaskDescr->IsParallelConstruct() ? "par" : "task"),
(int)currTaskDescr->InParallelRegion(), (int)sched,
currTaskDescr->RuntimeChunkSize(),
(int)currTaskDescr->ThreadId(), (int)threadsInTeam,
(int)nThreads);
}
if (currTaskDescr->IsParallelConstruct()) {
// found the level
if (!steps) {
rc = currTaskDescr->ThreadId();
break;
}
steps--;
}
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
} while (currTaskDescr);
ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
}
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 0;
}
PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
rc)
return rc;
}
EXTERN int omp_get_team_size(int level) {
if (__kmpc_is_spmd_exec_mode())
return level == 1 ? __kmpc_get_hardware_num_threads_in_block() : 1;
int rc = -1;
unsigned parLevel = parallelLevel[GetWarpId()];
// If level is 0 or all parallel regions are not active - return 1.
if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
rc = threadsInTeam;
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 1;
}
PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
return rc;
}
EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode only with uninitialized runtime.");
*kind = omp_sched_static;
*modifier = 1;
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
*kind = currTaskDescr->GetRuntimeSched();
*modifier = currTaskDescr->RuntimeChunkSize();
}
PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
(int)*kind, *modifier);
}
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
modifier);
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode only with uninitialized runtime.");
return;
}
if (kind >= omp_sched_static && kind < omp_sched_auto) {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
currTaskDescr->SetRuntimeSched(kind);
currTaskDescr->RuntimeChunkSize() = modifier;
PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",
(int)currTaskDescr->GetRuntimeSched(),
currTaskDescr->RuntimeChunkSize());
}
}
EXTERN omp_proc_bind_t omp_get_proc_bind(void) {
PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n");
return omp_proc_bind_true;
}
EXTERN int omp_get_num_places(void) {
PRINT0(LD_IO, "call omp_get_num_places() returns 0\n");
return 0;
}
EXTERN int omp_get_place_num_procs(int place_num) {
PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n");
return 0;
}
EXTERN void omp_get_place_proc_ids(int place_num, int *ids) {
PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n");
}
EXTERN int omp_get_place_num(void) {
PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n");
return 0;
}
EXTERN int omp_get_partition_num_places(void) {
PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n");
return 0;
}
EXTERN void omp_get_partition_place_nums(int *place_nums) {
PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n");
}
EXTERN int omp_get_cancellation(void) {
int rc = 0;
PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
return rc;
}
EXTERN void omp_set_default_device(int deviceId) {
PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n");
}
EXTERN int omp_get_default_device(void) {
PRINT0(LD_IO,
"call omp_get_default_device() is undef on device, returns 0\n");
return 0;
}
EXTERN int omp_get_num_devices(void) {
PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
return 0;
}
EXTERN int omp_get_num_teams(void) {
int rc = GetNumberOfOmpTeams();
PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
return rc;
}
EXTERN int omp_get_team_num() {
int rc = GetOmpTeamId();
PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
return rc;
}
// Unspecified on the device.
EXTERN int omp_get_initial_device(void) {
PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
return 0;
}
// Unused for now.
EXTERN int omp_get_max_task_priority(void) {
PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n");
return 0;
}
////////////////////////////////////////////////////////////////////////////////
// locks
////////////////////////////////////////////////////////////////////////////////
EXTERN void omp_init_lock(omp_lock_t *lock) {
__kmpc_impl_init_lock(lock);
PRINT0(LD_IO, "call omp_init_lock()\n");
}
EXTERN void omp_destroy_lock(omp_lock_t *lock) {
__kmpc_impl_destroy_lock(lock);
PRINT0(LD_IO, "call omp_destroy_lock()\n");
}
EXTERN void omp_set_lock(omp_lock_t *lock) {
__kmpc_impl_set_lock(lock);
PRINT0(LD_IO, "call omp_set_lock()\n");
}
EXTERN void omp_unset_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
PRINT0(LD_IO, "call omp_unset_lock()\n");
}
EXTERN int omp_test_lock(omp_lock_t *lock) {
int rc = __kmpc_impl_test_lock(lock);
PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
return rc;
}
#pragma omp end declare target
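// Illustrative sketch (added for clarity, not in the original source): a
// target region exercising the lock entry points above. The variables are
// hypothetical; each omp_* call below resolves to the device implementations
// in this file.
//
//   int Counter = 0;
//   #pragma omp target map(tofrom : Counter)
//   {
//     omp_lock_t Lock;
//     omp_init_lock(&Lock);
//     #pragma omp parallel
//     {
//       omp_set_lock(&Lock);    // -> __kmpc_impl_set_lock
//       ++Counter;
//       omp_unset_lock(&Lock);  // -> __kmpc_impl_unset_lock
//     }
//     omp_destroy_lock(&Lock);
//   }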

View File

@ -1,813 +0,0 @@
//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// template class that encapsulate all the helper functions
//
// T is loop iteration type (32 | 64) (unsigned | signed)
// ST is the signed version of T
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
public:
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling with chunk
// Generic implementation of OMP loop scheduling with static policy
/*! \brief Calculate the initial bounds and stride for a statically scheduled
 * loop.
 * @param[in] loc location in code of the call (not used here)
 * @param[in] global_tid global thread id
 * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
 * @param[in] plastiter pointer to the last-iteration flag
 * @param[in,out] plower pointer to the loop lower bound; on return it holds
 * the lower bound of the first chunk
 * @param[in,out] pupper pointer to the loop upper bound; on return it holds
 * the upper bound of the first chunk
 * @param[in,out] pstride pointer to the loop stride; on return it holds the
 * stride between two successive chunks executed by the same thread
 * @param[in] loop increment bump
 * @param[in] chunk chunk size
 */
// helper function for static chunk
INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
ST chunk, T entityId, T numberOfEntities) {
// each thread executes multiple chunks all of the same size, except
// the last one
// distance between two successive chunks
stride = numberOfEntities * chunk;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
// Say ub' is the beginning of the last chunk. Then whoever has a lower
// bound that differs from ub' by a multiple of the stride executes the
// last chunk.
T beginingLastChunk = inputUb - (inputUb % chunk);
last = ((beginingLastChunk - lb) % stride) == 0;
}
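// Illustrative worked example (added for clarity, not in the original
// source): for a loop with lb = 0, ub = 99, chunk = 10 and 4 entities,
// entity 1 gets lb = 10, ub = 19 and stride = 40, i.e. the chunks
// [10,19], [50,59], [90,99]. The beginning of the last chunk is
// 99 - (99 % 10) == 90, and since (90 - 10) % 40 == 0, entity 1 is the one
// flagged as `last`.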
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling without chunk
// helper function for static no chunk
INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
// No chunk size specified. Each thread or warp gets at most one
// chunk; chunks are all almost of equal size
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
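// Illustrative worked example (added for clarity, not in the original
// source): for lb = 0, ub = 99 and 8 entities, loopSize = 100,
// chunk = 100 / 8 = 12 and leftOver = 4, so entities 0-3 receive 13
// iterations each ([0,12], [13,25], [26,38], [39,51]) and entities 4-7
// receive 12 each, starting at lb = 4*12 + 4 = 52 for entity 4; stride is
// set to 100 so no entity picks up a second chunk.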
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
int32_t *plastiter, T *plower, T *pupper,
ST *pstride, ST chunk,
bool IsSPMDExecutionMode) {
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.
// Assume we are in teams region or that we use a single block
// per target region
ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
// All warps that are in excess of the maximum requested, do
// not execute the loop
PRINT(LD_LOOP,
"OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
"%d, num tids %d\n",
(int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
(int)numberOfActiveOMPThreads);
ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
"current thread is not needed here; error");
// copy
int lastiter = 0;
T lb = *plower;
T ub = *pupper;
ST stride = *pstride;
// init
switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
case kmp_sched_static_chunk: {
if (chunk > 0) {
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
} // note: if chunk <=0, use nochunk
case kmp_sched_static_balanced_chunk: {
if (chunk > 0) {
// round up to make sure the chunk is enough to cover all iterations
T tripCount = ub - lb + 1; // +1 because ub is inclusive
T span = (tripCount + numberOfActiveOMPThreads - 1) /
numberOfActiveOMPThreads;
// perform chunk adjustment
chunk = (span + chunk - 1) & ~(chunk - 1);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
T oldUb = ub;
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
if (ub > oldUb)
ub = oldUb;
break;
}
} // note: if chunk <=0, use nochunk
case kmp_sched_static_nochunk: {
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_static_chunk: {
if (chunk > 0) {
ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
GetNumberOfOmpTeams());
break;
} // note: if chunk <=0, use nochunk
}
case kmp_sched_distr_static_nochunk: {
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
GetNumberOfOmpTeams());
break;
}
case kmp_sched_distr_static_chunk_sched_static_chunkone: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
break;
}
default: {
ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
(int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
numberOfActiveOMPThreads);
break;
}
}
// copy back
*plastiter = lastiter;
*plower = lb;
*pupper = ub;
*pstride = stride;
PRINT(LD_LOOP,
"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
"%d\n",
(int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
(long long)(*plower), (long long)(*pupper), (long long)(*pstride),
(int)lastiter);
}
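// Worked example for the kmp_sched_static_balanced_chunk case above
// (illustration only; the bit-mask rounding is exact when chunk is a power
// of two): with tripCount = 100, 8 active threads and chunk = 4,
// span = (100 + 7) / 8 = 13 and chunk is adjusted to (13 + 3) & ~3 = 16.
// Thread 6 then covers [96, 111], clamped to [96, 99] and flagged as the
// last chunk, while thread 7 gets an empty range.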
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch Init
INLINE static int OrderedSchedule(kmp_sched_t schedule) {
return schedule >= kmp_sched_ordered_first &&
schedule <= kmp_sched_ordered_last;
}
INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st,
ST chunk) {
if (isRuntimeUninitialized()) {
// In SPMD mode no need to check parallelism level - dynamic scheduling
// may appear only in L2 parallel regions with lightweight runtime.
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected SPMD mode.");
return;
}
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum,
"current thread is not needed here; error");
/* Currently just ignore the monotonic and non-monotonic modifiers
* (the compiler isn't producing them yet anyway).
* When it is we'll want to look at them somewhere here and use that
* information to add to our schedule choice. We shouldn't need to pass
* them on, they merely affect which schedule we can legally choose for
* various dynamic cases. (In particular, whether or not a stealing scheme
* is legal).
*/
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
// Process schedule.
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
if (OrderedSchedule(schedule))
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
(long)tnum, (long long)tripCount, (int)schedule);
schedule = kmp_sched_static_chunk;
chunk = tripCount; // one thread gets the whole loop
} else if (schedule == kmp_sched_runtime) {
// process runtime
omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
chunk = currTaskDescr->RuntimeChunkSize();
switch (rtSched) {
case omp_sched_static: {
if (chunk > 0)
schedule = kmp_sched_static_chunk;
else
schedule = kmp_sched_static_nochunk;
break;
}
case omp_sched_auto: {
schedule = kmp_sched_static_chunk;
chunk = 1;
break;
}
case omp_sched_dynamic:
case omp_sched_guided: {
schedule = kmp_sched_dynamic;
break;
}
}
PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
(long long)chunk);
} else if (schedule == kmp_sched_auto) {
schedule = kmp_sched_static_chunk;
chunk = 1;
PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
(long long)chunk);
} else {
PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
(long long)chunk);
ASSERT(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"unknown schedule %d & chunk %lld\n", (int)schedule,
(long long)chunk);
}
// init schedules
if (schedule == kmp_sched_static_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
(int)tnum,
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
} else if (schedule == kmp_sched_static_balanced_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
// round up to make sure the chunk is enough to cover all iterations
T span = (tripCount + tnum - 1) / tnum;
// perform chunk adjustment
chunk = (span + chunk - 1) & ~(chunk - 1);
T oldUb = ub;
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
if (ub > oldUb)
ub = oldUb;
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
(int)tnum,
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
} else if (schedule == kmp_sched_static_nochunk) {
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
(int)tnum,
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
// save data
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
if (chunk < 1)
chunk = 1;
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
__kmpc_barrier(loc, threadId);
if (tid == 0) {
omptarget_nvptx_threadPrivateContext->Cnt() = 0;
__kmpc_impl_threadfence_block();
}
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
", chunk %" PRIu64 "\n",
(int)tnum,
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->Chunk(tid));
}
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val,
int leader) {
uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_sync(active, hi, leader);
lo = __kmpc_impl_shfl_sync(active, lo, leader);
return __kmpc_impl_pack(lo, hi);
}
INLINE static uint64_t NextIter() {
__kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
uint32_t leader = __kmpc_impl_ffs(active) - 1;
uint32_t change = __kmpc_impl_popc(active);
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
uint64_t warp_res;
if (rank == 0) {
warp_res = __kmpc_atomic_add(
(unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
(unsigned long long)change);
}
warp_res = Shuffle(active, warp_res, leader);
return warp_res + rank;
}
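// Illustration of the warp-aggregated increment above (hypothetical values):
// with active mask 0b1011 (lanes 0, 1 and 3), the leader is lane 0 and
// change = 3. If Cnt was 10, the leader's single atomic add returns 10 and,
// after the shuffle, lanes 0, 1 and 3 receive iteration numbers 10, 11 and
// 12 respectively -- one atomic operation instead of three.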
INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
T loopLowerBound, T loopUpperBound) {
T N = NextIter();
lb = loopLowerBound + N * chunkSize;
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
// a. lb and ub < loopUpperBound --> NOT_FINISHED
// b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
// NOT_FINISHED
// c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
// a.
if (lb <= loopUpperBound && ub < loopUpperBound) {
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
(long long)lb, (long long)ub, (long long)loopUpperBound);
return NOT_FINISHED;
}
// b.
if (lb <= loopUpperBound) {
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
(long long)lb, (long long)ub, (long long)loopUpperBound);
ub = loopUpperBound;
return LAST_CHUNK;
}
// c. if we are here, we are in case 'c'
lb = loopUpperBound + 2;
ub = loopUpperBound + 1;
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
(long long)ub, (long long)loopUpperBound);
return FINISHED;
}
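// Illustration of the three outcomes (hypothetical values): with
// loopLowerBound = 0, loopUpperBound = 9 and chunkSize = 4, N = 0 yields
// [0, 3] (NOT_FINISHED), N = 2 yields [8, 11] which is clipped to [8, 9]
// (LAST_CHUNK), and N = 3 yields lb = 12 > 9, so the bounds are set to an
// empty range and FINISHED is returned.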
INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
T *plower, T *pupper, ST *pstride) {
if (isRuntimeUninitialized()) {
// In SPMD mode no need to check parallelism level - dynamic scheduling
// may appear only in L2 parallel regions with lightweight runtime.
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected SPMD mode.");
if (*plast)
return DISPATCH_FINISHED;
*plast = 1;
return DISPATCH_NOTFINISHED;
}
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock();
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
// xxx reduce to one
if (schedule == kmp_sched_static_chunk ||
schedule == kmp_sched_static_nochunk) {
T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
// finished?
if (myLb > ub) {
PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
(long long)myLb, (long long)ub);
return DISPATCH_FINISHED;
}
// not finished, save current bounds
ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
*plower = myLb;
T myUb = myLb + chunk - 1; // Clang uses i <= ub
if (myUb > ub)
myUb = ub;
*pupper = myUb;
*plast = (int32_t)(myUb == ub);
// increment next lower bound by the stride
ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
(long long)*plower, (long long)*pupper);
return DISPATCH_NOTFINISHED;
}
ASSERT0(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"bad sched");
T myLb, myUb;
int finished = DynamicNextChunk(
myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
if (finished == FINISHED)
return DISPATCH_FINISHED;
// not finished (either not finished or last chunk)
*plast = (int32_t)(finished == LAST_CHUNK);
*plower = myLb;
*pupper = myUb;
*pstride = 1;
PRINT(LD_LOOP,
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
"last %d\n",
(int)GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
(int)GetNumberOfWorkersInTeam(), (long long)*plower,
(long long)*pupper, (long long)*pstride, (int)*plast);
return DISPATCH_NOTFINISHED;
}
INLINE static void dispatch_fini() {
// nothing
}
////////////////////////////////////////////////////////////////////////////////
// end of template class that encapsulate all the helper functions
////////////////////////////////////////////////////////////////////////////////
};
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////
// init
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
int32_t schedule, int32_t lb, int32_t ub,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
int32_t schedule, uint32_t lb, uint32_t ub,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
int32_t schedule, int64_t lb, int64_t ub,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
int32_t schedule, uint64_t lb, uint64_t ub,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
// next
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
uint32_t *p_lb, uint32_t *p_ub,
int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
uint64_t *p_lb, uint64_t *p_ub,
int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
loc, tid, p_last, p_lb, p_ub, p_st);
}
// fini
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}
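// For illustration only: a worksharing loop with schedule(dynamic, 4) over
// 0 <= i < N is typically lowered by the compiler into a call sequence of
// roughly the following shape (exact arguments and schedule enumerators are
// compiler-dependent):
//
//   int32_t last, lb, ub, st;
//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, 0, N - 1, 1, 4);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st) !=
//          DISPATCH_FINISHED)
//     for (int32_t i = lb; i <= ub; ++i)
//       body(i);
//   __kmpc_dispatch_fini_4(loc, tid);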
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, int32_t *plower,
int32_t *pupper, int32_t *pstride,
int32_t incr, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, int64_t *plower,
int64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_distribute_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
__kmpc_is_spmd_exec_mode());
}
EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, uint32_t *plower,
uint32_t *pupper, int32_t *pstride,
int32_t incr, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true);
}
EXTERN
void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN
void __kmpc_for_static_init_4u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN
void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN
void __kmpc_for_static_init_8u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);
}
EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_distribute_static_fini\n");
}
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}
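// For illustration only: a loop with schedule(static) is typically lowered
// into a single init/fini pair around the strip-mined body (the exact
// schedule constant and arguments are compiler-dependent):
//
//   int32_t last = 0, lb = 0, ub = N - 1, st = 1;
//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_nochunk, &last, &lb,
//                            &ub, &st, /*incr=*/1, /*chunk=*/0);
//   for (int32_t i = lb; i <= ub; ++i)
//     body(i);
//   __kmpc_for_static_fini(loc, tid);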
#pragma omp end declare target

View File

@ -1,65 +0,0 @@
//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the data objects used on the GPU device.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/allocator.h"
#include "common/omptarget.h"
////////////////////////////////////////////////////////////////////////////////
// global device environment
////////////////////////////////////////////////////////////////////////////////
PLUGIN_ACCESSIBLE
DeviceEnvironmentTy omptarget_device_environment;
////////////////////////////////////////////////////////////////////////////////
// global data holding OpenMP state information
////////////////////////////////////////////////////////////////////////////////
// OpenMP will try to call its ctor if we don't add the attribute explicitly
[[clang::loader_uninitialized]] omptarget_nvptx_Queue<
omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
uint32_t SHARED(usedMemIdx);
uint32_t SHARED(usedSlotIdx);
// SHARED doesn't work with arrays so we add the attribute explicitly.
[[clang::loader_uninitialized]] uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
uint16_t SHARED(threadLimit);
uint16_t SHARED(threadsInTeam);
uint16_t SHARED(nThreads);
// Pointer to this team's OpenMP state object
omptarget_nvptx_ThreadPrivateContext *
SHARED(omptarget_nvptx_threadPrivateContext);
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
// copy of these variables for each kernel, instance, and team.
////////////////////////////////////////////////////////////////////////////////
omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// OpenMP kernel execution parameters
////////////////////////////////////////////////////////////////////////////////
int8_t SHARED(execution_param);
////////////////////////////////////////////////////////////////////////////////
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
void *SHARED(ReductionScratchpadPtr);
#pragma omp end declare target

View File

@ -1,259 +0,0 @@
//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the initialization code for the GPU
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "common/support.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////
extern omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext,
OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////
static void __kmpc_generic_kernel_init() {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
if (GetLaneId() == 0)
parallelLevel[GetWarpId()] = 0;
int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
if (threadIdInBlock != GetMasterThreadID())
return;
setExecutionParameters(OMP_TGT_EXEC_MODE_GENERIC, OMP_TGT_RUNTIME_INITIALIZED);
ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
"__kmpc_kernel_init() must be called by team master warp only!");
PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");
// Get a state object from the queue.
int slot = __kmpc_impl_smid() % MAX_SM;
usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[slot].Dequeue();
// init thread private
int threadId = 0;
omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
// init team context
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
currTeamDescr.InitTeamDescr();
// this thread will start execution... has to update its task ICV
// to point to the level zero task ICV. That ICV was init in
// InitTeamDescr()
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTeamDescr.LevelZeroTaskDescr());
// set number of threads and thread limit in team to started value
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
nThreads = GetNumberOfWorkersInTeam();
threadLimit = nThreads;
__kmpc_data_sharing_init_stack();
__kmpc_impl_target_init();
}
static void __kmpc_generic_kernel_deinit() {
PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");
// Enqueue omp state object for use by another team.
int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
// Done with work. Kill the workers.
omptarget_nvptx_workFn = 0;
}
static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) {
PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
setExecutionParameters(OMP_TGT_EXEC_MODE_SPMD,
RequiresFullRuntime ? OMP_TGT_RUNTIME_INITIALIZED
: OMP_TGT_RUNTIME_UNINITIALIZED);
int threadId = __kmpc_get_hardware_thread_id_in_block();
if (threadId == 0) {
usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
}
if (GetLaneId() == 0) {
parallelLevel[GetWarpId()] =
1 + (__kmpc_get_hardware_num_threads_in_block() > 1
? OMP_ACTIVE_PARALLEL_LEVEL
: 0);
}
__kmpc_data_sharing_init_stack();
if (!RequiresFullRuntime)
return;
//
// Team Context Initialization.
//
// In SPMD mode there is no master thread so use any cuda thread for team
// context initialization.
if (threadId == 0) {
// Get a state object from the queue.
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[usedSlotIdx].Dequeue();
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// init team context
currTeamDescr.InitTeamDescr();
}
__kmpc_impl_syncthreads();
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
//
// Initialize task descr for each thread.
//
omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
// init thread private from init value
int ThreadLimit = GetNumberOfProcsInTeam(/* IsSPMD */ true);
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)ThreadLimit);
}
static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.
if (!RequiresFullRuntime)
return;
__kmpc_impl_syncthreads();
int threadId = __kmpc_get_hardware_thread_id_in_block();
if (threadId == 0) {
// Enqueue omp state object for use by another team.
int slot = usedSlotIdx;
omptarget_nvptx_device_State[slot].Enqueue(
omptarget_nvptx_threadPrivateContext);
}
}
// Return true if the current target region is executed in SPMD mode.
// NOTE: This function has to return 1 for SPMD mode, and 0 for generic mode.
// That's because `__kmpc_parallel_51` checks if it's already in a parallel
// region by comparing the parallel level with the return value of this
// function.
EXTERN int8_t __kmpc_is_spmd_exec_mode() {
return (execution_param & OMP_TGT_EXEC_MODE_SPMD) == OMP_TGT_EXEC_MODE_SPMD;
}
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
return !__kmpc_is_spmd_exec_mode() && __kmpc_is_generic_main_thread_id(Tid);
}
NOINLINE EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid) {
return GetMasterThreadID() == Tid;
}
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
static void __kmpc_target_region_state_machine(ident_t *Ident) {
int TId = __kmpc_get_hardware_thread_id_in_block();
do {
void* WorkFn = 0;
// Wait for the signal that we have a new work function.
__kmpc_barrier_simple_spmd(Ident, TId);
// Retrieve the work function from the runtime.
bool IsActive = __kmpc_kernel_parallel(&WorkFn);
// If there is nothing more to do, break out of the state machine by
// returning to the caller.
if (!WorkFn)
return;
if (IsActive) {
((void(*)(uint32_t,uint32_t))WorkFn)(0, TId);
__kmpc_kernel_end_parallel();
}
__kmpc_barrier_simple_spmd(Ident, TId);
} while (true);
}
EXTERN
int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
bool UseGenericStateMachine,
bool RequiresFullRuntime) {
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
int TId = __kmpc_get_hardware_thread_id_in_block();
if (IsSPMD)
__kmpc_spmd_kernel_init(RequiresFullRuntime);
else
__kmpc_generic_kernel_init();
if (IsSPMD) {
__kmpc_barrier_simple_spmd(Ident, TId);
return -1;
}
if (TId == GetMasterThreadID())
return -1;
// Enter the generic state machine if enabled and if this thread can possibly
// be an active worker thread.
//
// The latter check is important for NVIDIA Pascal (but not Volta) and AMD
// GPU. In those cases, a single thread can apparently satisfy a barrier on
// behalf of all threads in the same warp. Thus, it would not be safe for
// other threads in the main thread's warp to reach the first
// __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine
// before the main thread reaches its corresponding
// __kmpc_barrier_simple_spmd call: that would permit all active worker
// threads to proceed before the main thread has actually set
// omptarget_nvptx_workFn, and then they would immediately quit without
// doing any work. GetNumberOfWorkersInTeam() does not include any of the
// main thread's warp, so none of its threads can ever be active worker
// threads.
if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam())
__kmpc_target_region_state_machine(Ident);
return TId;
}
EXTERN
void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime) {
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
if (IsSPMD)
__kmpc_spmd_kernel_deinit(RequiresFullRuntime);
else
__kmpc_generic_kernel_deinit();
}
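// For illustration only: a generated target region entry point brackets the
// user code roughly as follows (mode flags and argument values depend on how
// the compiler classified the region):
//
//   int32_t TId = __kmpc_target_init(Ident, Mode,
//                                    /*UseGenericStateMachine=*/true,
//                                    /*RequiresFullRuntime=*/true);
//   if (TId == -1) {
//     // SPMD mode: every thread executes the body. Generic mode: only the
//     // main thread does; workers run the state machine instead.
//     ... target region body ...
//   }
//   __kmpc_target_deinit(Ident, Mode, /*RequiresFullRuntime=*/true);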
#pragma omp end declare target

View File

@ -1,341 +0,0 @@
//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
// while (not finished) {
//
// if (master) {
// sequential code, decide which par loop to do, or if finished
// __kmpc_kernel_prepare_parallel() // exec by master only
// }
// syncthreads // A
// __kmpc_kernel_parallel() // exec by all
// if (this thread is included in the parallel) {
// switch () for all parallel loops
// __kmpc_kernel_end_parallel() // exec only by threads in parallel
// }
//
//
// The reason we don't execute end_parallel for the threads not included
// in the parallel loop is that for each barrier in the parallel
// region, these non-included threads will cycle through the
// syncthread A. Thus they must preserve their current threadId, which
// is larger than the number of threads in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
uint16_t NThreadsICV,
uint16_t ThreadLimit) {
uint16_t ThreadsRequested = NThreadsICV;
if (NumThreadsClause != 0) {
ThreadsRequested = NumThreadsClause;
}
uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
ThreadsAvailable = ThreadLimit;
}
uint16_t NumThreads = ThreadsAvailable;
if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
NumThreads = ThreadsRequested;
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// On Volta and newer architectures we require that all lanes in
// a warp participate in the parallel region. Round down to a
// multiple of WARPSIZE since it is legal to do so in OpenMP.
if (NumThreads < WARPSIZE) {
NumThreads = 1;
} else {
NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
}
#endif
return NumThreads;
}
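// Illustration (hypothetical values): on a Volta-class device (WARPSIZE ==
// 32) with 96 worker threads available, a num_threads(70) clause and no
// thread limit, ThreadsRequested = 70 and the warp-multiple rounding yields
// NumThreads = 64; a request of fewer than 32 threads collapses to a single
// thread.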
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
kmp_int32 NumThreadsClause) {
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
omptarget_nvptx_workFn = WorkFn;
// This routine is only called by the team master. The team master is
// the first thread of the last warp. It always has the logical thread
// id of 0 (since it is a shadow for the first worker thread).
const int threadId = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
"cannot be called in a parallel region.");
if (currTaskDescr->InParallelRegion()) {
PRINT0(LD_PAR, "already in parallel: go seq\n");
return;
}
uint16_t NumThreads =
determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
if (NumThreadsClause != 0) {
// Reset request to avoid propagating to successive #parallel
NumThreadsClause = 0;
}
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
(int)NumThreads);
ASSERT0(LT_FUSSY,
__kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
"only team master can create parallel");
// Set number of threads on work descriptor.
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
threadsInTeam = NumThreads;
}
// All workers call this function. Deactivate those not needed.
// Fn - the outlined work function to execute.
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
// Work function and arguments for L1 parallel region.
*WorkFn = omptarget_nvptx_workFn;
// If this is the termination signal from the master, quit early.
if (!*WorkFn) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
return false;
}
// Only the worker threads call this routine and the master warp
// never arrives here. Therefore, use the nvptx thread id.
int threadId = __kmpc_get_hardware_thread_id_in_block();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (threadId < threadsInTeam) {
// init this thread's task descriptor from the work descriptor
omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
// init thread private from init value
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)nThreads);
isActive = true;
}
return isActive;
}
EXTERN void __kmpc_kernel_end_parallel() {
// pop stack
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
// Only the worker threads call this routine and the master warp
// never arrives here. Therefore, use the nvptx thread id.
int threadId = __kmpc_get_hardware_thread_id_in_block();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
}
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////
static void serializedParallel(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to serializedParallel\n");
IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
return;
}
// assume this is only called for nested parallel
int threadId = GetLogicalThreadIdInBlock();
// unlike actual parallel, threads in the same team do not share
// the workTaskDescr in this case and num threads is fixed to 1
// get current task
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
currTaskDescr->SaveLoopData();
// allocate new task descriptor and copy value from current one, set prev to
// it
omptarget_nvptx_TaskDescr *newTaskDescr =
(omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
"new seq parallel task");
newTaskDescr->CopyParent(currTaskDescr);
// tweak values for serialized parallel case:
// - each thread becomes ID 0 in its serialized parallel, and
// - there is only one thread per team
newTaskDescr->ThreadId() = 0;
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
}
static void endSerializedParallel(kmp_Ident *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to endSerializedParallel\n");
DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
return;
}
// pop stack
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
// free
SafeFree(currTaskDescr, "new seq parallel task");
currTaskDescr = getMyTopTaskDescriptor(threadId);
currTaskDescr->RestoreLoopData();
}
NOINLINE EXTERN uint8_t __kmpc_parallel_level() {
return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}
// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
return GetOmpThreadId();
}
////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
int32_t num_teams, int32_t thread_limit) {
PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
}
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) {
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}
////////////////////////////////////////////////////////////////////////////////
// parallel interface
////////////////////////////////////////////////////////////////////////////////
NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
kmp_int32 if_expr,
kmp_int32 num_threads, int proc_bind,
void *fn, void *wrapper_fn, void **args,
size_t nargs) {
// Handle the serialized case first, same for SPMD/non-SPMD except that in
// SPMD mode we already incremented the parallel level counter, account for
// that.
bool InParallelRegion =
(__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode());
if (!if_expr || InParallelRegion) {
serializedParallel(ident, global_tid);
__kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
endSerializedParallel(ident, global_tid);
return;
}
if (__kmpc_is_spmd_exec_mode()) {
__kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
return;
}
__kmpc_kernel_prepare_parallel((void *)wrapper_fn, num_threads);
if (nargs) {
void **GlobalArgs;
__kmpc_begin_sharing_variables(&GlobalArgs, nargs);
// TODO: faster memcpy?
#pragma unroll
for (int I = 0; I < nargs; I++)
GlobalArgs[I] = args[I];
}
// TODO: what if that's a parallel region with a single thread? this is
// considered not active in the existing implementation.
bool IsActiveParallelRegion = threadsInTeam != 1;
int NumWarps =
threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0);
// Increment parallel level for non-SPMD warps.
for (int I = 0; I < NumWarps; ++I)
parallelLevel[I] +=
(1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
// Master signals work to activate workers.
__kmpc_barrier_simple_spmd(ident, 0);
// OpenMP [2.5, Parallel Construct, p.49]
// There is an implied barrier at the end of a parallel region. After the
// end of a parallel region, only the master thread of the team resumes
// execution of the enclosing task region.
//
// The master waits at this barrier until all workers are done.
__kmpc_barrier_simple_spmd(ident, 0);
// Decrement parallel level for non-SPMD warps.
for (int I = 0; I < NumWarps; ++I)
parallelLevel[I] -=
(1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
// TODO: Is synchronization needed since out of parallel execution?
if (nargs)
__kmpc_end_sharing_variables();
// TODO: proc_bind is a noop?
// if (proc_bind != proc_bind_default)
// __kmpc_push_proc_bind(ident, global_tid, proc_bind);
}
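// For illustration only: the compiler outlines the body of a #pragma omp
// parallel into fn/wrapper_fn and emits a call of roughly this shape (the
// shown variables and argument values are hypothetical):
//
//   void *Args[] = {&x, &y};
//   __kmpc_parallel_51(ident, gtid, /*if_expr=*/1, /*num_threads=*/-1,
//                      /*proc_bind=*/0, (void *)fn, (void *)wrapper_fn,
//                      Args, /*nargs=*/2);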
#pragma omp end declare target

View File

@ -1,309 +0,0 @@
//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"
EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shflFct) {
for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
shflFct(reduce_data, /*LaneId - not used= */ 0,
/*Offset = */ mask, /*AlgoVersion=*/0);
}
}
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
uint32_t size, uint32_t tid) {
uint32_t curr_size;
uint32_t mask;
curr_size = size;
mask = curr_size / 2;
while (mask > 0) {
shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
curr_size = (curr_size + 1) / 2;
mask = curr_size / 2;
}
}
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
uint32_t size, remote_id, physical_lane_id;
physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
__kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
do {
Liveness = __kmpc_impl_activemask();
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
size = __kmpc_impl_popc(Liveness);
logical_lane_id /= 2;
shflFct(reduce_data, /*LaneId =*/logical_lane_id,
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
} while (logical_lane_id % 2 == 0 && size > 1);
return (logical_lane_id == 0);
}
#endif
INLINE
static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
if (NumThreads == 1)
return 1;
/*
* This reduce function handles reduction within a team. It handles
* parallel regions in both L1 and L2 parallelism levels. It also
* supports Generic, SPMD, and NoOMP modes.
*
* 1. Reduce within a warp.
* 2. Warp master copies value to warp 0 via shared memory.
* 3. Warp 0 reduces to a single value.
* 4. The reduced value is available in the thread that returns 1.
*/
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
uint32_t WarpId = BlockThreadId / WARPSIZE;
// Volta execution model:
// For the Generic execution mode a parallel region either has 1 thread and
// beyond that, always a multiple of 32. For the SPMD execution mode we may
// have any number of threads.
if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
gpu_irregular_warp_reduce(
reduce_data, shflFct,
/*LaneCount=*/NumThreads % WARPSIZE,
/*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
// When we have more than WARPSIZE threads, a block
// reduction is performed here.
//
// Only an L1 parallel region can enter this if condition.
if (NumThreads > WARPSIZE) {
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
}
return BlockThreadId == 0;
#else
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == __kmpc_impl_all_lanes) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(
reduce_data, shflFct,
/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
// parallel region may enter here; return
// early.
return gpu_irregular_simd_reduce(reduce_data, shflFct);
// When we have more than WARPSIZE threads, a block
// reduction is performed here.
//
// Only an L1 parallel region can enter this if condition.
if (NumThreads > WARPSIZE) {
uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
uint32_t WarpId = BlockThreadId / WARPSIZE;
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
return BlockThreadId == 0;
} else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
return BlockThreadId == 0;
}
// Get the OMP thread Id. This is different from BlockThreadId in the case of
// an L2 parallel region.
return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
__kmpc_is_spmd_exec_mode(), isRuntimeUninitialized());
}
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId);
}
INLINE static uint32_t roundToWarpsize(uint32_t s) {
if (s < WARPSIZE)
return 1;
return (s & ~(unsigned)(WARPSIZE - 1));
}
INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct) {
// Terminate all threads in non-SPMD mode except for the master thread.
if (!__kmpc_is_spmd_exec_mode() &&
!__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
return 0;
uint32_t ThreadId = GetLogicalThreadIdInBlock();
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
__kmpc_is_spmd_exec_mode() ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = __kmpc_get_hardware_num_blocks();
static unsigned SHARED(Bound);
static unsigned SHARED(ChunkTeamCount);
// Block progress for teams greater than the current upper
// limit. We only allow a number of teams less than or equal
// to the number of slots in the buffer.
bool IsMaster = isMaster(loc, ThreadId);
while (IsMaster) {
// Atomic read
Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
if (TeamId < Bound + num_of_records)
break;
}
if (IsMaster) {
int ModBlockId = TeamId % num_of_records;
if (TeamId < num_of_records)
lgcpyFct(global_buffer, ModBlockId, reduce_data);
else
lgredFct(global_buffer, ModBlockId, reduce_data);
__kmpc_impl_threadfence_system();
// Increment team counter.
// This counter is incremented by all teams in the current
// BUFFER_SIZE chunk.
ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
}
// Synchronize
if (__kmpc_is_spmd_exec_mode())
__kmpc_barrier(loc, global_tid);
// reduce_data is global or shared so before being reduced within the
// warp we need to bring it in local memory:
// local_reduce_data = reduce_data[i]
//
// Example for 3 reduction variables a, b, c (of potentially different
// types):
//
// buffer layout (struct of arrays):
// a, a, ..., a, b, b, ... b, c, c, ... c
// |__________|
// num_of_records
//
// local_data_reduce layout (struct):
// a, b, c
//
// Each thread will have a local struct containing the values to be
// reduced:
// 1. do reduction within each warp.
// 2. do reduction across warps.
// 3. write the final result to the main reduction variable
// by returning 1 in the thread holding the reduction result.
// Check if this is the very last team.
unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
if (ChunkTeamCount == NumTeams - Bound - 1) {
//
// Last team processing.
//
if (ThreadId >= NumRecs)
return 0;
NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
if (ThreadId >= NumThreads)
return 0;
// Load from buffer and reduce.
glcpyFct(global_buffer, ThreadId, reduce_data);
for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
glredFct(global_buffer, i, reduce_data);
// Reduce across warps to the warp master.
if (NumThreads > 1) {
gpu_regular_warp_reduce(reduce_data, shflFct);
// When we have more than WARPSIZE threads, a block
// reduction is performed here.
uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
if (ActiveThreads > WARPSIZE) {
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
uint32_t WarpId = ThreadId / WARPSIZE;
if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
ThreadId);
}
}
if (IsMaster) {
Cnt = 0;
IterCnt = 0;
return 1;
}
return 0;
}
if (IsMaster && ChunkTeamCount == num_of_records - 1) {
// Allow SIZE number of teams to proceed writing their
// intermediate results to the global buffer.
__kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
}
return 0;
}
#pragma omp end declare target
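The function above combines two counters (`Cnt`, `IterCnt`) with a fixed-size global buffer so that the last team to contribute performs the final fold. Below is a minimal host-side C++ sketch of that idea, not the runtime code itself: `Buffer`, `Cnt`, and `teamContribute` are illustrative names, the chunking of teams into buffer-sized groups is omitted, and the sketch assumes the number of teams does not exceed the number of slots.

```
#include <atomic>
#include <vector>

constexpr unsigned NumSlots = 4; // stands in for num_of_records

std::vector<double> Buffer(NumSlots, 0.0);
std::atomic<unsigned> Cnt{0};

// Each team adds its partial result to its slot; the last team to bump the
// counter folds all slots into the final value. Returns true for that team
// only. Assumes TeamId < NumSlots (the real code chunks teams to keep this
// invariant).
bool teamContribute(unsigned TeamId, unsigned NumTeams, double Partial) {
  Buffer[TeamId] += Partial;              // lgcpy/lgred step
  unsigned Arrived = Cnt.fetch_add(1) + 1;
  if (Arrived != NumTeams)
    return false;                         // not the last team to arrive
  double Result = 0.0;
  for (unsigned I = 0; I < NumTeams; ++I) // glcpy/glred step
    Result += Buffer[I];
  Buffer[0] = Result;                     // final value lives in slot 0
  return true;
}
```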

View File

@ -1,29 +0,0 @@
//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "target/shuffle.h"
#pragma omp declare target
static constexpr uint64_t AllLanes = -1;
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
return __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size);
}
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_down_sync(AllLanes, hi, delta, size);
lo = __kmpc_impl_shfl_down_sync(AllLanes, lo, delta, size);
return __kmpc_impl_pack(lo, hi);
}
#pragma omp end declare target
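`__kmpc_shuffle_int64` works by splitting the 64-bit value into two 32-bit halves, shuffling each half, and recombining them. The following is a small self-contained sketch of just the pack/unpack arithmetic; the warp shuffle itself is modeled as an identity so the example runs on the host, and the helpers only mirror `__kmpc_impl_unpack`/`__kmpc_impl_pack` locally.

```
#include <cassert>
#include <cstdint>

// Local helpers mirroring __kmpc_impl_unpack/__kmpc_impl_pack; the warp
// shuffle between unpack and pack is modeled as an identity here.
static void unpack(uint64_t Val, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Val);
  Hi = static_cast<uint32_t>(Val >> 32);
}

static uint64_t pack(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t Val = 0x0123456789abcdefULL;
  uint32_t Lo, Hi;
  unpack(Val, Lo, Hi);
  // On the device each 32-bit half would be shuffled down independently.
  assert(pack(Lo, Hi) == Val);
  return 0;
}
```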

View File

@ -1,240 +0,0 @@
//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Wrapper implementation to some functions natively supported by the GPU.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "common/omptarget.h"
#include "common/support.h"
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
void setExecutionParameters(OMPTgtExecModeFlags EMode,
OMPTgtRuntimeModeFlags RMode) {
execution_param = EMode;
execution_param |= RMode;
}
bool isGenericMode() { return execution_param & OMP_TGT_EXEC_MODE_GENERIC; }
bool isRuntimeUninitialized() { return !isRuntimeInitialized(); }
bool isRuntimeInitialized() {
return execution_param & OMP_TGT_RUNTIME_INITIALIZED;
}
////////////////////////////////////////////////////////////////////////////////
// support: get info from machine
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
//
////////////////////////////////////////////////////////////////////////////////
// The master thread id is the first thread (lane) of the last warp.
// Thread id is 0 indexed.
// E.g: If NumThreads is 33, master id is 32.
// If NumThreads is 64, master id is 32.
// If NumThreads is 97, master id is 96.
// If NumThreads is 1024, master id is 992.
//
// Called in Generic Execution Mode only.
int GetMasterThreadID() {
return (__kmpc_get_hardware_num_threads_in_block() - 1) & ~(WARPSIZE - 1);
}
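The examples in the comment can be checked against the formula with a few compile-time assertions; this is an illustrative sketch (WarpSize standing in for WARPSIZE), not part of the runtime.

```
// Illustrative compile-time check, not runtime code; WarpSize stands in for
// WARPSIZE.
constexpr int WarpSize = 32;

constexpr int masterThreadId(int NumThreads) {
  return (NumThreads - 1) & ~(WarpSize - 1);
}

static_assert(masterThreadId(33) == 32, "first lane of the last warp");
static_assert(masterThreadId(64) == 32, "");
static_assert(masterThreadId(97) == 96, "");
static_assert(masterThreadId(1024) == 992, "");
```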
// The last warp is reserved for the master; other warps are workers.
// Called in Generic Execution Mode only.
int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
////////////////////////////////////////////////////////////////////////////////
// get thread id in team
// This function may be called in a parallel region by the workers
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
int GetLogicalThreadIdInBlock() {
// Implemented using control flow (predication) instead of a modulo
// operation.
int tid = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(tid))
return 0;
else
return tid;
}
////////////////////////////////////////////////////////////////////////////////
//
// OpenMP Thread Support Layer
//
////////////////////////////////////////////////////////////////////////////////
int GetOmpThreadId() {
int tid = __kmpc_get_hardware_thread_id_in_block();
if (__kmpc_is_generic_main_thread(tid))
return 0;
// omp_thread_num
int rc;
if (__kmpc_parallel_level() > 1) {
rc = 0;
} else if (__kmpc_is_spmd_exec_mode()) {
rc = tid;
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadId();
}
return rc;
}
int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
// omp_num_threads
int rc;
int Level = parallelLevel[GetWarpId()];
if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
rc = 1;
} else if (isSPMDExecutionMode) {
rc = __kmpc_get_hardware_num_threads_in_block();
} else {
rc = threadsInTeam;
}
return rc;
}
////////////////////////////////////////////////////////////////////////////////
// Team id linked to OpenMP
int GetOmpTeamId() {
// omp_team_num
return GetBlockIdInKernel(); // assume 1 block per team
}
int GetNumberOfOmpTeams() {
// omp_num_teams
return __kmpc_get_hardware_num_blocks(); // assume 1 block per team
}
////////////////////////////////////////////////////////////////////////////////
// Masters
int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
////////////////////////////////////////////////////////////////////////////////
// Parallel level
void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
__kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__kmpc_impl_threadfence();
}
__kmpc_impl_syncwarp(Mask);
}
void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
__kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__kmpc_impl_threadfence();
}
__kmpc_impl_syncwarp(Mask);
}
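These two functions encode, per warp, both the nesting depth and whether a level is active in a single counter: every level adds 1, and an active level additionally adds OMP_ACTIVE_PARALLEL_LEVEL. The sketch below illustrates that encoding; it is host-only illustrative code, and the value 128 is only an assumed stand-in for OMP_ACTIVE_PARALLEL_LEVEL.

```
#include <cassert>
#include <cstdint>

// Illustrative only: 128 is an assumed stand-in for OMP_ACTIVE_PARALLEL_LEVEL.
constexpr uint8_t ActiveLevelBit = 128;

int main() {
  uint8_t Level = 0;
  Level += 1 + ActiveLevelBit;         // enter an active parallel region
  assert(Level == ActiveLevelBit + 1); // "exactly one level and it is active"
  Level += 1;                          // enter a nested, serialized region
  assert(Level != ActiveLevelBit + 1); // nested regions report 1 thread
  Level -= 1;                          // leave the nested region
  Level -= 1 + ActiveLevelBit;         // leave the active region
  assert(Level == 0);
  return 0;
}
```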
////////////////////////////////////////////////////////////////////////////////
// get OpenMP number of procs
// Get the number of processors in the device.
int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
if (!isSPMDExecutionMode)
return GetNumberOfWorkersInTeam();
return __kmpc_get_hardware_num_threads_in_block();
}
int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
return GetNumberOfProcsInDevice(isSPMDExecutionMode);
}
////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////
unsigned long PadBytes(unsigned long size,
unsigned long alignment) // must be a power of 2
{
// compute the necessary padding to satisfy alignment constraint
ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
"alignment %lu is not a power of 2\n", alignment);
return (~(unsigned long)size + 1) & (alignment - 1);
}
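The padding expression is the two's-complement form of the usual `(alignment - size % alignment) % alignment` for power-of-two alignments. A small compile-time check, illustrative only:

```
// Illustrative compile-time check, not runtime code.
constexpr unsigned long padBytes(unsigned long Size, unsigned long Align) {
  return (~Size + 1) & (Align - 1);
}

static_assert(padBytes(20, 8) == 4, "20 + 4 is the next multiple of 8");
static_assert(padBytes(24, 8) == 0, "already aligned");
static_assert(padBytes(1, 16) == 15, "");
```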
void *SafeMalloc(size_t size, const char *msg) // check if success
{
void *ptr = __kmpc_impl_malloc(size);
PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
(unsigned long long)size, msg, (unsigned long long)ptr);
return ptr;
}
void *SafeFree(void *ptr, const char *msg) {
PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
__kmpc_impl_free(ptr);
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
unsigned int *GetTeamsReductionTimestamp() {
return static_cast<unsigned int *>(ReductionScratchpadPtr);
}
char *GetTeamsReductionScratchpad() {
return static_cast<char *>(ReductionScratchpadPtr) + 256;
}
// Invoke an outlined parallel function unwrapping arguments (up
// to 32).
void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
void **args, size_t nargs) {
switch (nargs) {
#include "common/generated_microtask_cases.gen"
default:
printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
__builtin_trap();
}
}
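The included `generated_microtask_cases.gen` presumably expands to one `case` per argument count, each casting `fn` to the matching signature before calling it. The following is a hypothetical, self-contained reconstruction of that dispatch for up to two arguments; the exact generated code may differ.

```
#include <cstddef>
#include <cstdio>

using kmp_int32 = int;

// Cast the opaque function pointer to the signature matching NArgs and call
// it; the generated file is assumed to contain one such case per arity.
void invokeMicrotaskSketch(kmp_int32 GlobalTid, kmp_int32 BoundTid, void *Fn,
                           void **Args, size_t NArgs) {
  switch (NArgs) {
  case 0:
    ((void (*)(kmp_int32 *, kmp_int32 *))Fn)(&GlobalTid, &BoundTid);
    break;
  case 1:
    ((void (*)(kmp_int32 *, kmp_int32 *, void *))Fn)(&GlobalTid, &BoundTid,
                                                     Args[0]);
    break;
  case 2:
    ((void (*)(kmp_int32 *, kmp_int32 *, void *, void *))Fn)(
        &GlobalTid, &BoundTid, Args[0], Args[1]);
    break;
  default:
    printf("too many arguments in this sketch\n");
  }
}
```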
namespace _OMP {
/// Helper to keep code alive without introducing a performance penalty.
__attribute__((used, retain, weak, optnone, cold)) void keepAlive() {
__kmpc_get_hardware_thread_id_in_block();
__kmpc_get_hardware_num_threads_in_block();
__kmpc_get_warp_size();
__kmpc_barrier_simple_spmd(nullptr, 0);
__kmpc_barrier_simple_generic(nullptr, 0);
}
} // namespace _OMP
#pragma omp end declare target

View File

@ -1,143 +0,0 @@
//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Include all synchronization.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// KMP Ordered calls
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_ordered\n");
}
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_end_ordered\n");
}
////////////////////////////////////////////////////////////////////////////////
// KMP Barriers
////////////////////////////////////////////////////////////////////////////////
// A team is a block: we can use the CUDA-native synchronization mechanism.
// FIXME: what if not all threads (warps) participate in the barrier?
// We may need to implement it differently.
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
__kmpc_barrier(loc_ref, tid);
PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
return 0;
}
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock();
int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
if (numberOfActiveOMPThreads > 1) {
if (__kmpc_is_spmd_exec_mode()) {
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
// The #threads parameter must be rounded up to a multiple of WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
PRINT(LD_SYNC,
"call kmpc_barrier with %d omp threads, sync parameter %d\n",
(int)numberOfActiveOMPThreads, (int)threads);
__kmpc_impl_named_sync(threads);
}
} else {
// Still need to flush the memory per the standard.
__kmpc_flush(loc_ref);
} // numberOfActiveOMPThreads > 1
PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
}
// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
__kmpc_impl_syncthreads();
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
return __kmpc_barrier_simple_spmd(loc_ref, tid);
}
////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_master\n");
return IsTeamMaster(global_tid);
}
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_master\n");
ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
}
////////////////////////////////////////////////////////////////////////////////
// KMP SINGLE
////////////////////////////////////////////////////////////////////////////////
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_single\n");
// We decided to implement single with master; the master gets the single.
return IsTeamMaster(global_tid);
}
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_single\n");
// We decided to implement single with master: the master gets the single.
ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
// The sync barrier is called explicitly afterwards, so that is not a problem.
}
////////////////////////////////////////////////////////////////////////////////
// Flush
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_flush(kmp_Ident *loc) {
PRINT0(LD_IO, "call kmpc_flush\n");
__kmpc_impl_threadfence();
}
////////////////////////////////////////////////////////////////////////////////
// Vote
////////////////////////////////////////////////////////////////////////////////
EXTERN uint64_t __kmpc_warp_active_thread_mask(void) {
PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
return __kmpc_impl_activemask();
}
////////////////////////////////////////////////////////////////////////////////
// Syncwarp
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_syncwarp(uint64_t Mask) {
PRINT0(LD_IO, "call __kmpc_syncwarp\n");
__kmpc_impl_syncwarp(Mask);
}
#pragma omp end declare target

View File

@ -1,219 +0,0 @@
//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Task implementation support.
//
// explicit task structure uses
// omptarget_nvptx task
// kmp_task
//
// where kmp_task is
// - klegacy_TaskDescr <- task pointer
// shared -> X
// routine
// part_id
// descr
// - private (of size given by task_alloc call). Accessed by
// task+sizeof(klegacy_TaskDescr)
// * private data *
// - shared: X. Accessed by shared ptr in klegacy_TaskDescr
// * pointer table to shared variables *
// - end
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
kmp_Ident *loc, // unused
uint32_t global_tid, // unused
int32_t flag, // unused (in our implementation, all tasks are executed immediately)
size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
kmp_TaskFctPtr taskSub) {
PRINT(LD_IO,
"call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
"fct 0x%llx)\n",
(long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable,
(unsigned long long)taskSub);
// want task+priv to be a multiple of 8 bytes
size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
sizeOfTaskInclPrivate += padForTaskInclPriv;
size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
"need task descr of size %d to be a multiple of %d\n",
(int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *));
size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
totSize, "explicit task descriptor");
kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
ASSERT0(LT_FUSSY,
(uint64_t)newKmpTaskDescr ==
(uint64_t)ADD_BYTES(newExplicitTaskDescr,
sizeof(omptarget_nvptx_TaskDescr)),
"bad size assumptions");
// init kmp_TaskDescr
newKmpTaskDescr->sharedPointerTable =
(void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
newKmpTaskDescr->sub = taskSub;
newKmpTaskDescr->destructors = NULL;
PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
(unsigned long long)newKmpTaskDescr,
(unsigned long long)newExplicitTaskDescr);
return newKmpTaskDescr;
}
EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
0);
}
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr,
int32_t depNum, void *depList,
int32_t noAliasDepNum,
void *noAliasDepList) {
PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
P64(newKmpTaskDescr));
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
"bad assumptions");
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
"bad assumptions");
// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
// 3. call sub
PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
(unsigned long long)newKmpTaskDescr->sub,
(unsigned long long)newKmpTaskDescr);
newKmpTaskDescr->sub(0, newKmpTaskDescr);
PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
(unsigned long long)newKmpTaskDescr->sub);
// 4. pop context
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free
SafeFree(newExplicitTaskDescr, "explicit task descriptor");
return 0;
}
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
(unsigned long long)newKmpTaskDescr);
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
"bad assumptions");
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
"bad assumptions");
// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
// 3. nothing to call... the task body is inlined
// 4 & 5. done in __kmpc_omp_task_complete_if0
}
EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
(unsigned long long)newKmpTaskDescr);
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
"bad assumptions");
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
"bad assumptions");
// 2. get parent
omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
// 3. nothing to call... the task body is inlined
// 4. pop context
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free
SafeFree(newExplicitTaskDescr, "explicit task descriptor");
}
EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
int32_t depNum, void *depList,
int32_t noAliasDepNum, void *noAliasDepList) {
PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
// nothing to do as all our tasks are executed as final
}
EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
// nothing to do as all our tasks are executed as final
}
EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
// nothing to do as all our tasks are executed as final
}
EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
int end_part) {
PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
// do nothing: tasks are executed immediately, no yielding allowed
return 0;
}
EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
// nothing to do as all our tasks are executed as final
return 0;
}
EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr, int if_val,
uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
int32_t sched, uint64_t grainsize, void *task_dup) {
// skip task entirely if empty iteration space
if (*lb > *ub)
return;
// the compiler has already stored lb and ub in the kmp_TaskDescr structure
// as we are using a single task to execute the entire loop, we can leave
// the initial task_t untouched
__kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
}
#pragma omp end declare target

View File

@ -1,51 +0,0 @@
//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a queue to hand out OpenMP state objects to teams of
// one or more kernels.
//
// Reference:
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
// Architectures. International Conference on Performance Engineering.
//
//===----------------------------------------------------------------------===//
#ifndef __STATE_QUEUE_H
#define __STATE_QUEUE_H
#include <stdint.h>
#include "target_impl.h"
template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
private:
ElementType elements[SIZE];
volatile ElementType *elementQueue[SIZE];
volatile uint32_t head;
volatile uint32_t ids[SIZE];
volatile uint32_t tail;
static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
INLINE uint32_t ENQUEUE_TICKET();
INLINE uint32_t DEQUEUE_TICKET();
INLINE static uint32_t ID(uint32_t ticket);
INLINE bool IsServing(uint32_t slot, uint32_t id);
INLINE void PushElement(uint32_t slot, ElementType *element);
INLINE ElementType *PopElement(uint32_t slot);
INLINE void DoneServing(uint32_t slot, uint32_t id);
public:
INLINE omptarget_nvptx_Queue() {}
INLINE void Enqueue(ElementType *element);
INLINE ElementType *Dequeue();
};
#include "state-queuei.h"
#endif

View File

@ -1,88 +0,0 @@
//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of a queue to hand out OpenMP state
// objects to teams of one or more kernels.
//
// Reference:
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
// Architectures. International Conference on Performance Engineering.
//
//===----------------------------------------------------------------------===//
#include "state-queue.h"
template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
return __kmpc_atomic_add((unsigned int *)&tail, 1u);
}
template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
return __kmpc_atomic_add((unsigned int *)&head, 1u);
}
template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
return (ticket / SIZE) * 2;
}
template <typename ElementType, uint32_t SIZE>
INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
uint32_t id) {
return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
}
template <typename ElementType, uint32_t SIZE>
INLINE void
omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
ElementType *element) {
__kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
(unsigned long long)element);
}
template <typename ElementType, uint32_t SIZE>
INLINE ElementType *
omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
return (ElementType *)__kmpc_atomic_add(
(unsigned long long *)&elementQueue[slot], (unsigned long long)0);
}
template <typename ElementType, uint32_t SIZE>
INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
uint32_t id) {
__kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
}
template <typename ElementType, uint32_t SIZE>
INLINE void
omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
uint32_t ticket = ENQUEUE_TICKET();
uint32_t slot = ticket % SIZE;
uint32_t id = ID(ticket) + 1;
while (!IsServing(slot, id))
;
PushElement(slot, element);
DoneServing(slot, id);
}
template <typename ElementType, uint32_t SIZE>
INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
uint32_t ticket = DEQUEUE_TICKET();
uint32_t slot = ticket % SIZE;
uint32_t id = ID(ticket);
while (!IsServing(slot, id))
;
ElementType *element = PopElement(slot);
// This is to populate the queue because of the lack of GPU constructors.
if (element == 0)
element = &elements[slot];
DoneServing(slot, id);
return element;
}
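A hedged usage sketch of this queue follows. `TeamState`, `StateQueue`, and `exampleUse` are placeholder names; the real runtime instantiated the queue with its per-team state objects and served them out across kernels.

```
#include "state-queue.h"

// Placeholder element type; the real runtime used its per-team state
// objects here.
struct TeamState {
  int Data;
};

static omptarget_nvptx_Queue<TeamState, /*SIZE=*/64> StateQueue;

void exampleUse() {
  // Take ownership of a state object (spins on its ticket until served).
  TeamState *State = StateQueue.Dequeue();
  State->Data = 42;
  // Hand the object back so another team can be served from this slot.
  StateQueue.Enqueue(State);
}
```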

View File

@ -1,91 +0,0 @@
//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Wrapper to some functions natively supported by the GPU.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_SUPPORT_H
#define OMPTARGET_SUPPORT_H
#include "interface.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
enum OMPTgtExecModeFlags : int8_t {
OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
OMP_TGT_EXEC_MODE_SPMD = 1 << 1
};
enum OMPTgtRuntimeModeFlags : int8_t {
OMP_TGT_RUNTIME_UNINITIALIZED = 0,
OMP_TGT_RUNTIME_INITIALIZED = 1 << 2
};
void setExecutionParameters(OMPTgtExecModeFlags EMode,
OMPTgtRuntimeModeFlags RMode);
bool isGenericMode();
bool isRuntimeUninitialized();
bool isRuntimeInitialized();
////////////////////////////////////////////////////////////////////////////////
// get info from machine
////////////////////////////////////////////////////////////////////////////////
// get global ids to locate thread/team info (constant regardless of OMP)
int GetLogicalThreadIdInBlock();
int GetMasterThreadID();
int GetNumberOfWorkersInTeam();
// get OpenMP thread and team ids
int GetOmpThreadId(); // omp_thread_num
int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team
int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
int GetNumberOfOmpTeams(); // omp_num_teams
// get OpenMP number of procs
int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
// masters
int IsTeamMaster(int ompThreadId);
// Parallel level
void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////
// safe alloc and free
void *SafeMalloc(size_t size, const char *msg); // check if success
void *SafeFree(void *ptr, const char *msg);
// pad to an alignment (power of 2 only)
unsigned long PadBytes(unsigned long size, unsigned long alignment);
#define ADD_BYTES(_addr, _bytes) \
((void *)((char *)((void *)(_addr)) + (_bytes)))
#define SUB_BYTES(_addr, _bytes) \
((void *)((char *)((void *)(_addr)) - (_bytes)))
////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
unsigned int *GetTeamsReductionTimestamp();
char *GetTeamsReductionScratchpad();
// Invoke an outlined parallel function unwrapping global, shared arguments (up
// to 128).
void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
void **args, size_t nargs);
#endif

View File

@ -1,505 +0,0 @@
//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains all the definitions that are relevant to
// the interface. The first section contains the interface as
// declared by OpenMP. The second section includes the compiler
// specific interfaces.
//
//===----------------------------------------------------------------------===//
#ifndef _INTERFACES_H_
#define _INTERFACES_H_
#include <stddef.h>
#include <stdint.h>
#ifdef __AMDGCN__
#include "amdgcn/src/amdgcn_interface.h"
#endif
#ifdef __CUDACC__
#include "nvptx/src/nvptx_interface.h"
#endif
////////////////////////////////////////////////////////////////////////////////
// OpenMP interface
////////////////////////////////////////////////////////////////////////////////
typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */
typedef enum omp_sched_t {
omp_sched_static = 1, /* chunkSize >0 */
omp_sched_dynamic = 2, /* chunkSize >0 */
omp_sched_guided = 3, /* chunkSize >0 */
omp_sched_auto = 4, /* no chunkSize */
} omp_sched_t;
typedef enum omp_proc_bind_t {
omp_proc_bind_false = 0,
omp_proc_bind_true = 1,
omp_proc_bind_master = 2,
omp_proc_bind_close = 3,
omp_proc_bind_spread = 4
} omp_proc_bind_t;
EXTERN double omp_get_wtick(void);
EXTERN double omp_get_wtime(void);
EXTERN void omp_set_num_threads(int num);
EXTERN int omp_get_num_threads(void);
EXTERN int omp_get_max_threads(void);
EXTERN int omp_get_thread_limit(void);
EXTERN int omp_get_thread_num(void);
EXTERN int omp_get_num_procs(void);
EXTERN int omp_in_parallel(void);
EXTERN int omp_in_final(void);
EXTERN void omp_set_dynamic(int flag);
EXTERN int omp_get_dynamic(void);
EXTERN void omp_set_nested(int flag);
EXTERN int omp_get_nested(void);
EXTERN void omp_set_max_active_levels(int level);
EXTERN int omp_get_max_active_levels(void);
EXTERN int omp_get_level(void);
EXTERN int omp_get_active_level(void);
EXTERN int omp_get_ancestor_thread_num(int level);
EXTERN int omp_get_team_size(int level);
EXTERN void omp_init_lock(omp_lock_t *lock);
EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_destroy_lock(omp_lock_t *lock);
EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_set_lock(omp_lock_t *lock);
EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_unset_lock(omp_lock_t *lock);
EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock);
EXTERN int omp_test_lock(omp_lock_t *lock);
EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier);
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier);
EXTERN omp_proc_bind_t omp_get_proc_bind(void);
EXTERN int omp_get_cancellation(void);
EXTERN void omp_set_default_device(int deviceId);
EXTERN int omp_get_default_device(void);
EXTERN int omp_get_num_devices(void);
EXTERN int omp_get_num_teams(void);
EXTERN int omp_get_team_num(void);
EXTERN int omp_get_initial_device(void);
EXTERN int omp_get_max_task_priority(void);
EXTERN void *llvm_omp_get_dynamic_shared();
////////////////////////////////////////////////////////////////////////////////
// file below is swiped from kmpc host interface
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// kmp specific types
////////////////////////////////////////////////////////////////////////////////
typedef enum kmp_sched_t {
kmp_sched_static_chunk = 33,
kmp_sched_static_nochunk = 34,
kmp_sched_dynamic = 35,
kmp_sched_guided = 36,
kmp_sched_runtime = 37,
kmp_sched_auto = 38,
kmp_sched_static_balanced_chunk = 45,
kmp_sched_static_ordered = 65,
kmp_sched_static_nochunk_ordered = 66,
kmp_sched_dynamic_ordered = 67,
kmp_sched_guided_ordered = 68,
kmp_sched_runtime_ordered = 69,
kmp_sched_auto_ordered = 70,
kmp_sched_distr_static_chunk = 91,
kmp_sched_distr_static_nochunk = 92,
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
kmp_sched_default = kmp_sched_static_nochunk,
kmp_sched_unordered_first = kmp_sched_static_chunk,
kmp_sched_unordered_last = kmp_sched_auto,
kmp_sched_ordered_first = kmp_sched_static_ordered,
kmp_sched_ordered_last = kmp_sched_auto_ordered,
kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
kmp_sched_distribute_last =
kmp_sched_distr_static_chunk_sched_static_chunkone,
/* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
* Since we need to distinguish the three possible cases (no modifier,
* monotonic modifier, nonmonotonic modifier), we need separate bits for
* each modifier. The absence of monotonic does not imply nonmonotonic,
* especially since 4.5 says that the behaviour of the "no modifier" case
* is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
*
* Since we're passing a full 32 bit value, we can use a couple of high
* bits for these flags; out of paranoia we avoid the sign bit.
*
* These modifiers can be or-ed into non-static schedules by the compiler
* to pass the additional information. They will be stripped early in the
* processing in __kmp_dispatch_init when setting up schedules, so
* most of the code won't ever see schedules with these bits set.
*/
kmp_sched_modifier_monotonic = (1 << 29),
/**< Set if the monotonic schedule modifier was present */
kmp_sched_modifier_nonmonotonic = (1 << 30),
/**< Set if the nonmonotonic schedule modifier was present */
#define SCHEDULE_WITHOUT_MODIFIERS(s) \
(enum kmp_sched_t)( \
(s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
#define SCHEDULE_HAS_NONMONOTONIC(s) \
(((s)&kmp_sched_modifier_nonmonotonic) != 0)
#define SCHEDULE_HAS_NO_MODIFIERS(s) \
(((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
0)
} kmp_sched_t;
/*!
* Enum for accessing the reserved_2 field of the ident_t struct below.
*/
enum {
/*! Bit set to 1 when in SPMD mode. */
KMP_IDENT_SPMD_MODE = 0x01,
/*! Bit set to 1 when a simplified runtime is used. */
KMP_IDENT_SIMPLE_RT_MODE = 0x02,
};
/*!
* The ident structure that describes a source location.
* The struct is identical to the one in the kmp.h file.
* We maintain the same data structure for compatibility.
*/
typedef short kmp_int16;
typedef int kmp_int32;
typedef struct ident {
kmp_int32 reserved_1; /**< might be used in Fortran; see above */
kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
identifies this union member */
kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
char const *psource; /**< String describing the source location.
The string is composed of semi-colon separated fields
which describe the source file, the function and a pair
of line numbers that delimit the construct. */
} ident_t;
// parallel defs
typedef ident_t kmp_Ident;
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
int16_t lane_offset,
int16_t shortCircuit);
typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);
// task defs
typedef struct kmp_TaskDescr kmp_TaskDescr;
typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr);
typedef struct kmp_TaskDescr {
void *sharedPointerTable; // ptr to a table of shared var ptrs
kmp_TaskFctPtr sub; // task subroutine
int32_t partId; // unused
kmp_TaskFctPtr destructors; // destructor of c++ first private
} kmp_TaskDescr;
// sync defs
typedef int32_t kmp_CriticalName[8];
////////////////////////////////////////////////////////////////////////////////
// external interface
////////////////////////////////////////////////////////////////////////////////
// parallel
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
NOINLINE EXTERN uint8_t __kmpc_parallel_level();
// proc bind
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid,
int proc_bind);
EXTERN int omp_get_num_places(void);
EXTERN int omp_get_place_num_procs(int place_num);
EXTERN void omp_get_place_proc_ids(int place_num, int *ids);
EXTERN int omp_get_place_num(void);
EXTERN int omp_get_partition_num_places(void);
EXTERN void omp_get_partition_place_nums(int *place_nums);
// for static (no chunk or chunk)
EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
// distribute static (no chunk or chunk)
EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_4u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
int32_t chunk);
EXTERN
void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN
void __kmpc_for_static_init_8u_simple_generic(
kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1,
uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
int64_t chunk);
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid);
// for dynamic
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int32_t lower, int32_t upper,
int32_t incr, int32_t chunk);
EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, uint32_t lower,
uint32_t upper, int32_t incr,
int32_t chunk);
EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid,
int32_t sched, int64_t lower, int64_t upper,
int64_t incr, int64_t chunk);
EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid,
int32_t sched, uint64_t lower,
uint64_t upper, int64_t incr,
int64_t chunk);
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, int32_t *plower,
int32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, uint32_t *plower,
uint32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, int64_t *plower,
int64_t *pupper, int64_t *pstride);
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride);
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);
// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);
// single
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid);
// sync
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *crit);
EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *crit);
EXTERN void __kmpc_flush(kmp_Ident *loc);
// vote
EXTERN uint64_t __kmpc_warp_active_thread_mask(void);
// syncwarp
EXTERN void __kmpc_syncwarp(uint64_t);
// tasks
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid,
int32_t flag,
size_t sizeOfTaskInclPrivate,
size_t sizeOfSharedTable,
kmp_TaskFctPtr sub);
EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr);
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr,
int32_t depNum, void *depList,
int32_t noAliasDepNum,
void *noAliasDepList);
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
int32_t depNum, void *depList,
int32_t noAliasDepNum, void *noAliasDepList);
EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
int end_part);
EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr, int if_val,
uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
int32_t sched, uint64_t grainsize, void *task_dup);
// cancel
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);
// non standard
EXTERN int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
bool UseGenericStateMachine,
bool RequiresFullRuntime);
EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int32_t NumThreadsClause);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
EXTERN void __kmpc_end_sharing_variables();
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
/// Entry point to start a new parallel region.
///
/// \param ident The source identifier.
/// \param global_tid The global thread ID.
/// \param if_expr The if(expr), or 1 if none given.
/// \param num_threads The num_threads(expr), or -1 if none given.
/// \param proc_bind The proc_bind, or `proc_bind_default` if none given.
/// \param fn The outlined parallel region function.
/// \param wrapper_fn The worker wrapper function of fn.
/// \param args The pointer array of arguments to fn.
/// \param nargs The number of arguments to fn.
NOINLINE EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
kmp_int32 if_expr,
kmp_int32 num_threads, int proc_bind,
void *fn, void *wrapper_fn, void **args,
size_t nargs);
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode outside of a parallel region.
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);
/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode.
EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid);
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared, const void **res);
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
int16_t is_shared);
/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
/// called by any thread, allocation happens per-thread.
EXTERN void *__kmpc_alloc_shared(uint64_t Bytes);
/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
/// a stack (push/pop). Can be called by any thread. \p Ptr must be allocated by
/// __kmpc_alloc_shared by the same thread. \p Bytes contains the size of the
/// paired allocation to make memory management easier.
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes);
/// Get a pointer to the dynamic shared memory buffer in the device.
EXTERN void *__kmpc_get_dynamic_shared();
#endif

View File

@ -1,257 +0,0 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
#
##===----------------------------------------------------------------------===##
# By default we will not build the NVPTX deviceRTL on a CUDA-free system.
set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL
"Whether to build the NVPTX deviceRTL on a CUDA-free system.")
if (NOT LIBOMPTARGET_BUILD_NVPTX_BCLIB)
libomptarget_say("Not building NVPTX deviceRTL: Disabled by LIBOMPTARGET_BUILD_NVPTX_BCLIB")
return()
endif()
if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
return()
endif()
# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
# an LLVM linker.
set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
"Location of a CUDA compiler capable of emitting LLVM bitcode.")
set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
"Location of a linker capable of linking LLVM bitcode objects.")
if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING)
# Compile the deviceRTL with the clang that is built in the project.
set(cuda_compiler "$<TARGET_FILE:clang>")
elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
# Compile the device runtime with the compiler that OpenMP is built with.
# This is the case with LLVM_ENABLE_RUNTIMES=openmp.
# FIXME: This is unreliable; the compiler can be an older version of clang
# that does not support compiling CUDA, or supports only an older version of
# it. The risk is especially high on systems where clang is the default
# compiler (macOS, BSDs). LLVM_ENABLE_RUNTIMES=openmp should itself set
# LIBOMPTARGET_NVPTX_CUDA_COMPILER instead.
set(cuda_compiler ${CMAKE_C_COMPILER})
else()
libomptarget_say("Not building NVPTX deviceRTL: clang not found")
return()
endif()
# Get compiler directory to try to locate a suitable linker.
get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
set(llvm_link "${compiler_dir}/llvm-link")
if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
elseif (EXISTS ${llvm_link})
# Try to use the linker consistent with the CUDA compiler unless explicitly
# set to a different linker.
set(bc_linker ${llvm_link})
elseif (NOT OPENMP_STANDALONE_BUILD AND NOT CMAKE_CROSSCOMPILING)
# Use the linker also built in the same project.
set(bc_linker "$<TARGET_FILE:llvm-link>")
else()
libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
return()
endif()
# TODO: This part needs to be refined once libomptarget supports Windows!
# TODO: This part can also be removed if we can change the clang driver to make
# it support device only compilation.
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
set(aux_triple x86_64-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(aux_triple powerpc64le-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(aux_triple aarch64-unknown-linux-gnu)
else()
libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
return()
endif()
get_filename_component(devicertl_base_directory
${CMAKE_CURRENT_SOURCE_DIR}
DIRECTORY)
set(devicertl_common_directory
${devicertl_base_directory}/common)
set(devicertl_nvptx_directory
${devicertl_base_directory}/nvptx)
set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80 86)
set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING
"List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES)
if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all")
set(nvptx_sm_list ${all_capabilities})
elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto")
if (NOT LIBOMPTARGET_DEP_CUDA_FOUND)
libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.")
endif()
set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH})
else()
string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}")
endif()
# If the user sets LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable
# the build.
if (NOT nvptx_sm_list)
libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
return()
endif()
# Check all SM values
foreach(sm ${nvptx_sm_list})
if (NOT ${sm} IN_LIST all_capabilities)
libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.")
endif()
endforeach()
# Override default MAX_SM in src/target_impl.h if requested
if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
endif()
# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
"Activate NVPTX device RTL debug messages.")
if ("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using in-tree clang.")
else ()
libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using ${cuda_compiler}")
endif ()
set(cuda_src_files
${devicertl_common_directory}/src/cancel.cu
${devicertl_common_directory}/src/critical.cu
${devicertl_common_directory}/src/data_sharing.cu
${devicertl_common_directory}/src/libcall.cu
${devicertl_common_directory}/src/loop.cu
${devicertl_common_directory}/src/omp_data.cu
${devicertl_common_directory}/src/omptarget.cu
${devicertl_common_directory}/src/parallel.cu
${devicertl_common_directory}/src/reduction.cu
${devicertl_common_directory}/src/support.cu
${devicertl_common_directory}/src/sync.cu
${devicertl_common_directory}/src/task.cu
${devicertl_common_directory}/src/shuffle.cpp
src/target_impl.cu
)
# Prepend -I to each list element
set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I")
# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -O1 -std=c++14
-mllvm -openmp-opt-disable
-ffreestanding
-target nvptx64
-fvisibility=hidden
-Xclang -emit-llvm-bc
-Xclang -aux-triple -Xclang ${aux_triple}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-Xclang -target-feature -Xclang +ptx61
-D__CUDACC__
-I${devicertl_base_directory}
-I${devicertl_common_directory}/include
-I${devicertl_nvptx_directory}/src
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX})
if(${LIBOMPTARGET_NVPTX_DEBUG})
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g)
else()
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
endif()
# Create target to build all Bitcode libraries.
add_custom_target(omptarget-nvptx-bc)
# Generate a Bitcode library for all the compute capabilities the user requested
foreach(sm ${nvptx_sm_list})
set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
set(bc_files "")
foreach(src ${cuda_src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
set(outfile "${outfile}-sm_${sm}.bc")
add_custom_command(OUTPUT ${outfile}
COMMAND ${cuda_compiler} ${bc_flags}
${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
COMMENT "Building LLVM bitcode ${outfile}"
VERBATIM
)
if("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
# Add a file-level dependency to ensure that clang is up-to-date.
# By default, add_custom_command only builds clang if the
# executable is missing.
add_custom_command(OUTPUT ${outfile}
DEPENDS clang
APPEND
)
endif()
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
list(APPEND bc_files ${outfile})
endforeach()
set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
COMMAND ${bc_linker}
-o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
DEPENDS ${bc_files}
COMMENT "Linking LLVM bitcode ${bclib_name}"
)
if("${bc_linker}" STREQUAL "$<TARGET_FILE:llvm-link>")
# Add a file-level dependency to ensure that llvm-link is up-to-date.
# By default, add_custom_command only builds llvm-link if the
# executable is missing.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
DEPENDS llvm-link
APPEND
)
endif()
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")
add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
# Copy library to destination.
add_custom_command(TARGET ${bclib_target_name} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
${LIBOMPTARGET_LIBRARY_DIR})
# Install bitcode library under the lib destination folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
endforeach()
# Test will be enabled if the building machine supports CUDA
if (LIBOMPTARGET_DEP_CUDA_FOUND)
add_subdirectory(test)
endif()

View File

@ -1,523 +0,0 @@
**Design document for OpenMP reductions on the GPU**
//Abstract: //In this document we summarize the new design for an OpenMP
implementation of reductions on NVIDIA GPUs. This document comprises
* a succinct background review,
* an introduction to the decoupling of reduction algorithm and
data-structure-specific processing routines,
* detailed illustrations of reduction algorithms used and
* a brief overview of steps we have made beyond the last implementation.
**Problem Review**
Consider a typical OpenMP program with a reduction pragma.
```
double foo, bar;
#pragma omp parallel for reduction(+:foo, bar)
for (int i = 0; i < N; i++) {
foo+=A[i]; bar+=B[i];
}
```
where 'foo' and 'bar' are reduced across all threads in the parallel region.
Our primary goal is to aggregate the values of foo and bar efficiently, in
such a manner that
* keeps the compiler logically concise, and
* reduces efficiently within warps, thread blocks, and across the device.
**Introduction to Decoupling**
In this section we address the problem of making the compiler
//logically concise// by partitioning the task of reduction into two broad
categories: data-structure specific routines and algorithmic routines.
The previous reduction implementation was tightly coupled to the
specifics of the reduction element data structures (e.g., sizes, data
types) and the operators of the reduction (e.g., addition, multiplication). In
our implementation we strive to decouple them. In the final implementation,
we were able to remove all template functions from our runtime system.
The (simplified) pseudo code generated by LLVM is as follows:
```
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn,
interWarpCpyFn)
where:
struct ReduceData {
double *foo;
double *bar;
} reduceData
reduceData.foo = &foo_p
reduceData.bar = &bar_p
shuffleReduceFn and interWarpCpyFn are two auxiliary functions
generated to aid the runtime performing algorithmic steps
while being data-structure agnostic about ReduceData.
In particular, shuffleReduceFn is a function that takes the following
inputs:
a. local copy of ReduceData
b. its lane_id
c. the offset of the lane_id which hosts a remote ReduceData
relative to the current one
d. an algorithm version parameter determining which reduction
algorithm to use.
This shuffleReduceFn retrieves the remote ReduceData through shuffle
intrinsics and reduces, using the algorithm specified by the 4th
parameter, the local ReduceData element-wise with the remote ReduceData,
and places the resultant values into the local ReduceData.
Different reduction algorithms are implemented with different runtime
functions, but they all make calls to this same shuffleReduceFn to
perform the essential reduction step. Therefore, based on the 4th
parameter, this shuffleReduceFn will behave slightly differently to
cooperate with the runtime function to ensure correctness under
different circumstances.
InterWarpCpyFn, as the name suggests, is a function that copies data
across warps. Its purpose is to tunnel all the thread-private
ReduceData that is already reduced within a warp to a lane in the first
warp with minimal shared memory footprint. This is an essential step to
prepare for the last step of a block reduction.
(Warp, block, device level reduction routines that utilize these
auxiliary functions will be discussed in the next section.)
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
```
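To make the interplay concrete, the following is a hedged CUDA sketch of what a
compiler-generated shuffleReduceFn for the foo/bar ReduceData above might look
like. It is illustrative only, the names are hypothetical, and the branching on
the 4th parameter is elided (only the full-warp case, version 0, is shown).
```
// Hypothetical sketch, not the compiler's actual output: a generated
// shuffleReduceFn for ReduceData {foo, bar}, handling only version 0
// (full warp reduction).
struct ReduceData {
  double *foo;
  double *bar;
};

__device__ void shuffleReduceFn(ReduceData *local, short /*lane_id*/,
                                short offset, short /*algo_version*/) {
  // Retrieve the remote lane's elements via warp shuffles; CUDA 9+ can
  // shuffle doubles directly.
  double remote_foo = __shfl_down_sync(0xffffffffu, *local->foo, offset);
  double remote_bar = __shfl_down_sync(0xffffffffu, *local->bar, offset);
  // Combine element-wise into the local copy ('@' is '+' here).
  *local->foo += remote_foo;
  *local->bar += remote_bar;
}
```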
**Reduction Algorithms**
On the warp level, we have three versions of the algorithms:
1. Full Warp Reduction
```
gpu_regular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr ShuffleReduceFn) {
for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
ShuffleReduceFn(reduce_data, 0, offset, 0);
}
```
ShuffleReduceFn is used here with lane_id set to 0 because the lane_id is not
used; we therefore save instructions by not retrieving it from the
corresponding special registers. The 4th parameter, which represents the
version of the algorithm being used here, is set to 0 to signify full warp
reduction.
In this version (=0), the ShuffleReduceFn behaves, per element, as follows:
```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
reduce_elem = reduce_elem @ remote_elem;
```
An illustration of this algorithm operating on a hypothetical 8-lane full-warp
would be:
{F74}
The coloring invariant is that elements with the same color will be
combined and reduced in the next reduction step. As can be observed, no overhead
is present; exactly log(2, N) steps are needed.
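For reference, a minimal standalone CUDA sketch of this full-warp pattern
(assuming one double per lane and all 32 lanes active; this is not the
runtime's code) is:
```
// Minimal sketch: full-warp sum of one double per lane using shuffles,
// mirroring the log2(WARPSIZE) loop above. Lane 0 ends up with the result.
__device__ double warp_sum_full(double val) {
  for (int offset = 32 / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val;
}
```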
2. Contiguous Full Warp Reduction
```
gpu_irregular_warp_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
int lane_id) {
int curr_size;
int offset;
curr_size = size;
offset = curr_size/2;
while (offset>0) {
ShuffleReduceFn(reduce_data, lane_id, offset, 1);
curr_size = (curr_size+1)/2;
offset = curr_size/2;
}
}
```
In this version (=1), the ShuffleReduceFn behaves, per element, as
follows:
```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
if (lane_id < offset) {
reduce_elem = reduce_elem @ remote_elem
} else {
reduce_elem = remote_elem
}
```
An important invariant (also a restriction on the starting state of the
reduction) is that this algorithm assumes that all unused ReduceData are
located in a contiguous subset of threads in a warp starting from lane 0.
If there is a trailing active lane with an odd-numbered lane id, its value
will not be aggregated with any other lane. Therefore, in order to preserve
the invariant, such ReduceData is copied to the first lane whose thread-local
ReduceData has already been used in a previous reduction and would therefore
be useless otherwise.
An illustration of this algorithm operating on a hypothetical 8-lane partial
warp would be:
{F75}
As illustrated, this version of the algorithm introduces overhead to copy
data between lanes whenever we have an odd number of participating lanes in
any reduction step.
3. Dispersed Partial Warp Reduction
```
gpu_irregular_simt_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr ShuffleReduceFn) {
int size, remote_id;
int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
do {
remote_id = find_the_next_active_lane_id_right_after_me();
// the above function returns 0 if no active lane
// is present right after the current thread.
size = get_number_of_active_lanes_in_this_warp();
logical_lane_id /= 2;
ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
} while (logical_lane_id % 2 == 0 && size > 1);
}
```
There is no assumption made about the initial state of the reduction.
Any number of lanes (>=1) could be active at any position. The reduction
result is kept in the first active lane.
In this version (=2), the ShuffleReduceFn behaves, per element, as
follows:
```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
if (LaneId % 2 == 0 && Offset > 0) {
reduce_elem = reduce_elem @ remote_elem
} else {
reduce_elem = remote_elem
}
```
We will proceed with a brief explanation of some of the arguments passed in.
Note that this section introduces the concept of a logical_lane_id, which
must be distinguished from the physical lane_id as defined by NVIDIA.
1. //logical_lane_id//: as the name suggests, it refers to the calculated
lane_id (instead of the physical one defined by NVIDIA) that would make
our algorithm logically concise. A thread with logical_lane_id k means
there are (k-1) threads before it.
2. //remote_id-1-threadIdx.x//: remote_id is the NVIDIA-defined lane id of
the remote lane from which we will retrieve the ReduceData. We subtract
(threadIdx.x+1) from it because we would like to maintain only one
underlying shuffle intrinsic (which is used to communicate among lanes in a
warp). The particular shuffle intrinsic we use accepts only offsets, not
absolute lane ids. Therefore the subtraction converts the calculated
absolute lane_id into an offset.
This algorithm is slightly different in two ways, and it is not, conceptually,
a generalization of the above algorithms.
1. It reduces elements close to each other. For instance, values in the 0th
lane are to be combined with those of the 1st lane; values in the 2nd lane are
to be combined with those of the 3rd lane. We did not use the previous
algorithm, where the first half of the (partial) warp is reduced with the
second half of the (partial) warp. This is because the mapping
f(x): logical_lane_id -> physical_lane_id
can be easily calculated, whereas its inverse
f^-1(x): physical_lane_id -> logical_lane_id
cannot, and performing such a reduction requires the inverse to be known.
2. Because this algorithm is agnostic about the positions of the active lanes,
we do not need to perform the copying step as in the second algorithm.
An illustrative run would look like
{F76}
As observed, overhead is high because in each and every step of the reduction,
logical_lane_id is recalculated, and so is remote_id.
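As an aside, this bookkeeping can be expressed compactly with warp vote and
bit-count intrinsics. The following hedged CUDA sketch (a hypothetical helper,
not a runtime entry point, assuming a 1D thread layout) derives the logical
lane id and the next active lane from the current active mask:
```
// Hypothetical sketch of the dispersed-lane bookkeeping: count the active
// lanes before the current one to obtain a logical lane id, and find the
// physical id of the next active lane after the current one (0 if none).
__device__ void dispersed_lane_info(unsigned &logical_lane_id,
                                    int &next_active_lane) {
  unsigned mask = __activemask();               // currently active lanes
  unsigned lane = threadIdx.x % 32;             // physical lane id
  unsigned lower = mask & ((1u << lane) - 1);   // active lanes before me
  logical_lane_id = 2 * __popc(lower);          // as used by the algorithm
  unsigned higher = mask & ~((2u << lane) - 1); // active lanes after me
  next_active_lane = higher ? (__ffs(higher) - 1) : 0;
}
```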
On a block level, we have implemented the following block reduce algorithm:
```
gpu_irregular_block_reduce(void *reduce_data,
kmp_ShuffleReductFctPtr shuflReduceFn,
kmp_InterWarpCopyFctPtr interWarpCpyFn,
int size) {
int wid = threadIdx.x/WARPSIZE;
int lane_id = threadIdx.x%WARPSIZE;
int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
unsigned tnum = __ballot(1);
int thread_num = __popc(tnum);
//full warp reduction
if (thread_num == WARPSIZE) {
gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
}
//partial warp reduction
if (thread_num < WARPSIZE) {
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
lane_id);
}
//Gather all the reduced values from each warp
//to the first warp
//named_barrier inside this function to ensure
//correctness. It is effectively a sync_thread
//that won't deadlock.
interWarpCpyFn(reduce_data, warp_needed);
//This is to reduce data gathered from each "warp master".
if (wid==0) {
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
lane_id);
}
return;
}
```
In this function, no ShuffleReduceFn is called directly; instead, it calls
various versions of the warp-reduction functions. It first reduces
ReduceData warp by warp; in the end, we end up with a number of
ReduceData equal to the number of warps present in this thread
block. We then proceed to gather all such ReduceData to the first warp.
As observed, in this algorithm we make use of the function InterWarpCpyFn,
which copies data from each of the "warp masters" (the 0th lane of each warp,
where a warp-reduced ReduceData is held) to the 0th warp. This step reduces
(in a mathematical sense) the problem of reduction across warp masters in a
block to the problem of warp reduction, to which we already have solutions.
We can thus completely avoid the use of atomics to reduce in a threadblock.
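Putting the warp and inter-warp steps together, a simplified, self-contained
CUDA sketch of this structure (assuming a 1D block whose size is a multiple of
32, one double per thread, and summation; this is not the runtime's kmpc entry
point) looks like:
```
// Illustrative sketch of the warp-then-interwarp block reduction described
// above; thread 0 of the block ends up holding the block-wide sum.
__device__ double block_sum(double val) {
  __shared__ double warp_partials[32];  // one slot per "warp master"
  int lane = threadIdx.x % 32;
  int wid = threadIdx.x / 32;
  // Step 1: each warp reduces its own values with shuffles.
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  // Step 2: the warp masters publish their partial sums (the role played
  // by InterWarpCpyFn in the runtime, here via plain shared memory).
  if (lane == 0)
    warp_partials[wid] = val;
  __syncthreads();
  // Step 3: the first warp reduces the per-warp partial sums.
  int num_warps = (blockDim.x + 31) / 32;
  if (wid == 0) {
    val = (lane < num_warps) ? warp_partials[lane] : 0.0;
    for (int offset = 16; offset > 0; offset /= 2)
      val += __shfl_down_sync(0xffffffffu, val, offset);
  }
  return val;
}
```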
**Efficient Cross Block Reduce**
The next challenge is to reduce values across threadblocks. We aim to do this
without atomics or critical sections.
Let a kernel be started with TB threadblocks.
Let the GPU have S SMs.
There can be at most N active threadblocks per SM at any time.
Consider a threadblock tb (tb < TB) running on SM s (s < S). 'tb' is one of
at most 'N' active threadblocks on SM s. Let each threadblock active on an SM
be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id)
uniquely identifies an active threadblock on the GPU.
To implement cross-block reduction efficiently, we first allocate, for each
value to be reduced, an array of size S*N (the maximum number of threadblocks
active at any time on the device).
Each threadblock reduces its value to slot [s][id]. This can be done without
locking since no other threadblock can write to the same slot concurrently.
As a final stage, we reduce the values in the array as follows:
```
// Compiler generated wrapper function for each target region with a reduction
clause.
target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1
thread.
// Use dynamic parallelism to launch M teams, N threads as requested by the
user to execute the target region.
target_function<<M, N>>(map_args)
Reduce values in reduction_array
```
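For illustration, the per-block store into the scratch array might look like
the hedged CUDA sketch below. get_smid() reads the %smid register (as in the
NVPTX target layer), while instance_id stands in for the per-SM instance
bookkeeping, which is not shown here.
```
// Conceptual sketch of the lock-free slot write described above. Each
// active threadblock owns exactly one (SM, instance) slot, so no two
// blocks ever store to the same location concurrently.
__device__ unsigned get_smid() {
  unsigned id;
  asm("mov.u32 %0, %%smid;" : "=r"(id));
  return id;
}

__device__ void publish_block_result(double *scratch, int N, int instance_id,
                                     double block_result) {
  if (threadIdx.x == 0)
    scratch[get_smid() * N + instance_id] = block_result;  // slot [s][id]
}
```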
**Comparison with Last Version**
The (simplified) pseudo code generated by LLVM on the host is as follows:
```
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
where:
struct ReduceData {
double *foo;
double *bar;
} reduceData
reduceData.foo = &foo_p
reduceData.bar = &bar_p
reduceFn is a pointer to a function that takes in two inputs
of type ReduceData, "reduces" them element-wise, and places the
result in the first input:
reduceFn(ReduceData *a, ReduceData *b)
a = a @ b
Every thread in the parallel region calls kmpc_reduce_nowait with
its private copy of reduceData. The runtime reduces across the
threads (using tree reduction on the operator 'reduceFn') and stores
the final result in the master thread if successful.
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
5. else if ret == 2:
In this case kmpc_reduce_nowait() could not use tree reduction,
so use atomics instead:
each thread atomically writes to foo
each thread atomically writes to bar
```
On a GPU, a similar reduction may need to be performed across SIMT threads,
warps, and threadblocks. The challenge is to do so efficiently in a fashion
that is compatible with the LLVM OpenMP implementation.
In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs,
the salient steps of the code generated are as follows:
```
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
status = can_block_reduce()
if status == 1:
reduce efficiently to thread 0 using shuffles and shared memory.
return 1
else
cannot use efficient block reduction, fallback to atomics
return 2
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
5. else if ret == 2:
In this case kmpc_reduce_nowait() could not use tree reduction,
so use atomics instead:
each thread atomically writes to foo
each thread atomically writes to bar
```
The function can_block_reduce() is defined as follows:
```
int32_t can_block_reduce() {
int tid = GetThreadIdInTeam();
int nt = GetNumberOfOmpThreads(tid);
if (nt != blockDim.x)
return 0;
unsigned tnum = __ballot(1);
if (tnum != (~0x0)) {
return 0;
}
return 1;
}
```
This function permits the use of the efficient block reduction algorithm
using shuffles and shared memory (return 1) only if (a) all SIMT threads in
a warp are active (i.e., the number of threads in the parallel region is a
multiple of 32) and (b) the number of threads in the parallel region
(set by the num_threads clause) equals blockDim.x.
If either of these preconditions is not true, each thread in the threadblock
updates the global value using atomics.
Atomics and compare-and-swap operations are expensive on many-threaded
architectures such as GPUs, and we must avoid them completely.
**Appendix: Implementation Details**
```
// Compiler generated function.
reduceFn(ReduceData *a, ReduceData *b)
a->foo = a->foo + b->foo
a->bar = a->bar + b->bar
// Compiler generated function.
swapAndReduceFn(ReduceData *thread_private, int lane)
ReduceData *remote = new ReduceData()
remote->foo = shuffle_double(thread_private->foo, lane)
remote->bar = shuffle_double(thread_private->bar, lane)
reduceFn(thread_private, remote)
// OMP runtime function.
warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn):
offset = 16
while (offset > 0)
swapAndReduceFn(thread_private, offset)
offset /= 2
// OMP runtime function.
warpReduce_irregular():
...
// OMP runtime function.
kmpc_reduce_warp(reduceData, swapAndReduceFn)
if all_lanes_active:
warpReduce_regular(reduceData, swapAndReduceFn)
else:
warpReduce_irregular(reduceData, swapAndReduceFn)
if in_simd_region:
// all done, reduce to global in simd lane 0
return 1
else if in_parallel_region:
// done reducing to one value per warp, now reduce across warps
return 3
// OMP runtime function; one for each basic type.
kmpc_reduce_block_double(double *a)
if lane == 0:
shared[wid] = *a
named_barrier(1, num_threads)
if wid == 0
block_reduce(shared)
if lane == 0
*a = shared[0]
named_barrier(1, num_threads)
if wid == 0 and lane == 0
return 1 // write back reduced result
else
return 0 // don't do anything
```
```
// Compiler generated code.
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
to foo_p and bar_p respectively.
3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn)
4. if ret == 1:
The master thread stores the reduced result in the globals.
foo += reduceData.foo; bar += reduceData.bar
5. else if ret == 3:
ret = block_reduce_double(reduceData.foo)
if ret == 1:
foo += reduceData.foo
ret = block_reduce_double(reduceData.bar)
if ret == 1:
bar += reduceData.bar
```
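The shuffle_double helper used above hides the fact that warp shuffles
historically operated on 32-bit registers. A hedged CUDA sketch of one way to
implement it (shuffling a double as two 32-bit halves; modern CUDA can also
shuffle doubles directly) is:
```
// Sketch of shuffle_double: split the 64-bit value into two 32-bit halves,
// shuffle each half down by 'offset' lanes, and reassemble the double.
__device__ double shuffle_double_down(double val, int offset) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down_sync(0xffffffffu, lo, offset);
  hi = __shfl_down_sync(0xffffffffu, hi, offset);
  return __hiloint2double(hi, lo);
}
```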
**Notes**
1. This scheme requires that the CUDA OMP runtime can call LLVM-generated
functions. This functionality now works.
2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery
(including calls through function pointers) is optimized away.
3. If we are reducing multiple variables in a parallel region, the reduce
operations are all performed in warpReduce_[ir]regular(). This results in
more instructions in the loop and should result in fewer stalls due to data
dependencies. Unfortunately, we cannot do the same in
kmpc_reduce_block_double() without increasing shared memory usage.

View File

@ -1,17 +0,0 @@
//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _NVPTX_INTERFACE_H_
#define _NVPTX_INTERFACE_H_
#include <stdint.h>
#define EXTERN extern "C"
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
#endif

View File

@ -1,198 +0,0 @@
//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/debug.h"
#include "target_impl.h"
#include "target_interface.h"
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
uint64_t val;
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
return val;
}
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
__kmpc_impl_lanemask_t res;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
return res;
}
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
__kmpc_impl_lanemask_t res;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
return res;
}
EXTERN uint32_t __kmpc_impl_smid() {
uint32_t id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
return id;
}
EXTERN double __kmpc_impl_get_wtick() {
// Timer precision is 1ns
return ((double)1E-9);
}
EXTERN double __kmpc_impl_get_wtime() {
unsigned long long nsecs;
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
return (double)nsecs * __kmpc_impl_get_wtick();
}
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
unsigned int Mask;
asm volatile("activemask.b32 %0;" : "=r"(Mask));
return Mask;
}
EXTERN void __kmpc_impl_syncthreads() {
int barrier = 2;
asm volatile("barrier.sync %0;"
:
: "r"(barrier)
: "memory");
}
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
__nvvm_bar_warp_sync(Mask);
}
// NVPTX specific kernel initialization
EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
}
// Barrier until num_threads arrive.
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
int barrier = 1;
asm volatile("barrier.sync %0, %1;"
:
: "r"(barrier), "r"(num_threads)
: "memory");
}
EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
// Calls to the NVPTX layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block() {
return __nvvm_read_ptx_sreg_tid_x();
}
EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
EXTERN int __kmpc_get_hardware_num_blocks() {
return __nvvm_read_ptx_sreg_nctaid_x();
}
EXTERN int __kmpc_get_hardware_num_threads_in_block() {
return __nvvm_read_ptx_sreg_ntid_x();
}
EXTERN unsigned __kmpc_get_warp_size() { return WARPSIZE; }
EXTERN unsigned GetWarpId() {
return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
}
EXTERN unsigned GetLaneId() {
return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
}
// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
return __nvvm_atom_inc_gen_ui(Address, Val);
}
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}
uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
uint32_t R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
(void)__atomic_compare_exchange(Address, &Compare, &Val, false,
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
return Compare;
}
unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
unsigned long long Val) {
unsigned long long R;
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
return R;
}
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
unsigned long long Val) {
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
#define __OMP_SPIN 1000
#define UNSET 0u
#define SET 1u
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
}
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
__kmpc_impl_unset_lock(lock);
}
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
// TODO: not sure spinning is a good idea here..
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
int32_t start = __nvvm_read_ptx_sreg_clock();
int32_t now;
for (;;) {
now = __nvvm_read_ptx_sreg_clock();
int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
break;
}
}
} // wait for 0 to be the read value
}
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
(void)__kmpc_atomic_exchange(lock, UNSET);
}
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
return __kmpc_atomic_add(lock, 0u);
}
extern "C" {
void *malloc(size_t);
void free(void *);
int32_t vprintf(const char *, void *);
}
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
EXTERN void __kmpc_impl_free(void *x) { free(x); }
EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
uint32_t) {
return vprintf(Format, Arguments);
}
#pragma omp end declare target

View File

@ -1,89 +0,0 @@
//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target specific functions
//
//===----------------------------------------------------------------------===//
#ifndef _TARGET_IMPL_H_
#define _TARGET_IMPL_H_
#include "nvptx_interface.h"
#include <stddef.h>
#include <stdint.h>
// subset of inttypes.h
#define PRId64 "ld"
#define PRIu64 "lu"
typedef uint32_t __kmpc_impl_lanemask_t;
#define INLINE inline __attribute__((always_inline))
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
#define PLUGIN_ACCESSIBLE /* no annotation needed for cuda plugin */
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
INLINE constexpr const llvm::omp::GV &getGridValue() {
return llvm::omp::NVPTXGridValues;
}
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL
// max number of threads per team
enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
enum { WARPSIZE = getGridValue().GV_Warp_Size };
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#else
#define OMP_STATE_COUNT 16
#endif
#if !defined(MAX_SM)
#if __CUDA_ARCH__ >= 900
#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
#elif __CUDA_ARCH__ >= 800
// The GA100 design has a maximum of 128 SMs, but the A100 product only has 108 SMs.
// The GA102 design has a maximum of 84 SMs.
#define MAX_SM 108
#elif __CUDA_ARCH__ >= 700
#define MAX_SM 84
#elif __CUDA_ARCH__ >= 600
#define MAX_SM 56
#else
#define MAX_SM 16
#endif
#endif
#define OMP_ACTIVE_PARALLEL_LEVEL 128
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The size reserved for data in a shared memory slot.
DS_Slot_Size = getGridValue().GV_Slot_Size,
// The slot size that should be reserved for a working warp.
DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
// The maximum number of warps in use
DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
};
enum : __kmpc_impl_lanemask_t {
__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
};
#define printf(...)
#endif

View File

@ -1,25 +0,0 @@
if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang")
# Silently return, no need to annoy the user.
return()
endif()
set(deps omptarget omp)
if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB)
set(deps ${deps} omptarget-nvptx-bc)
endif()
# Run with only one thread to launch only one application to the GPU at a time.
add_openmp_testsuite(check-libomptarget-nvptx
"Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR}
EXCLUDE_FROM_CHECK_ALL
DEPENDS ${deps} ARGS -j1)
set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING
"Extra compiler flags to send to the test compiler.")
set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS
"-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING
"OpenMP compiler flags to use for testing libomptarget-nvptx.")
# Configure the lit.site.cfg.in file
set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!")
configure_file(lit.site.cfg.in lit.site.cfg @ONLY)

View File

@ -1,22 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(){
int max_threads = -1;
int num_threads = -1;
#pragma omp target map(tofrom: max_threads)
max_threads = omp_get_max_threads();
#pragma omp target parallel map(tofrom: num_threads)
{
#pragma omp master
num_threads = omp_get_num_threads();
}
// CHECK: Max Threads: 128, Num Threads: 128
printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads);
return 0;
}

View File

@ -1,38 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
const int MaxThreads = 1024;
int main(int argc, char *argv[]) {
int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1;
#pragma omp target map(cancellation, dynamic, nested, maxActiveLevels)
{
// libomptarget-nvptx doesn't support cancellation.
cancellation = omp_get_cancellation();
// No support for dynamic adjustment of the number of threads.
omp_set_dynamic(1);
dynamic = omp_get_dynamic();
// libomptarget-nvptx doesn't support nested parallelism.
omp_set_nested(1);
nested = omp_get_nested();
omp_set_max_active_levels(42);
maxActiveLevels = omp_get_max_active_levels();
}
// CHECK: cancellation = 0
printf("cancellation = %d\n", cancellation);
// CHECK: dynamic = 0
printf("dynamic = %d\n", dynamic);
// CHECK: nested = 0
printf("nested = %d\n", nested);
// CHECK: maxActiveLevels = 1
printf("maxActiveLevels = %d\n", maxActiveLevels);
return 0;
}

View File

@ -1,53 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int MaxThreadsL1 = -1, MaxThreadsL2 = -1;
#pragma omp declare reduction(unique:int \
: omp_out = (omp_in == 1 ? omp_in : omp_out)) \
initializer(omp_priv = -1)
// Non-SPMD mode.
#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \
num_teams(1)
{
MaxThreadsL1 = omp_get_max_threads();
#pragma omp parallel reduction(unique : MaxThreadsL2)
{ MaxThreadsL2 = omp_get_max_threads(); }
}
//FIXME: This Non-SPMD kernel will have 32 active threads due to
// thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of
// threads in block (64 in this case), which translates to worker
// threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD
// kernels. According to the spec, omp_get_max_threads must return the
// max active threads possible between the two kernel types.
// CHECK: Non-SPMD MaxThreadsL1 = 64
printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1);
// CHECK: Non-SPMD MaxThreadsL2 = 1
printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2);
// SPMD mode with full runtime
MaxThreadsL2 = -1;
#pragma omp target parallel reduction(unique : MaxThreadsL2)
{ MaxThreadsL2 = omp_get_max_threads(); }
// CHECK: SPMD with full runtime MaxThreadsL2 = 1
printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
// SPMD mode without runtime
MaxThreadsL2 = -1;
#pragma omp target parallel for reduction(unique : MaxThreadsL2)
for (int I = 0; I < 2; ++I) {
MaxThreadsL2 = omp_get_max_threads();
}
// CHECK: SPMD without runtime MaxThreadsL2 = 1
printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
return 0;
}

View File

@ -1,72 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1;
#pragma omp declare reduction(unique64:int \
: omp_out = (omp_in == 64 ? omp_in : omp_out)) \
initializer(omp_priv = -1)
#pragma omp declare reduction(unique32:int \
: omp_out = (omp_in == 32 ? omp_in : omp_out)) \
initializer(omp_priv = -1)
// Non-SPMD mode.
#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \
thread_limit(64) num_teams(1)
{
ThreadLimitL0 = omp_get_thread_limit();
#pragma omp parallel reduction(unique64 \
: ThreadLimitL1, ThreadLimitL2) num_threads(32)
{
ThreadLimitL1 = omp_get_thread_limit();
#pragma omp parallel reduction(unique64 : ThreadLimitL2)
{ ThreadLimitL2 = omp_get_thread_limit(); }
}
}
// CHECK: Non-SPMD ThreadLimitL0 = 64
printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0);
// CHECK: Non-SPMD ThreadLimitL1 = 64
printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1);
// CHECK: Non-SPMD ThreadLimitL2 = 64
printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2);
// SPMD mode with full runtime
ThreadLimitL1 = -1;
ThreadLimitL2 = -1;
#pragma omp target parallel reduction(unique32 \
: ThreadLimitL1, ThreadLimitL2) \
num_threads(32)
{
ThreadLimitL1 = omp_get_thread_limit();
#pragma omp parallel reduction(unique32 : ThreadLimitL2)
{ ThreadLimitL2 = omp_get_thread_limit(); }
}
// CHECK: SPMD with full runtime ThreadLimitL1 = 32
printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
// CHECK: SPMD with full runtime ThreadLimitL2 = 32
printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
// SPMD mode without runtime
ThreadLimitL1 = -1;
ThreadLimitL2 = -1;
#pragma omp target parallel for reduction(unique32 \
: ThreadLimitL1, ThreadLimitL2) \
num_threads(32)
for (int I = 0; I < 2; ++I) {
ThreadLimitL1 = omp_get_thread_limit();
#pragma omp parallel reduction(unique32 : ThreadLimitL2)
{ ThreadLimitL2 = omp_get_thread_limit(); }
}
// CHECK: SPMD without runtime ThreadLimitL1 = 32
printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
// CHECK: SPMD without runtime ThreadLimitL2 = 32
printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
return 0;
}

View File

@ -1,55 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
#pragma omp declare target
static void putValueInParallel(int *ptr, int value) {
#pragma omp parallel
{
*ptr = value;
}
}
static int getId() {
int id;
putValueInParallel(&id, omp_get_thread_num());
return id;
}
#pragma omp end declare target
const int MaxThreads = 1024;
const int Threads = 64;
int main(int argc, char *argv[]) {
int master;
int check[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check[i] = 0;
}
#pragma omp target map(master, check[:])
{
master = getId();
#pragma omp parallel num_threads(Threads)
{
check[omp_get_thread_num()] = getId();
}
}
// CHECK: master = 0.
printf("master = %d.\n", master);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
if (i < Threads) {
if (check[i] != i) {
printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]);
}
} else if (check[i] != 0) {
printf("invalid: check[%d] should be 0, is %d\n", i, check[i]);
}
}
return 0;
}

View File

@ -1,76 +0,0 @@
# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
# Configuration file for the 'lit' test runner.
import os
import lit.formats
# Tell pylint that we know config and lit_config exist somewhere.
if 'PYLINT_IMPORT' in os.environ:
config = object()
lit_config = object()
def prepend_library_path(name, value, sep):
if name in config.environment:
config.environment[name] = value + sep + config.environment[name]
else:
config.environment[name] = value
# name: The name of this test suite.
config.name = 'libomptarget-nvptx'
# suffixes: A list of file extensions to treat as test files.
config.suffixes = ['.c', '.cpp', '.cc']
# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)
# test_exec_root: The root object directory where output is placed
config.test_exec_root = config.binary_dir
# test format
config.test_format = lit.formats.ShTest()
# compiler flags
config.test_flags = " -I " + config.omp_header_directory + \
" -L " + config.library_dir
if config.omp_host_rtl_directory:
config.test_flags = config.test_flags + \
" -L " + config.omp_host_rtl_directory
config.test_flags = config.test_flags + " " + config.test_extra_flags
# Setup environment to find dynamic library at runtime.
prepend_library_path('LIBRARY_PATH', config.library_dir, ":")
prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":")
if config.cuda_libdir:
prepend_library_path('LD_LIBRARY_PATH', config.cuda_libdir, ":")
# Forbid fallback to host.
config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY"
# substitutions
config.substitutions.append(("%compilexx-run-and-check",
"%compilexx-and-run | " + config.libomptarget_filecheck + " %s"))
config.substitutions.append(("%compile-run-and-check",
"%compile-and-run | " + config.libomptarget_filecheck + " %s"))
config.substitutions.append(("%compilexx-and-run", "%compilexx && %run"))
config.substitutions.append(("%compile-and-run", "%compile && %run"))
config.substitutions.append(("%compilexx",
"%clangxx %openmp_flags %cuda_flags %flags %s -o %t"))
config.substitutions.append(("%compile",
"%clang %openmp_flags %cuda_flags %flags %s -o %t"))
config.substitutions.append(("%clangxx", config.test_cxx_compiler))
config.substitutions.append(("%clang", config.test_c_compiler))
config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
if config.cuda_path:
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
else:
config.substitutions.append(("%cuda_flags", ""))
config.substitutions.append(("%flags", config.test_flags))
config.substitutions.append(("%run", "%t"))
config.substitutions.append(("%not", config.libomptarget_not))

View File

@ -1,17 +0,0 @@
@AUTO_GEN_COMMENT@
config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@"
config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@"
config.cuda_path = "@CUDA_TOOLKIT_ROOT_DIR@"
config.cuda_libdir = "@CUDA_LIBDIR@"
config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@"
config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
# Let the main config do the real work.
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")

View File

@ -1,37 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int data, out, flag = 0;
#pragma omp target teams num_teams(2) map(tofrom \
: out) map(to \
: data, flag) \
thread_limit(1)
#pragma omp parallel num_threads(1)
{
if (omp_get_team_num() == 0) {
/* Write to the data buffer that will be read by thread in team 1 */
data = 42;
/* Flush data to thread in team 1 */
#pragma omp barrier
/* Set flag to release thread in team 1 */
#pragma omp atomic write
flag = 1;
} else if (omp_get_team_num() == 1) {
/* Loop until we see the update to the flag */
int val;
do {
#pragma omp atomic read
val = flag;
} while (val < 1);
out = data;
#pragma omp barrier
}
}
// CHECK: out=42.
/* Value of out will be 42 */
printf("out=%d.\n", out);
return !(out == 42);
}

View File

@ -1,35 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main(int argc, char *argv[]) {
int data, out, flag = 0;
#pragma omp target parallel num_threads(64) map(tofrom \
: out, flag) map(to \
: data)
{
if (omp_get_thread_num() == 0) {
/* Write to the data buffer that will be read by thread */
data = 42;
/* Flush data to thread 32 */
#pragma omp flush(data)
/* Set flag to release thread 32 */
#pragma omp atomic write
flag = 1;
} else if (omp_get_thread_num() == 32) {
/* Loop until we see the update to the flag */
int val;
do {
#pragma omp atomic read
val = flag;
} while (val < 1);
out = data;
#pragma omp flush(out)
}
}
// CHECK: out=42.
/* Value of out will be 42 */
printf("out=%d.\n", out);
return !(out == 42);
}

View File

@ -1,151 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
const int MaxThreads = 1024;
const int NumThreads = 64;
int main(int argc, char *argv[]) {
int level = -1, activeLevel = -1;
// The expected value is -1, initialize to different value.
int ancestorTNumNeg = 1, teamSizeNeg = 1;
int ancestorTNum0 = -1, teamSize0 = -1;
// The expected value is -1, initialize to different value.
int ancestorTNum1 = 1, teamSize1 = 1;
int check1[MaxThreads];
int check2[MaxThreads];
int check3[MaxThreads];
int check4[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = check3[i] = check4[i] = 0;
}
#pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \
map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \
map(check1[:], check2[:], check3[:], check4[:])
{
level = omp_get_level();
activeLevel = omp_get_active_level();
// Expected to return -1.
ancestorTNumNeg = omp_get_ancestor_thread_num(-1);
teamSizeNeg = omp_get_team_size(-1);
// Expected to return 0 and 1.
ancestorTNum0 = omp_get_ancestor_thread_num(0);
teamSize0 = omp_get_team_size(0);
// Expected to return -1 because the requested level is larger than
// the nest level.
ancestorTNum1 = omp_get_ancestor_thread_num(1);
teamSize1 = omp_get_team_size(1);
// Expecting active parallel region.
#pragma omp parallel num_threads(NumThreads)
{
int id = omp_get_thread_num();
// Multiply the return value of omp_get_level by 5 to avoid having this test
// pass if both API calls return wrong values.
check1[id] += omp_get_level() * 5 + omp_get_active_level();
// Expected to return 0 and 1.
check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
// Expected to return the current thread num.
check2[id] += (omp_get_ancestor_thread_num(1) - id);
// Expected to return the current number of threads.
check2[id] += 3 * omp_get_team_size(1);
// Expected to return -1, see above.
check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2);
// Expecting serialized parallel region.
#pragma omp parallel
{
#pragma omp atomic
check3[id] += omp_get_level() * 5 + omp_get_active_level();
// Expected to return 0 and 1.
int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
// Expected to return the parent thread num.
check4Inc += (omp_get_ancestor_thread_num(1) - id);
// Expected to return the number of threads in the active parallel region.
check4Inc += 3 * omp_get_team_size(1);
// Expected to return 0 and 1.
check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2);
// Expected to return -1, see above.
check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3);
#pragma omp atomic
check4[id] += check4Inc;
}
}
}
// CHECK: target: level = 0, activeLevel = 0
printf("target: level = %d, activeLevel = %d\n", level, activeLevel);
// CHECK: level = -1: ancestorTNum = -1, teamSize = -1
printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg);
// CHECK: level = 0: ancestorTNum = 0, teamSize = 1
printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0);
// CHECK: level = 1: ancestorTNum = -1, teamSize = -1
printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
// Check active parallel region:
// omp_get_level() = 1, omp_get_active_level() = 1
const int Expected1 = 6;
if (i < NumThreads) {
if (check1[i] != Expected1) {
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
// 5 * 1 + 3 * 64 - 1 - 1 (see above)
const int Expected2 = 195;
if (i < NumThreads) {
if (check2[i] != Expected2) {
printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
// Check serialized parallel region:
// omp_get_level() = 2, omp_get_active_level() = 1
const int Expected3 = 11;
if (i < NumThreads) {
if (check3[i] != Expected3) {
printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]);
}
} else if (check3[i] != 0) {
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
}
// 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above)
const int Expected4 = 198;
if (i < NumThreads) {
if (check4[i] != Expected4) {
printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]);
}
} else if (check4[i] != 0) {
printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
}
}
// Check the parallel level in non-SPMD kernels.
level = 0;
#pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
for (int i=0; i<5032; i+=32) {
int ub = (i+32 > 5032) ? 5032 : i+32;
#pragma omp parallel for schedule(dynamic)
for (int j=i ; j < ub; j++) ;
level += omp_get_level();
}
// CHECK: Integral level = 0.
printf("Integral level = %d.\n", level);
return 0;
}

View File

@ -1,136 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
const int MaxThreads = 1024;
const int NumThreads = 64;
const int NumThreads1 = 1;
int main(int argc, char *argv[]) {
int inParallel = -1, numThreads = -1, threadNum = -1;
int check1[MaxThreads];
int check2[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = 0;
}
#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
{
inParallel = omp_in_parallel();
numThreads = omp_get_num_threads();
threadNum = omp_get_thread_num();
// Expecting active parallel region.
#pragma omp parallel num_threads(NumThreads)
{
int id = omp_get_thread_num();
check1[id] += omp_get_num_threads() + omp_in_parallel();
// Expecting serialized parallel region.
#pragma omp parallel
{
// Expected to be 1.
int nestedInParallel = omp_in_parallel();
// Expected to be 1.
int nestedNumThreads = omp_get_num_threads();
// Expected to be 0.
int nestedThreadNum = omp_get_thread_num();
#pragma omp atomic
check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
}
}
}
// CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
inParallel, numThreads, threadNum);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
// Check that all threads reported
// omp_get_num_threads() = 64, omp_in_parallel() = 1.
int Expected = NumThreads + 1;
if (i < NumThreads) {
if (check1[i] != Expected) {
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
// Check serialized parallel region.
if (i < NumThreads) {
if (check2[i] != 2) {
printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
}
inParallel = -1;
numThreads = -1;
threadNum = -1;
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = 0;
}
#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
{
inParallel = omp_in_parallel();
numThreads = omp_get_num_threads();
threadNum = omp_get_thread_num();
// Expecting active parallel region.
#pragma omp parallel num_threads(NumThreads1)
{
int id = omp_get_thread_num();
check1[id] += omp_get_num_threads() + omp_in_parallel();
// Expecting serialized parallel region.
#pragma omp parallel
{
// Expected to be 0.
int nestedInParallel = omp_in_parallel();
// Expected to be 1.
int nestedNumThreads = omp_get_num_threads();
// Expected to be 0.
int nestedThreadNum = omp_get_thread_num();
#pragma omp atomic
check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
}
}
}
// CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
inParallel, numThreads, threadNum);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
// Check that all threads reported
// omp_get_num_threads() = 1, omp_in_parallel() = 0.
int Expected = 1;
if (i < NumThreads1) {
if (check1[i] != Expected) {
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
// Check serialized parallel region.
if (i < NumThreads1) {
if (check2[i] != 1) {
printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
}
return 0;
}

View File

@ -1,102 +0,0 @@
// RUN: %compile-run-and-check
#include <stdio.h>
#include <omp.h>
const int WarpSize = 32;
const int NumThreads1 = 1 * WarpSize;
const int NumThreads2 = 2 * WarpSize;
const int NumThreads3 = 3 * WarpSize;
const int MaxThreads = 1024;
int main(int argc, char *argv[]) {
int check1[MaxThreads];
int check2[MaxThreads];
int check3[MaxThreads];
int check4[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = check3[i] = check4[i] = 0;
}
int maxThreads1 = -1;
int maxThreads2 = -1;
int maxThreads3 = -1;
#pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
map(maxThreads1, maxThreads2, maxThreads3)
{
#pragma omp parallel num_threads(NumThreads1)
{
check1[omp_get_thread_num()] += omp_get_num_threads();
}
// API method to set number of threads in parallel regions without
// num_threads() clause.
omp_set_num_threads(NumThreads2);
maxThreads1 = omp_get_max_threads();
#pragma omp parallel
{
check2[omp_get_thread_num()] += omp_get_num_threads();
}
maxThreads2 = omp_get_max_threads();
// num_threads() clause should override nthreads-var ICV.
#pragma omp parallel num_threads(NumThreads3)
{
check3[omp_get_thread_num()] += omp_get_num_threads();
}
maxThreads3 = omp_get_max_threads();
// Effect from omp_set_num_threads() should still be visible.
#pragma omp parallel
{
check4[omp_get_thread_num()] += omp_get_num_threads();
}
}
// CHECK: maxThreads1 = 64
printf("maxThreads1 = %d\n", maxThreads1);
// CHECK: maxThreads2 = 64
printf("maxThreads2 = %d\n", maxThreads2);
// CHECK: maxThreads3 = 64
printf("maxThreads3 = %d\n", maxThreads3);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
if (i < NumThreads1) {
if (check1[i] != NumThreads1) {
printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
if (i < NumThreads2) {
if (check2[i] != NumThreads2) {
printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
if (i < NumThreads3) {
if (check3[i] != NumThreads3) {
printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
}
} else if (check3[i] != 0) {
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
}
if (i < NumThreads2) {
if (check4[i] != NumThreads2) {
printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
}
} else if (check4[i] != 0) {
printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
}
}
return 0;
}

View File

@ -1,51 +0,0 @@
// RUN: %compilexx-run-and-check
#include <stdio.h>
#include <omp.h>
int main(void) {
int isHost = -1;
int ParallelLevel1 = -1, ParallelLevel2 = -1;
int Count = 0;
#pragma omp target parallel for map(tofrom \
: isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
for (int J = 0; J < 10; ++J) {
#pragma omp critical
{
isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
? omp_get_level()
: ParallelLevel1;
}
if (omp_get_thread_num() > 5) {
int L2;
#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
for (int I = 0; I < 10; ++I) {
L2 = omp_get_level();
Count += omp_get_level(); // (10-6)*10*2 = 80
}
#pragma omp critical
ParallelLevel2 =
(ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
} else {
Count += omp_get_level(); // 6 * 1 = 6
}
}
if (isHost < 0) {
printf("Runtime error, isHost=%d\n", isHost);
}
// CHECK: Target region executed on the device
printf("Target region executed on the %s\n", isHost ? "host" : "device");
// CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
ParallelLevel2);
// Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
// level) + 6(num of iterations) * 1(par level)
// CHECK: Expected count = 86
printf("Expected count = %d\n", Count);
return isHost;
}

View File

@ -1,77 +0,0 @@
// RUN: %compile-run-and-check
#include <stdio.h>
#include <omp.h>
const int WarpSize = 32;
const int ThreadLimit = 1 * WarpSize;
const int NumThreads2 = 2 * WarpSize;
const int NumThreads3 = 3 * WarpSize;
const int MaxThreads = 1024;
int main(int argc, char *argv[]) {
int check1[MaxThreads];
int check2[MaxThreads];
int check3[MaxThreads];
for (int i = 0; i < MaxThreads; i++) {
check1[i] = check2[i] = check3[i] = 0;
}
int threadLimit = -1;
#pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \
map(check1[:], check2[:], check3[:], threadLimit)
{
threadLimit = omp_get_thread_limit();
// All parallel regions should get as many threads as specified by the
// thread_limit() clause.
#pragma omp parallel
{
check1[omp_get_thread_num()] += omp_get_num_threads();
}
omp_set_num_threads(NumThreads2);
#pragma omp parallel
{
check2[omp_get_thread_num()] += omp_get_num_threads();
}
#pragma omp parallel num_threads(NumThreads3)
{
check3[omp_get_thread_num()] += omp_get_num_threads();
}
}
// CHECK: threadLimit = 32
printf("threadLimit = %d\n", threadLimit);
// CHECK-NOT: invalid
for (int i = 0; i < MaxThreads; i++) {
if (i < ThreadLimit) {
if (check1[i] != ThreadLimit) {
printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]);
}
} else if (check1[i] != 0) {
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
}
if (i < ThreadLimit) {
if (check2[i] != ThreadLimit) {
printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]);
}
} else if (check2[i] != 0) {
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
}
if (i < ThreadLimit) {
if (check3[i] != ThreadLimit) {
printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]);
}
} else if (check3[i] != 0) {
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
}
}
return 0;
}
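
All three check arrays above compare against ThreadLimit because, inside the target teams region, every parallel region is clamped to the thread_limit(32) in effect, whether the request comes from omp_set_num_threads(64) or an explicit num_threads(96) clause. A minimal sketch of that clamping rule; the helper name is ours, not a runtime symbol:

#include <stdio.h>

// Illustrative only: the team size a parallel region receives is the
// requested size clamped to the enclosing thread_limit.
static int clamped_team_size(int requested, int thread_limit) {
  return requested < thread_limit ? requested : thread_limit;
}

int main(void) {
  printf("%d\n", clamped_team_size(32, 32)); // check1: default region fills the limit -> 32
  printf("%d\n", clamped_team_size(64, 32)); // check2: omp_set_num_threads(64) -> 32
  printf("%d\n", clamped_team_size(96, 32)); // check3: num_threads(96) -> 32
  return 0;
}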

View File

@ -1,22 +0,0 @@
// RUN: %compile-run-and-check
#include <omp.h>
#include <stdio.h>
int main() {
int res = 0;
#pragma omp parallel num_threads(2) reduction(+:res)
{
int tid = omp_get_thread_num();
#pragma omp target teams distribute reduction(+:res)
for (int i = tid; i < 2; i++)
++res;
}
// The first thread executes 2 iterations and the second executes 1, so the
// expected result of the reduction res is 3 (see the sketch after this file).
// CHECK: res = 3.
printf("res = %d.\n", res);
return 0;
}
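
The expected value follows directly from the two outer threads' iteration ranges: thread 0 covers i = 0 and 1, thread 1 covers only i = 1. A minimal host-only sketch of that arithmetic, given as an illustration rather than part of the deleted test:

#include <stdio.h>

int main(void) {
  int res = 0;
  for (int tid = 0; tid < 2; ++tid)   // the two threads of the outer region
    for (int i = tid; i < 2; ++i)     // each thread's share of the target loop
      ++res;
  printf("res = %d.\n", res); // prints 3 = 2 + 1
  return 0;
}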

View File

@ -1,78 +0,0 @@
//===------------- target_interface.h - Target interfaces --------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains interfaces that must be implemented by each target.
//
//===----------------------------------------------------------------------===//
#ifndef _OMPTARGET_TARGET_INTERFACE_H_
#define _OMPTARGET_TARGET_INTERFACE_H_
#include <stdint.h>
#include "DeviceEnvironment.h"
#include "target_impl.h"
// Calls to the NVPTX layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block();
EXTERN int GetBlockIdInKernel();
EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
EXTERN unsigned __kmpc_get_warp_size();
EXTERN unsigned GetWarpId();
EXTERN unsigned GetLaneId();
// Atomics
uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned long long __kmpc_atomic_exchange(unsigned long long *,
unsigned long long);
unsigned long long __kmpc_atomic_add(unsigned long long *, unsigned long long);
// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_threadfence();
EXTERN void __kmpc_impl_threadfence_block();
EXTERN void __kmpc_impl_threadfence_system();
EXTERN double __kmpc_impl_get_wtick();
EXTERN double __kmpc_impl_get_wtime();
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
EXTERN uint32_t __kmpc_impl_smid();
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
EXTERN void __kmpc_impl_syncthreads();
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
// Kernel initialization
EXTERN void __kmpc_impl_target_init();
// Memory
EXTERN void *__kmpc_impl_malloc(size_t);
EXTERN void __kmpc_impl_free(void *);
// Barrier until num_threads arrive.
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads);
extern DeviceEnvironmentTy omptarget_device_environment;
#endif // _OMPTARGET_TARGET_INTERFACE_H_
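
The deleted header only declares the interface; each target supplied its own definitions elsewhere. As a rough illustration of the contract, and not the actual NVPTX or AMDGPU code, an atomic add matching one of the declarations above could be written with a compiler builtin:

#include <stdint.h>

// Illustrative only: returns the previous value, as atomic fetch-add does,
// using a GCC/Clang builtin rather than any real target's implementation.
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}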

View File

@ -118,6 +118,6 @@ if (${amdgpu_arch_result})
libomptarget_say("Not generating amdgcn test targets as amdgpu-arch exited with ${amdgpu_arch_result}")
else()
# Report to the parent scope that we are building a plugin for amdgpu
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa-newRTL " PARENT_SCOPE)
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa " PARENT_SCOPE)
endif()

View File

@ -72,7 +72,7 @@ target_link_libraries(omptarget.rtl.cuda
# Otherwise this plugin is being built speculatively and there may be no cuda available
if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
libomptarget_say("Enable tests using CUDA plugin")
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-newRTL nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
else()
libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available")
endif()

View File

@ -1,4 +1,4 @@
// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
// RUN: %libomptarget-compile-nvptx64-nvidia-cuda
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
// REQUIRES: nvptx64-nvidia-cuda

View File

@ -104,17 +104,11 @@ else: # Unices
config.test_flags += " --libomptarget-amdgcn-bc-path=" + config.library_dir
if config.libomptarget_current_target.startswith('nvptx'):
config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir
if config.libomptarget_current_target.endswith('-newRTL'):
config.test_flags += " -fopenmp-target-new-runtime"
elif not config.libomptarget_current_target.endswith('-newDriver'):
config.test_flags += " -fno-openmp-target-new-runtime"
if config.libomptarget_current_target.endswith('-newDriver'):
config.test_flags += " -fopenmp-new-driver"
def remove_newRTL_suffix_if_present(name):
if name.endswith('-newRTL'):
return name[:-7]
elif name.endswith('-newDriver'):
def remove_suffix_if_present(name):
if name.endswith('-newDriver'):
return name[:-10]
else:
return name
@ -183,10 +177,10 @@ for libomptarget_target in config.libomptarget_all_targets:
"%not --crash %t"))
config.substitutions.append(("%clangxx-" + libomptarget_target, \
"%clangxx %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
remove_newRTL_suffix_if_present(libomptarget_target)))
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%clang-" + libomptarget_target, \
"%clang %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
remove_newRTL_suffix_if_present(libomptarget_target)))
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%fcheck-" + libomptarget_target, \
config.libomptarget_filecheck + " %s"))
else:

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <cstdio>
#include <cstdlib>

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <cstdio>
#include <cstdlib>

View File

@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <stdint.h>

View File

@ -1,7 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
// Error on the gpu that crashes the host
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa
#include <iostream>

View File

@ -3,7 +3,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <omp.h>
#include <stdio.h>

View File

@ -2,7 +2,6 @@
// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver
#include <iostream>

View File

@ -2,7 +2,6 @@
// Currently hangs on amdgpu
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: x86_64-pc-linux-gnu
#include <cassert>

View File

@ -34,7 +34,6 @@
// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver
#if ADD_REDUCTION

View File

@ -2,7 +2,6 @@
// Fails in DAGToDAG on an address space problem
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <cmath>
#include <cstdio>

View File

@ -9,7 +9,6 @@
// amdgpu does not have a working printf definition
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <omp.h>

View File

@ -1,11 +1,10 @@
// RUN: %libomptarget-compile-run-and-check-generic
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newRTL
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newDriver
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
// XFAIL: amdgcn-amd-amdhsa-newDriver
#include <stdio.h>

View File

@ -5,7 +5,6 @@
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <omp.h>
#include <stdio.h>

View File

@ -5,7 +5,6 @@
// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <omp.h>

View File

@ -4,7 +4,6 @@
// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <omp.h>