[OpenCL] Mark group functions as convergent in opencl-c.h

Certain OpenCL builtin functions are supposed to be executed by all threads in a work group or sub group. Such functions should not be made divergent during transformation. It makes sense to mark them with convergent attribute. The adding of convergent attribute is based on Ettore Speziale's work and the original proposal and patch can be found at https://www.mail-archive.com/cfe-commits@lists.llvm.org/msg22271.html. Differential Revision: https://reviews.llvm.org/D25343 llvm-svn: 285725
2016-11-01 18:45:32 +00:00 · 2016-11-01 18:45:32 +00:00 · 7d07ae7c85
parent cb578f84e0
commit 7d07ae7c85
7 changed files with 332 additions and 163 deletions
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@ -1026,6 +1026,12 @@ def NoDuplicate : InheritableAttr {
  let Documentation = [NoDuplicateDocs];
 }

+def Convergent : InheritableAttr {
+  let Spellings = [GNU<"convergent">, CXX11<"clang", "convergent">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [ConvergentDocs];
+}
+
 def NoInline : InheritableAttr {
  let Spellings = [GCC<"noinline">, Declspec<"noinline">];
  let Subjects = SubjectList<[Function]>;
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@ -606,6 +606,33 @@ of the condition.
  }];
 }

+def ConvergentDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``convergent`` attribute can be placed on a function declaration. It is
+translated into the LLVM ``convergent`` attribute, which indicates that the call
+instructions of a function with this attribute cannot be made control-dependent
+on any additional values.
+
+In languages designed for SPMD/SIMT programming model, e.g. OpenCL or CUDA,
+the call instructions of a function with this attribute must be executed by
+all work items or threads in a work group or sub group.
+
+This attribute is different from ``noduplicate`` because it allows duplicating
+function calls if it can be proved that the duplicated function calls are
+not made control-dependent on any additional values, e.g., unrolling a loop
+executed by all work items.
+
+Sample usage:
+.. code-block:: c
+
+  void convfunc(void) __attribute__((convergent));
+  // Setting it as a C++11 attribute is also valid in a C++ program.
+  // void convfunc(void) [[clang::convergent]];
+
+  }];
+}
+
 def NoSplitStackDocs : Documentation {
  let Category = DocCatFunction;
  let Content = [{
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@ -1648,6 +1648,8 @@ void CodeGenModule::ConstructAttributeList(
      FuncAttrs.addAttribute(llvm::Attribute::NoReturn);
    if (TargetDecl->hasAttr<NoDuplicateAttr>())
      FuncAttrs.addAttribute(llvm::Attribute::NoDuplicate);
+    if (TargetDecl->hasAttr<ConvergentAttr>())
+      FuncAttrs.addAttribute(llvm::Attribute::Convergent);

    if (const FunctionDecl *Fn = dyn_cast<FunctionDecl>(TargetDecl)) {
      AddAttributesFromFunctionProtoType(
--- a/clang/lib/Headers/opencl-c.h
+++ b/clang/lib/Headers/opencl-c.h
@ -17,6 +17,7 @@
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0

 #define __ovld __attribute__((overloadable))
+#define __conv __attribute__((convergent))

 // Optimizations
 #define __purefn __attribute__((pure))
@ -13822,7 +13823,7 @@ typedef uint cl_mem_fence_flags;
 * image objects and then want to read the updated data.
 */

-void __ovld barrier(cl_mem_fence_flags flags);
+void __ovld __conv barrier(cl_mem_fence_flags flags);

 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0

@ -13835,8 +13836,8 @@ typedef enum memory_scope
  memory_scope_sub_group
 } memory_scope;

-void __ovld work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
-void __ovld work_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0

 // OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@ -16568,101 +16569,101 @@ int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
 // OpenCL v2.0 s6.13.15 - Work-group Functions

 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-int __ovld work_group_all(int predicate);
-int __ovld work_group_any(int predicate);
+int __ovld __conv work_group_all(int predicate);
+int __ovld __conv work_group_any(int predicate);

 #ifdef cl_khr_fp16
-half __ovld work_group_broadcast(half a, size_t local_id);
-half __ovld work_group_broadcast(half a, size_t x, size_t y);
-half __ovld work_group_broadcast(half a, size_t x, size_t y, size_t z);
+half __ovld __conv work_group_broadcast(half a, size_t local_id);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z);
 #endif
-int __ovld work_group_broadcast(int a, size_t local_id);
-int __ovld work_group_broadcast(int a, size_t x, size_t y);
-int __ovld work_group_broadcast(int a, size_t x, size_t y, size_t z);
-uint __ovld work_group_broadcast(uint a, size_t local_id);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y, size_t z);
-long __ovld work_group_broadcast(long a, size_t local_id);
-long __ovld work_group_broadcast(long a, size_t x, size_t y);
-long __ovld work_group_broadcast(long a, size_t x, size_t y, size_t z);
-ulong __ovld work_group_broadcast(ulong a, size_t local_id);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
-float __ovld work_group_broadcast(float a, size_t local_id);
-float __ovld work_group_broadcast(float a, size_t x, size_t y);
-float __ovld work_group_broadcast(float a, size_t x, size_t y, size_t z);
+int __ovld __conv work_group_broadcast(int a, size_t local_id);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z);
+uint __ovld __conv work_group_broadcast(uint a, size_t local_id);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z);
+long __ovld __conv work_group_broadcast(long a, size_t local_id);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
+float __ovld __conv work_group_broadcast(float a, size_t local_id);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z);
 #ifdef cl_khr_fp64
-double __ovld work_group_broadcast(double a, size_t local_id);
-double __ovld work_group_broadcast(double a, size_t x, size_t y);
-double __ovld work_group_broadcast(double a, size_t x, size_t y, size_t z);
+double __ovld __conv work_group_broadcast(double a, size_t local_id);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z);
 #endif //cl_khr_fp64

 #ifdef cl_khr_fp16
-half __ovld work_group_reduce_add(half x);
-half __ovld work_group_reduce_min(half x);
-half __ovld work_group_reduce_max(half x);
-half __ovld work_group_scan_exclusive_add(half x);
-half __ovld work_group_scan_exclusive_min(half x);
-half __ovld work_group_scan_exclusive_max(half x);
-half __ovld work_group_scan_inclusive_add(half x);
-half __ovld work_group_scan_inclusive_min(half x);
-half __ovld work_group_scan_inclusive_max(half x);
+half __ovld __conv work_group_reduce_add(half x);
+half __ovld __conv work_group_reduce_min(half x);
+half __ovld __conv work_group_reduce_max(half x);
+half __ovld __conv work_group_scan_exclusive_add(half x);
+half __ovld __conv work_group_scan_exclusive_min(half x);
+half __ovld __conv work_group_scan_exclusive_max(half x);
+half __ovld __conv work_group_scan_inclusive_add(half x);
+half __ovld __conv work_group_scan_inclusive_min(half x);
+half __ovld __conv work_group_scan_inclusive_max(half x);
 #endif
-int __ovld work_group_reduce_add(int x);
-int __ovld work_group_reduce_min(int x);
-int __ovld work_group_reduce_max(int x);
-int __ovld work_group_scan_exclusive_add(int x);
-int __ovld work_group_scan_exclusive_min(int x);
-int __ovld work_group_scan_exclusive_max(int x);
-int __ovld work_group_scan_inclusive_add(int x);
-int __ovld work_group_scan_inclusive_min(int x);
-int __ovld work_group_scan_inclusive_max(int x);
-uint __ovld work_group_reduce_add(uint x);
-uint __ovld work_group_reduce_min(uint x);
-uint __ovld work_group_reduce_max(uint x);
-uint __ovld work_group_scan_exclusive_add(uint x);
-uint __ovld work_group_scan_exclusive_min(uint x);
-uint __ovld work_group_scan_exclusive_max(uint x);
-uint __ovld work_group_scan_inclusive_add(uint x);
-uint __ovld work_group_scan_inclusive_min(uint x);
-uint __ovld work_group_scan_inclusive_max(uint x);
-long __ovld work_group_reduce_add(long x);
-long __ovld work_group_reduce_min(long x);
-long __ovld work_group_reduce_max(long x);
-long __ovld work_group_scan_exclusive_add(long x);
-long __ovld work_group_scan_exclusive_min(long x);
-long __ovld work_group_scan_exclusive_max(long x);
-long __ovld work_group_scan_inclusive_add(long x);
-long __ovld work_group_scan_inclusive_min(long x);
-long __ovld work_group_scan_inclusive_max(long x);
-ulong __ovld work_group_reduce_add(ulong x);
-ulong __ovld work_group_reduce_min(ulong x);
-ulong __ovld work_group_reduce_max(ulong x);
-ulong __ovld work_group_scan_exclusive_add(ulong x);
-ulong __ovld work_group_scan_exclusive_min(ulong x);
-ulong __ovld work_group_scan_exclusive_max(ulong x);
-ulong __ovld work_group_scan_inclusive_add(ulong x);
-ulong __ovld work_group_scan_inclusive_min(ulong x);
-ulong __ovld work_group_scan_inclusive_max(ulong x);
-float __ovld work_group_reduce_add(float x);
-float __ovld work_group_reduce_min(float x);
-float __ovld work_group_reduce_max(float x);
-float __ovld work_group_scan_exclusive_add(float x);
-float __ovld work_group_scan_exclusive_min(float x);
-float __ovld work_group_scan_exclusive_max(float x);
-float __ovld work_group_scan_inclusive_add(float x);
-float __ovld work_group_scan_inclusive_min(float x);
-float __ovld work_group_scan_inclusive_max(float x);
+int __ovld __conv work_group_reduce_add(int x);
+int __ovld __conv work_group_reduce_min(int x);
+int __ovld __conv work_group_reduce_max(int x);
+int __ovld __conv work_group_scan_exclusive_add(int x);
+int __ovld __conv work_group_scan_exclusive_min(int x);
+int __ovld __conv work_group_scan_exclusive_max(int x);
+int __ovld __conv work_group_scan_inclusive_add(int x);
+int __ovld __conv work_group_scan_inclusive_min(int x);
+int __ovld __conv work_group_scan_inclusive_max(int x);
+uint __ovld __conv work_group_reduce_add(uint x);
+uint __ovld __conv work_group_reduce_min(uint x);
+uint __ovld __conv work_group_reduce_max(uint x);
+uint __ovld __conv work_group_scan_exclusive_add(uint x);
+uint __ovld __conv work_group_scan_exclusive_min(uint x);
+uint __ovld __conv work_group_scan_exclusive_max(uint x);
+uint __ovld __conv work_group_scan_inclusive_add(uint x);
+uint __ovld __conv work_group_scan_inclusive_min(uint x);
+uint __ovld __conv work_group_scan_inclusive_max(uint x);
+long __ovld __conv work_group_reduce_add(long x);
+long __ovld __conv work_group_reduce_min(long x);
+long __ovld __conv work_group_reduce_max(long x);
+long __ovld __conv work_group_scan_exclusive_add(long x);
+long __ovld __conv work_group_scan_exclusive_min(long x);
+long __ovld __conv work_group_scan_exclusive_max(long x);
+long __ovld __conv work_group_scan_inclusive_add(long x);
+long __ovld __conv work_group_scan_inclusive_min(long x);
+long __ovld __conv work_group_scan_inclusive_max(long x);
+ulong __ovld __conv work_group_reduce_add(ulong x);
+ulong __ovld __conv work_group_reduce_min(ulong x);
+ulong __ovld __conv work_group_reduce_max(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_max(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_max(ulong x);
+float __ovld __conv work_group_reduce_add(float x);
+float __ovld __conv work_group_reduce_min(float x);
+float __ovld __conv work_group_reduce_max(float x);
+float __ovld __conv work_group_scan_exclusive_add(float x);
+float __ovld __conv work_group_scan_exclusive_min(float x);
+float __ovld __conv work_group_scan_exclusive_max(float x);
+float __ovld __conv work_group_scan_inclusive_add(float x);
+float __ovld __conv work_group_scan_inclusive_min(float x);
+float __ovld __conv work_group_scan_inclusive_max(float x);
 #ifdef cl_khr_fp64
-double __ovld work_group_reduce_add(double x);
-double __ovld work_group_reduce_min(double x);
-double __ovld work_group_reduce_max(double x);
-double __ovld work_group_scan_exclusive_add(double x);
-double __ovld work_group_scan_exclusive_min(double x);
-double __ovld work_group_scan_exclusive_max(double x);
-double __ovld work_group_scan_inclusive_add(double x);
-double __ovld work_group_scan_inclusive_min(double x);
-double __ovld work_group_scan_inclusive_max(double x);
+double __ovld __conv work_group_reduce_add(double x);
+double __ovld __conv work_group_reduce_min(double x);
+double __ovld __conv work_group_reduce_max(double x);
+double __ovld __conv work_group_scan_exclusive_add(double x);
+double __ovld __conv work_group_scan_exclusive_min(double x);
+double __ovld __conv work_group_scan_exclusive_max(double x);
+double __ovld __conv work_group_scan_inclusive_add(double x);
+double __ovld __conv work_group_scan_inclusive_min(double x);
+double __ovld __conv work_group_scan_inclusive_max(double x);
 #endif //cl_khr_fp64

 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@ -16762,92 +16763,92 @@ uint    __ovld get_enqueued_num_sub_groups(void);
 uint    __ovld get_sub_group_id(void);
 uint    __ovld get_sub_group_local_id(void);

-void    __ovld sub_group_barrier(cl_mem_fence_flags flags);
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-void    __ovld sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0

-int     __ovld sub_group_all(int predicate);
-int     __ovld sub_group_any(int predicate);
+int     __ovld __conv sub_group_all(int predicate);
+int     __ovld __conv sub_group_any(int predicate);

-int     __ovld sub_group_broadcast(int   x, uint sub_group_local_id);
-uint    __ovld sub_group_broadcast(uint  x, uint sub_group_local_id);
-long    __ovld sub_group_broadcast(long  x, uint sub_group_local_id);
-ulong   __ovld sub_group_broadcast(ulong x, uint sub_group_local_id);
-float   __ovld sub_group_broadcast(float x, uint sub_group_local_id);
+int     __ovld __conv sub_group_broadcast(int   x, uint sub_group_local_id);
+uint    __ovld __conv sub_group_broadcast(uint  x, uint sub_group_local_id);
+long    __ovld __conv sub_group_broadcast(long  x, uint sub_group_local_id);
+ulong   __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id);
+float   __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id);

-int     __ovld sub_group_reduce_add(int   x);
-uint    __ovld sub_group_reduce_add(uint  x);
-long    __ovld sub_group_reduce_add(long  x);
-ulong   __ovld sub_group_reduce_add(ulong x);
-float   __ovld sub_group_reduce_add(float x);
-int     __ovld sub_group_reduce_min(int   x);
-uint    __ovld sub_group_reduce_min(uint  x);
-long    __ovld sub_group_reduce_min(long  x);
-ulong   __ovld sub_group_reduce_min(ulong x);
-float   __ovld sub_group_reduce_min(float x);
-int     __ovld sub_group_reduce_max(int   x);
-uint    __ovld sub_group_reduce_max(uint  x);
-long    __ovld sub_group_reduce_max(long  x);
-ulong   __ovld sub_group_reduce_max(ulong x);
-float   __ovld sub_group_reduce_max(float x);
+int     __ovld __conv sub_group_reduce_add(int   x);
+uint    __ovld __conv sub_group_reduce_add(uint  x);
+long    __ovld __conv sub_group_reduce_add(long  x);
+ulong   __ovld __conv sub_group_reduce_add(ulong x);
+float   __ovld __conv sub_group_reduce_add(float x);
+int     __ovld __conv sub_group_reduce_min(int   x);
+uint    __ovld __conv sub_group_reduce_min(uint  x);
+long    __ovld __conv sub_group_reduce_min(long  x);
+ulong   __ovld __conv sub_group_reduce_min(ulong x);
+float   __ovld __conv sub_group_reduce_min(float x);
+int     __ovld __conv sub_group_reduce_max(int   x);
+uint    __ovld __conv sub_group_reduce_max(uint  x);
+long    __ovld __conv sub_group_reduce_max(long  x);
+ulong   __ovld __conv sub_group_reduce_max(ulong x);
+float   __ovld __conv sub_group_reduce_max(float x);

-int     __ovld sub_group_scan_exclusive_add(int   x);
-uint    __ovld sub_group_scan_exclusive_add(uint  x);
-long    __ovld sub_group_scan_exclusive_add(long  x);
-ulong   __ovld sub_group_scan_exclusive_add(ulong x);
-float   __ovld sub_group_scan_exclusive_add(float x);
-int     __ovld sub_group_scan_exclusive_min(int   x);
-uint    __ovld sub_group_scan_exclusive_min(uint  x);
-long    __ovld sub_group_scan_exclusive_min(long  x);
-ulong   __ovld sub_group_scan_exclusive_min(ulong x);
-float   __ovld sub_group_scan_exclusive_min(float x);
-int     __ovld sub_group_scan_exclusive_max(int   x);
-uint    __ovld sub_group_scan_exclusive_max(uint  x);
-long    __ovld sub_group_scan_exclusive_max(long  x);
-ulong   __ovld sub_group_scan_exclusive_max(ulong x);
-float   __ovld sub_group_scan_exclusive_max(float x);
+int     __ovld __conv sub_group_scan_exclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_add(float x);
+int     __ovld __conv sub_group_scan_exclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_min(float x);
+int     __ovld __conv sub_group_scan_exclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_max(float x);

-int     __ovld sub_group_scan_inclusive_add(int   x);
-uint    __ovld sub_group_scan_inclusive_add(uint  x);
-long    __ovld sub_group_scan_inclusive_add(long  x);
-ulong   __ovld sub_group_scan_inclusive_add(ulong x);
-float   __ovld sub_group_scan_inclusive_add(float x);
-int     __ovld sub_group_scan_inclusive_min(int   x);
-uint    __ovld sub_group_scan_inclusive_min(uint  x);
-long    __ovld sub_group_scan_inclusive_min(long  x);
-ulong   __ovld sub_group_scan_inclusive_min(ulong x);
-float   __ovld sub_group_scan_inclusive_min(float x);
-int     __ovld sub_group_scan_inclusive_max(int   x);
-uint    __ovld sub_group_scan_inclusive_max(uint  x);
-long    __ovld sub_group_scan_inclusive_max(long  x);
-ulong   __ovld sub_group_scan_inclusive_max(ulong x);
-float   __ovld sub_group_scan_inclusive_max(float x);
+int     __ovld __conv sub_group_scan_inclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_add(float x);
+int     __ovld __conv sub_group_scan_inclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_min(float x);
+int     __ovld __conv sub_group_scan_inclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_max(float x);

 #ifdef cl_khr_fp16
-half    __ovld sub_group_broadcast(half x, uint sub_group_local_id);
-half    __ovld sub_group_reduce_add(half x);
-half    __ovld sub_group_reduce_min(half x);
-half    __ovld sub_group_reduce_max(half x);
-half    __ovld sub_group_scan_exclusive_add(half x);
-half    __ovld sub_group_scan_exclusive_min(half x);
-half    __ovld sub_group_scan_exclusive_max(half x);
-half    __ovld sub_group_scan_inclusive_add(half x);
-half    __ovld sub_group_scan_inclusive_min(half x);
-half    __ovld sub_group_scan_inclusive_max(half x);
+half    __ovld __conv sub_group_broadcast(half x, uint sub_group_local_id);
+half    __ovld __conv sub_group_reduce_add(half x);
+half    __ovld __conv sub_group_reduce_min(half x);
+half    __ovld __conv sub_group_reduce_max(half x);
+half    __ovld __conv sub_group_scan_exclusive_add(half x);
+half    __ovld __conv sub_group_scan_exclusive_min(half x);
+half    __ovld __conv sub_group_scan_exclusive_max(half x);
+half    __ovld __conv sub_group_scan_inclusive_add(half x);
+half    __ovld __conv sub_group_scan_inclusive_min(half x);
+half    __ovld __conv sub_group_scan_inclusive_max(half x);
 #endif //cl_khr_fp16

 #ifdef cl_khr_fp64
-double  __ovld sub_group_broadcast(double x, uint sub_group_local_id);
-double  __ovld sub_group_reduce_add(double x);
-double  __ovld sub_group_reduce_min(double x);
-double  __ovld sub_group_reduce_max(double x);
-double  __ovld sub_group_scan_exclusive_add(double x);
-double  __ovld sub_group_scan_exclusive_min(double x);
-double  __ovld sub_group_scan_exclusive_max(double x);
-double  __ovld sub_group_scan_inclusive_add(double x);
-double  __ovld sub_group_scan_inclusive_min(double x);
-double  __ovld sub_group_scan_inclusive_max(double x);
+double  __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id);
+double  __ovld __conv sub_group_reduce_add(double x);
+double  __ovld __conv sub_group_reduce_min(double x);
+double  __ovld __conv sub_group_reduce_max(double x);
+double  __ovld __conv sub_group_scan_exclusive_add(double x);
+double  __ovld __conv sub_group_scan_exclusive_min(double x);
+double  __ovld __conv sub_group_scan_exclusive_max(double x);
+double  __ovld __conv sub_group_scan_inclusive_add(double x);
+double  __ovld __conv sub_group_scan_inclusive_min(double x);
+double  __ovld __conv sub_group_scan_inclusive_max(double x);
 #endif //cl_khr_fp64

 #endif //cl_khr_subgroups cl_intel_subgroups
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@ -5857,6 +5857,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
  case AttributeList::AT_NoDuplicate:
    handleSimpleAttribute<NoDuplicateAttr>(S, D, Attr);
    break;
+  case AttributeList::AT_Convergent:
+    handleSimpleAttribute<ConvergentAttr>(S, D, Attr);
+    break;
  case AttributeList::AT_NoInline:
    handleSimpleAttribute<NoInlineAttr>(S, D, Attr);
    break;
--- a/clang/test/CodeGenOpenCL/convergent.cl
+++ b/clang/test/CodeGenOpenCL/convergent.cl
@ -0,0 +1,118 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+void convfun(void) __attribute__((convergent));
+void non_convfun(void);
+void nodupfun(void) __attribute__((noduplicate));
+
+void f(void);
+void g(void);
+
+// Test two if's are merged and non_convfun duplicated.
+// The LLVM IR is equivalent to:
+//    if (a) {
+//      f();
+//      non_convfun();
+//      g();
+//    } else {
+//      non_convfun();
+//    }
+//
+// CHECK: define spir_func void @test_merge_if(i32 %[[a:.+]])
+// CHECK: %[[tobool:.+]] = icmp eq i32 %[[a]], 0
+// CHECK: br i1 %[[tobool]], label %[[if_end3_critedge:.+]], label %[[if_then:.+]]
+// CHECK: [[if_then]]:
+// CHECK: tail call spir_func void @f()
+// CHECK: tail call spir_func void @non_convfun()
+// CHECK: tail call spir_func void @g()
+// CHECK: br label %[[if_end3:.+]]
+// CHECK: [[if_end3_critedge]]:
+// CHECK: tail call spir_func void @non_convfun()
+// CHECK: br label %[[if_end3]]
+// CHECK: [[if_end3]]:
+// CHECK-LABEL: ret void
+
+void test_merge_if(int a) {
+  if (a) {
+    f();
+  }
+  non_convfun();
+  if (a) {
+    g();
+  }
+}
+
+// CHECK-DAG: declare spir_func void @f()
+// CHECK-DAG: declare spir_func void @non_convfun()
+// CHECK-DAG: declare spir_func void @g()
+
+// Test two if's are not merged.
+// CHECK: define spir_func void @test_no_merge_if(i32 %[[a:.+]])
+// CHECK:  %[[tobool:.+]] = icmp eq i32 %[[a]], 0
+// CHECK: br i1 %[[tobool]], label %[[if_end:.+]], label %[[if_then:.+]]
+// CHECK: [[if_then]]:
+// CHECK: tail call spir_func void @f()
+// CHECK-NOT: call spir_func void @convfun()
+// CHECK-NOT: call spir_func void @g()
+// CHECK: br label %[[if_end]]
+// CHECK: [[if_end]]:
+// CHECK:  %[[tobool_pr:.+]] = phi i1 [ true, %[[if_then]] ], [ false, %{{.+}} ]
+// CHECK:  tail call spir_func void @convfun() #[[attr5:.+]]
+// CHECK:  br i1 %[[tobool_pr]], label %[[if_then2:.+]], label %[[if_end3:.+]]
+// CHECK: [[if_then2]]:
+// CHECK: tail call spir_func void @g()
+// CHECK:  br label %[[if_end3:.+]]
+// CHECK: [[if_end3]]:
+// CHECK-LABEL:  ret void
+
+void test_no_merge_if(int a) {
+  if (a) {
+    f();
+  }
+  convfun();
+  if(a) {
+    g();
+  }
+}
+
+// CHECK: declare spir_func void @convfun(){{[^#]*}} #[[attr2:[0-9]+]]
+
+// Test loop is unrolled for convergent function.
+// CHECK-LABEL: define spir_func void @test_unroll()
+// CHECK:  tail call spir_func void @convfun() #[[attr5:[0-9]+]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK:  tail call spir_func void @convfun() #[[attr5]]
+// CHECK-LABEL:  ret void
+
+void test_unroll() {
+  for (int i = 0; i < 10; i++)
+    convfun();
+}
+
+// Test loop is not unrolled for noduplicate function.
+// CHECK-LABEL: define spir_func void @test_not_unroll()
+// CHECK:  br label %[[for_body:.+]]
+// CHECK: [[for_cond_cleanup:.+]]:
+// CHECK:  ret void
+// CHECK: [[for_body]]:
+// CHECK:  tail call spir_func void @nodupfun() #[[attr6:[0-9]+]]
+// CHECK-NOT: call spir_func void @nodupfun()
+// CHECK:  br i1 %{{.+}}, label %[[for_body]], label %[[for_cond_cleanup]]
+
+void test_not_unroll() {
+  for (int i = 0; i < 10; i++)
+    nodupfun();
+}
+
+// CHECK: declare spir_func void @nodupfun(){{[^#]*}} #[[attr3:[0-9]+]]
+
+// CHECK-DAG: attributes #[[attr2]] = { {{[^}]*}}convergent{{[^}]*}} }
+// CHECK-DAG: attributes #[[attr3]] = { {{[^}]*}}noduplicate{{[^}]*}} }
+// CHECK-DAG: attributes #[[attr5]] = { {{[^}]*}}convergent{{[^}]*}} }
+// CHECK-DAG: attributes #[[attr6]] = { {{[^}]*}}noduplicate{{[^}]*}} }
--- a/clang/test/SemaOpenCL/convergent.cl
+++ b/clang/test/SemaOpenCL/convergent.cl
@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -fsyntax-only -verify %s
+
+void f1(void) __attribute__((convergent));
+
+void f2(void) __attribute__((convergent(1))); // expected-error {{'convergent' attribute takes no arguments}}
+
+void f3(int a __attribute__((convergent))); // expected-warning {{'convergent' attribute only applies to functions}}
+
+void f4(void) {
+  int var1 __attribute__((convergent)); // expected-warning {{'convergent' attribute only applies to functions}}
+}
+