[OPENMP, NVPTX] Fix codegen for the teams reduction.

Added NUW flags for all the add|mul|sub operations + replaced sdiv by udiv
as we operate on unsigned values only (addresses, converted to integers)

llvm-svn: 329411
This commit is contained in:
Alexey Bataev 2018-04-06 16:03:36 +00:00
parent 63ae5579e7
commit e290ec02c7
6 changed files with 86 additions and 92 deletions

View File

@ -529,8 +529,8 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
CGBuilderTy &Bld = CGF.Builder;
return IsInSpmdExecutionMode
? getNVPTXNumThreads(CGF)
: Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
"thread_limit");
: Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
"thread_limit");
}
/// Get the thread id of the OMP master thread.
@ -545,9 +545,9 @@ static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
// We assume that the warp size is a power of 2.
llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
Bld.CreateNot(Mask), "master_tid");
}
@ -1714,13 +1714,11 @@ static void emitReductionListCopy(
// Step 1.2: Get the address for dest element:
// address = base + index * ElementSizeInChars.
unsigned ElementSizeInChars =
C.getTypeSizeInChars(Private->getType()).getQuantity();
llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
auto *CurrentOffset =
Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
ScratchpadIndex);
Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
auto *ScratchPadElemAbsolutePtrVal =
Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
ScratchPadElemAbsolutePtrVal =
Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
@ -1731,13 +1729,11 @@ static void emitReductionListCopy(
case ScratchpadToThread: {
// Step 1.1: Get the address for the src element in the scratchpad.
// address = base + index * ElementSizeInChars.
unsigned ElementSizeInChars =
C.getTypeSizeInChars(Private->getType()).getQuantity();
llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
auto *CurrentOffset =
Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
ScratchpadIndex);
Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
auto *ScratchPadElemAbsolutePtrVal =
Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
ScratchPadElemAbsolutePtrVal =
Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
@ -1796,22 +1792,20 @@ static void emitReductionListCopy(
if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
llvm::Value *ScratchpadBasePtr =
IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
unsigned ElementSizeInChars =
C.getTypeSizeInChars(Private->getType()).getQuantity();
ScratchpadBasePtr = Bld.CreateAdd(
llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
ScratchpadBasePtr = Bld.CreateNUWAdd(
ScratchpadBasePtr,
Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
CGM.SizeTy, ElementSizeInChars)));
Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
// Take care of global memory alignment for performance
ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
llvm::ConstantInt::get(CGM.SizeTy, 1));
ScratchpadBasePtr = Bld.CreateSDiv(
ScratchpadBasePtr = Bld.CreateNUWSub(
ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
ScratchpadBasePtr = Bld.CreateUDiv(
ScratchpadBasePtr,
llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
llvm::ConstantInt::get(CGM.SizeTy, 1));
ScratchpadBasePtr = Bld.CreateMul(
ScratchpadBasePtr = Bld.CreateNUWAdd(
ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
ScratchpadBasePtr = Bld.CreateNUWMul(
ScratchpadBasePtr,
llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

View File

@ -127,7 +127,7 @@ int bar(int n){
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -145,7 +145,7 @@ int bar(int n){
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
@ -246,7 +246,7 @@ int bar(int n){
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -264,7 +264,7 @@ int bar(int n){
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
// CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],

View File

@ -68,7 +68,7 @@ int foo(int n) {
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -86,7 +86,7 @@ int foo(int n) {
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK: br label {{%?}}[[TERMINATE:.+]]
//
@ -144,7 +144,7 @@ int foo(int n) {
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -162,7 +162,7 @@ int foo(int n) {
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK: load i16, i16* [[AA_CADDR]],
// CHECK: br label {{%?}}[[TERMINATE:.+]]
@ -245,7 +245,7 @@ int foo(int n) {
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -263,7 +263,7 @@ int foo(int n) {
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
//
// Use captures.
@ -414,7 +414,7 @@ int bar(int n){
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -432,7 +432,7 @@ int bar(int n){
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK-64-DAG: load i32, i32* [[REF_A]]
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
@ -503,7 +503,7 @@ int bar(int n){
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -521,7 +521,7 @@ int bar(int n){
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// Use captures.
// CHECK-DAG: getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0
@ -587,7 +587,7 @@ int bar(int n){
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -605,7 +605,7 @@ int bar(int n){
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
//
// CHECK-64-DAG: load i32, i32* [[REF_A]]

View File

@ -29,7 +29,7 @@ int CheckSimple() {
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// printf in master-only basic block.
@ -69,7 +69,7 @@ void CheckNoArgs() {
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// printf in master-only basic block.
@ -102,7 +102,7 @@ void CheckAllocaIsInEntryBlock() {
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
if (foo) {

View File

@ -93,7 +93,7 @@ int bar(int n){
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -111,7 +111,7 @@ int bar(int n){
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
//
// CHECK-NOT: kmpc_fork_teams
@ -179,7 +179,7 @@ int bar(int n){
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub nuw i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
@ -197,7 +197,7 @@ int bar(int n){
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
//
// CHECK-NOT: kmpc_fork_teams

View File

@ -246,8 +246,8 @@ int bar(int n){
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 8, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
// CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
@ -272,8 +272,8 @@ int bar(int n){
// CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
// CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 8, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
@ -588,24 +588,24 @@ int bar(int n){
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 1, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
// CHECK: store i8 [[ELT_VAL]], i8* [[SCRATCHPAD_ELT_PTR]], align
//
// CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1
// CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
// CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1
// CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
// CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
@ -631,8 +631,8 @@ int bar(int n){
// CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
// CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 1, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
@ -640,15 +640,15 @@ int bar(int n){
// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[REMOTE_ELT1]], align
// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1
// CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
// CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1
// CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
@ -1013,26 +1013,26 @@ int bar(int n){
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
// CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align
//
// CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4
// CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
// CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4
// CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 2, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
@ -1058,8 +1058,8 @@ int bar(int n){
// CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align
// CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]]
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
@ -1069,15 +1069,15 @@ int bar(int n){
// CHECK: [[REMOTE_ELT1_PTR:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
// CHECK: store i8* [[REMOTE_ELT1_PTR]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4
// CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256
// CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4
// CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]]
// CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1
// CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256
// CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1
// CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256
//
// CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[P:%.+]] = mul nuw i[[SZ]] 2, [[TEAM]]
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1