diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index b75891e4f21e..04de39566222 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -169,7 +169,7 @@ enum MachineConfiguration : unsigned { LaneIDMask = WarpSize - 1, /// Global memory alignment for performance. - GlobalMemoryAlignment = 256, + GlobalMemoryAlignment = 128, }; enum NamedBarrier : unsigned { @@ -186,20 +186,30 @@ static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) { static RecordDecl *buildRecordForGlobalizedVars( ASTContext &C, ArrayRef EscapedDecls, + ArrayRef EscapedDeclsForTeams, llvm::SmallDenseMap &MappedDeclsFields) { - if (EscapedDecls.empty()) + if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) return nullptr; SmallVector GlobalizedVars; for (const ValueDecl *D : EscapedDecls) + GlobalizedVars.emplace_back( + CharUnits::fromQuantity(std::max( + C.getDeclAlign(D).getQuantity(), + static_cast(GlobalMemoryAlignment))), + D); + for (const ValueDecl *D : EscapedDeclsForTeams) GlobalizedVars.emplace_back(C.getDeclAlign(D), D); std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(), stable_sort_comparator); // Build struct _globalized_locals_ty { - // /* globalized vars */ + // /* globalized vars */[32] align (max(decl_align, 128)) + // /* globalized vars */ for EscapedDeclsForTeams // }; RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); GlobalizedRD->startDefinition(); + llvm::SmallPtrSet SingleEscaped( + EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end()); for (const auto &Pair : GlobalizedVars) { const ValueDecl *VD = Pair.second; QualType Type = VD->getType(); @@ -208,19 +218,39 @@ static RecordDecl *buildRecordForGlobalizedVars( else Type = Type.getNonReferenceType(); SourceLocation Loc = VD->getLocation(); - auto *Field = - FieldDecl::Create(C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, - C.getTrivialTypeSourceInfo(Type, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - GlobalizedRD->addDecl(Field); - if (VD->hasAttrs()) { - for (specific_attr_iterator I(VD->getAttrs().begin()), - E(VD->getAttrs().end()); - I != E; ++I) - Field->addAttr(*I); + FieldDecl *Field; + if (SingleEscaped.count(VD)) { + Field = FieldDecl::Create( + C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, + C.getTrivialTypeSourceInfo(Type, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + if (VD->hasAttrs()) { + for (specific_attr_iterator I(VD->getAttrs().begin()), + E(VD->getAttrs().end()); + I != E; ++I) + Field->addAttr(*I); + } + } else { + llvm::APInt ArraySize(32, WarpSize); + Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0); + Field = FieldDecl::Create( + C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, + C.getTrivialTypeSourceInfo(Type, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(), + static_cast( + GlobalMemoryAlignment))); + Field->addAttr(AlignedAttr::CreateImplicit( + C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true, + IntegerLiteral::Create(C, Align, + C.getIntTypeForBitwidth(32, /*Signed=*/0), + SourceLocation()))); } + GlobalizedRD->addDecl(Field); MappedDeclsFields.try_emplace(VD, Field); } GlobalizedRD->completeDefinition(); @@ -344,7 +374,8 @@ class CheckVarsEscapingDeclContext final assert(!GlobalizedRD && "Record for globalized variables is built already."); GlobalizedRD = ::buildRecordForGlobalizedVars( - CGF.getContext(), EscapedDecls.getArrayRef(), MappedDeclsFields); + CGF.getContext(), EscapedDecls.getArrayRef(), llvm::None, + MappedDeclsFields); } public: @@ -1849,8 +1880,7 @@ getDistributeLastprivateVars(const OMPExecutableDirective &D, } if (!Dir) return; - for (const OMPLastprivateClause *C : - Dir->getClausesOfKind()) { + for (const auto *C : Dir->getClausesOfKind()) { for (const Expr *E : C->getVarRefs()) { const auto *DE = cast(E->IgnoreParens()); Vars.push_back(cast(DE->getDecl()->getCanonicalDecl())); @@ -1869,8 +1899,8 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) { getDistributeLastprivateVars(D, LastPrivates); if (!LastPrivates.empty()) - GlobalizedRD = buildRecordForGlobalizedVars( - CGM.getContext(), LastPrivates, MappedDeclsFields); + GlobalizedRD = ::buildRecordForGlobalizedVars( + CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields); } // Emit target region as a standalone region. @@ -1899,9 +1929,9 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( for (const auto &Pair : MappedDeclsFields) { assert(Pair.getFirst()->isCanonicalDecl() && "Expected canonical declaration"); - Data.insert(std::make_pair( - Pair.getFirst(), - std::make_pair(Pair.getSecond(), Address::invalid()))); + Data.insert(std::make_pair(Pair.getFirst(), + MappedVarData(Pair.getSecond(), + /*IsOnePerTeam=*/true))); } } Rt.emitGenericVarsProlog(CGF, Loc); @@ -1935,18 +1965,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, if (I == FunctionGlobalizedDecls.end()) return; if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) { - QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); + QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); // Recover pointer to this function's global record. The runtime will // handle the specifics of the allocation of the memory. // Use actual memory size of the record including the padding // for alignment purposes. unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RecTy).getQuantity(); + CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(RecTy).getQuantity(); + CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity(); GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); + llvm::PointerType *GlobalRecPtrTy = + CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); llvm::Value *GlobalRecCastAddr; if (WithSPMDCheck || getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { @@ -1959,7 +1991,8 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, // There is no need to emit line number for unconditional branch. (void)ApplyDebugLocation::CreateEmpty(CGF); CGF.EmitBlock(SPMDBB); - Address RecPtr = CGF.CreateMemTemp(RecTy, "_local_stack"); + Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy), + CharUnits::fromQuantity(Alignment)); CGF.EmitBranch(ExitBB); // There is no need to emit line number for unconditional branch. (void)ApplyDebugLocation::CreateEmpty(CGF); @@ -1974,9 +2007,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, OMPRTL_NVPTX__kmpc_data_sharing_push_stack), GlobalRecordSizeArg); GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); + GlobalRecValue, GlobalRecPtrTy); CGF.EmitBlock(ExitBB); - auto *Phi = Bld.CreatePHI(GlobalRecCastAddr->getType(), + auto *Phi = Bld.CreatePHI(GlobalRecPtrTy, /*NumReservedValues=*/2, "_select_stack"); Phi->addIncoming(RecPtr.getPointer(), SPMDBB); Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); @@ -1994,12 +2027,12 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, OMPRTL_NVPTX__kmpc_data_sharing_push_stack), GlobalRecordSizeArg); GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); + GlobalRecValue, GlobalRecPtrTy); I->getSecond().GlobalRecordAddr = GlobalRecValue; I->getSecond().IsInSPMDModeFlag = nullptr; } LValue Base = - CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy); + CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy); // Emit the "global alloca" which is a GEP from the global declaration // record using the pointer returned by the runtime. @@ -2012,9 +2045,34 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); } - const FieldDecl *FD = Rec.second.first; - LValue VarAddr = CGF.EmitLValueForField(Base, FD); - Rec.second.second = VarAddr.getAddress(); + LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD); + // Emit VarAddr basing on lane-id if required. + QualType VarTy; + if (Rec.second.IsOnePerTeam) { + Rec.second.PrivateAddr = VarAddr.getAddress(); + VarTy = Rec.second.FD->getType(); + } else { + llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( + VarAddr.getAddress().getPointer(), + {Bld.getInt32(0), getNVPTXLaneID(CGF)}); + Rec.second.PrivateAddr = + Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)); + VarTy = + Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); + VarAddr = CGF.MakeAddrLValue(Rec.second.PrivateAddr, VarTy, + AlignmentSource::Decl); + } + if (WithSPMDCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + assert(I->getSecond().IsInSPMDModeFlag && + "Expected unknown execution mode or required SPMD check."); + Address GlobalPtr = Rec.second.PrivateAddr; + Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); + Rec.second.PrivateAddr = Address( + Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag, + LocalAddr.getPointer(), GlobalPtr.getPointer()), + LocalAddr.getAlignment()); + } if (EscapedParam) { const auto *VD = cast(Rec.first); CGF.EmitStoreOfScalar(ParValue, VarAddr); @@ -4047,7 +4105,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid()))); + Data.insert(std::make_pair(VD, MappedVarData(FD))); } if (!NeedToDelayGlobalization) { emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); @@ -4074,7 +4132,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, return Address::invalid(); auto VDI = I->getSecond().LocalVarData.find(VD); if (VDI != I->getSecond().LocalVarData.end()) - return VDI->second.second; + return VDI->second.PrivateAddr; if (VD->hasAttrs()) { for (specific_attr_iterator IT(VD->attr_begin()), E(VD->attr_end()); @@ -4083,7 +4141,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, cast(cast(IT->getRef())->getDecl()) ->getCanonicalDecl()); if (VDI != I->getSecond().LocalVarData.end()) - return VDI->second.second; + return VDI->second.PrivateAddr; } } return Address::invalid(); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index fc8cd2467bf2..cdc856bbcd6f 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -373,9 +373,21 @@ private: llvm::Function *createParallelDataSharingWrapper( llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D); + /// The data for the single globalized variable. + struct MappedVarData { + /// Corresponding field in the global record. + const FieldDecl * FD = nullptr; + /// Corresponding address. + Address PrivateAddr = Address::invalid(); + /// true, if only one element is required (for latprivates in SPMD mode), + /// false, if need to create based on the warp-size. + bool IsOnePerTeam = false; + MappedVarData() = delete; + MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false) + : FD(FD), IsOnePerTeam(IsOnePerTeam) {} + }; /// The map of local variables to their addresses in the global memory. - using DeclToAddrMapTy = llvm::MapVector>; + using DeclToAddrMapTy = llvm::MapVector; /// Set of the parameters passed by value escaping OpenMP context. using EscapedParamsTy = llvm::SmallPtrSet; struct FunctionData { diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp index 6b08993c1946..8d2a4da77379 100644 --- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp +++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp @@ -35,16 +35,21 @@ int maini1() { // CHECK-NOT: @__kmpc_data_sharing_push_stack // CHECK: define {{.*}}[[BAR]]() -// CHECK: [[STACK:%.+]] = alloca [[GLOBAL_ST:%.+]], +// CHECK: alloca i32, +// CHECK: [[A_LOCAL_ADDR:%.+]] = alloca i32, // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() // CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 // CHECK: br i1 [[IS_SPMD]], label // CHECK: br label -// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0) -// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST]]* +// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) +// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%.+]]* // CHECK: br label -// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ [[STACK]], {{.+}} ], [ [[GLOBALS]], {{.+}} ] +// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[GLOBALS]], {{.+}} ] // CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 +// CHECK: [[A_GLOBAL_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ADDR]], i32 0, i32 [[LID]] +// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_GLOBAL_ADDR]] // CHECK: call {{.*}}[[FOO]](i32* dereferenceable{{.*}} [[A_ADDR]]) // CHECK: br i1 [[IS_SPMD]], label // CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp index 0acb11991527..832067692438 100644 --- a/clang/test/OpenMP/nvptx_data_sharing.cpp +++ b/clang/test/OpenMP/nvptx_data_sharing.cpp @@ -39,10 +39,16 @@ void test_ds(){ // CK1: [[SHAREDARGS2:%.+]] = alloca i8** // CK1: call void @__kmpc_kernel_init // CK1: call void @__kmpc_data_sharing_init_stack -// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0) +// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 256, i16 0) // CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty* -// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0 -// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1 +// CK1: [[A_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0 +// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK1: [[LID:%.+]] = and i32 [[TID]], 31 +// CK1: [[A:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ARR]], i32 0, i32 [[LID]] +// CK1: [[B_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1 +// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK1: [[LID:%.+]] = and i32 [[TID]], 31 +// CK1: [[B:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[B_ARR]], i32 0, i32 [[LID]] // CK1: store i32 10, i32* [[A]] // CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1) // CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1) diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 2b18f6d3f9bd..2a7344f77340 100644 --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -26,10 +26,13 @@ int main(int argc, char **argv) { // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker( // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0) +// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 2688, i16 0) // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align -// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 +// CHECK: [[ARGC_ARR_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 +// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 +// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGC_ARR_ADDR]], i32 0, i32 [[LID]] // CHECK: store i32 [[ARGC]], i32* [[ARGC_ADDR]], // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 2 diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp index d1a3104407d0..978804d2495a 100644 --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -318,11 +318,14 @@ int bar(int n){ // CHECK-32: [[A_ADDR:%.+]] = alloca i32, // CHECK-64: [[A_ADDR:%.+]] = alloca i64, // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0) +// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 128, i16 0) // CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty* // CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]], // CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]], -// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: [[GLOBAL_A_ADDR_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 +// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[GLOBAL_A_ADDR_ARR]], i32 0, i32 [[LID]] // CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]], // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.*}}) diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp index 01f346121a4d..69c338995b7e 100644 --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -554,7 +554,8 @@ int baz(int f, double &a) { // CHECK: ret void // CHECK: define i32 [[BAZ]](i32 [[F:%.*]], double* dereferenceable{{.*}}) - // CHECK: [[STACK:%.+]] = alloca [[GLOBAL_ST:%.+]], + // CHECK: alloca i32, + // CHECK: [[LOCAL_F_PTR:%.+]] = alloca i32, // CHECK: [[ZERO_ADDR:%.+]] = alloca i32, // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* // CHECK: store i32 0, i32* [[ZERO_ADDR]] @@ -562,11 +563,15 @@ int baz(int f, double &a) { // CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 // CHECK: br i1 [[IS_SPMD]], label // CHECK: br label - // CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0) - // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST]]* + // CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 128, i16 0) + // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST:%.+]]* // CHECK: br label - // CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ [[STACK]], {{.+}} ], [ [[REC_ADDR]], {{.+}} ] - // CHECK: [[F_PTR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i32 0, i32 0 + // CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[REC_ADDR]], {{.+}} ] + // CHECK: [[F_PTR_ARR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i32 0, i32 0 + // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK: [[LID:%.+]] = and i32 [[TID]], 31 + // CHECK: [[GLOBAL_F_PTR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F_PTR_ARR]], i32 0, i32 [[LID]] + // CHECK: [[F_PTR:%.+]] = select i1 [[IS_SPMD]], i32* [[LOCAL_F_PTR]], i32* [[GLOBAL_F_PTR]] // CHECK: store i32 %{{.+}}, i32* [[F_PTR]], // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp index 4e3f2674fc5f..91b372c65b6a 100644 --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -36,10 +36,13 @@ int main (int argc, char **argv) { // CK1: store {{.+}} 0, {{.+}}, // CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], // CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* -// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0) +// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0) // CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] // CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK1: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK1: [[LID:%.+]] = and i32 [[TID]], 31 +// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]] // CK1: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], // CK1: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]], @@ -53,9 +56,12 @@ int main (int argc, char **argv) { // CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***, // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**, // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] -// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0) +// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0) // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK1: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK1: [[LID:%.+]] = and i32 [[TID]], 31 +// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]] // CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], // CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]], // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]], @@ -111,10 +117,13 @@ int main (int argc, char **argv) { // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* -// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0) +// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0) // CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] // CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK2: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK2: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK2: [[LID:%.+]] = and i32 [[TID]], 31 +// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]] // CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( // CK2: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], @@ -132,9 +141,12 @@ int main (int argc, char **argv) { // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]], -// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0) +// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0) // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK2: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK2: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CK2: [[LID:%.+]] = and i32 [[TID]], 31 +// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]] // CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( // CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]], diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index 6925b17163af..b73b5a8e93e5 100644 --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -597,9 +597,9 @@ int bar(int n){ // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]] // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1 - // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1 - // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128 // // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], @@ -643,9 +643,9 @@ int bar(int n){ // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 1 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]] // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1 - // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1 - // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128 // // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 4, [[TEAM]] // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] @@ -1024,9 +1024,9 @@ int bar(int n){ // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]] // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1 - // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1 - // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128 // // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], @@ -1072,9 +1072,9 @@ int bar(int n){ // CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4 // CHECK: [[POS1:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[OF]] // CHECK: [[POS2:%.+]] = sub nuw i[[SZ]] [[POS1]], 1 - // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS3:%.+]] = udiv i[[SZ]] [[POS2]], 128 // CHECK: [[POS4:%.+]] = add nuw i[[SZ]] [[POS3]], 1 - // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 256 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul nuw i[[SZ]] [[POS4]], 128 // // CHECK: [[P:%.+]] = mul nuw i[[SZ]] 2, [[TEAM]] // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]