[OPENMP] Codegen for reduction clauses in 'taskloop' directives.

Adds codegen for taskloop-based directives. llvm-svn: 308174
2017-07-17 13:30:36 +00:00 · 2017-07-17 13:30:36 +00:00 · be5a8b42cd
parent dd2c29ef83
commit be5a8b42cd
16 changed files with 1066 additions and 54 deletions
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@ -643,6 +643,12 @@ enum OpenMPRTLFunction {
  // Call to void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
  // *vec);
  OMPRTL__kmpc_doacross_wait,
+  // Call to void *__kmpc_task_reduction_init(int gtid, int num_data, void
+  // *data);
+  OMPRTL__kmpc_task_reduction_init,
+  // Call to void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void
+  // *d);
+  OMPRTL__kmpc_task_reduction_get_th_data,

  //
  // Offloading related calls
@ -766,8 +772,8 @@ static void emitInitWithReductionInitializer(CodeGenFunction &CGF,
 /// \param SrcAddr Address of the original array.
 static void EmitOMPAggregateInit(CodeGenFunction &CGF, Address DestAddr,
                                 QualType Type, const Expr *Init,
+                                 const OMPDeclareReductionDecl *DRD,
                                 Address SrcAddr = Address::invalid()) {
-  auto *DRD = getReductionInit(Init);
  // Perform element-by-element initialization.
  QualType ElementTy;

@ -869,19 +875,17 @@ LValue ReductionCodeGen::emitSharedLValueUB(CodeGenFunction &CGF,
  return LValue();
 }

-void ReductionCodeGen::emitAggregateInitialization(CodeGenFunction &CGF,
-                                                   unsigned N,
-                                                   Address PrivateAddr,
-                                                   LValue SharedLVal) {
+void ReductionCodeGen::emitAggregateInitialization(
+    CodeGenFunction &CGF, unsigned N, Address PrivateAddr, LValue SharedLVal,
+    const OMPDeclareReductionDecl *DRD) {
  // Emit VarDecl with copy init for arrays.
  // Get the address of the original variable captured in current
  // captured region.
  auto *PrivateVD =
      cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl());
-  auto *DRD = getReductionInit(ClausesData[N].ReductionOp);
  EmitOMPAggregateInit(CGF, PrivateAddr, PrivateVD->getType(),
                       DRD ? ClausesData[N].ReductionOp : PrivateVD->getInit(),
-                       SharedLVal.getAddress());
+                       DRD, SharedLVal.getAddress());
 }

 ReductionCodeGen::ReductionCodeGen(ArrayRef<const Expr *> Shareds,
@ -890,6 +894,7 @@ ReductionCodeGen::ReductionCodeGen(ArrayRef<const Expr *> Shareds,
  ClausesData.reserve(Shareds.size());
  SharedAddresses.reserve(Shareds.size());
  Sizes.reserve(Shareds.size());
+  BaseDecls.reserve(Shareds.size());
  auto IPriv = Privates.begin();
  auto IRed = ReductionOps.begin();
  for (const auto *Ref : Shareds) {
@ -912,20 +917,30 @@ void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N) {
  QualType PrivateType = PrivateVD->getType();
  bool AsArraySection = isa<OMPArraySectionExpr>(ClausesData[N].Ref);
  if (!AsArraySection && !PrivateType->isVariablyModifiedType()) {
-    Sizes.emplace_back(nullptr);
+    Sizes.emplace_back(
+        CGF.getTypeSize(
+            SharedAddresses[N].first.getType().getNonReferenceType()),
+        nullptr);
    return;
  }
  llvm::Value *Size;
+  llvm::Value *SizeInChars;
+  llvm::Type *ElemType =
+      cast<llvm::PointerType>(SharedAddresses[N].first.getPointer()->getType())
+          ->getElementType();
+  auto *ElemSizeOf = llvm::ConstantExpr::getSizeOf(ElemType);
  if (AsArraySection) {
    Size = CGF.Builder.CreatePtrDiff(SharedAddresses[N].second.getPointer(),
                                     SharedAddresses[N].first.getPointer());
    Size = CGF.Builder.CreateNUWAdd(
        Size, llvm::ConstantInt::get(Size->getType(), /*V=*/1));
+    SizeInChars = CGF.Builder.CreateNUWMul(Size, ElemSizeOf);
  } else {
-    Size = CGF.getTypeSize(
+    SizeInChars = CGF.getTypeSize(
        SharedAddresses[N].first.getType().getNonReferenceType());
+    Size = CGF.Builder.CreateExactUDiv(SizeInChars, ElemSizeOf);
  }
-  Sizes.emplace_back(Size);
+  Sizes.emplace_back(SizeInChars, Size);
  CodeGenFunction::OpaqueValueMapping OpaqueMap(
      CGF,
      cast<OpaqueValueExpr>(
@ -941,7 +956,7 @@ void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N,
  QualType PrivateType = PrivateVD->getType();
  bool AsArraySection = isa<OMPArraySectionExpr>(ClausesData[N].Ref);
  if (!AsArraySection && !PrivateType->isVariablyModifiedType()) {
-    assert(!Size && !Sizes[N] &&
+    assert(!Size && !Sizes[N].second &&
           "Size should be nullptr for non-variably modified redution "
           "items.");
    return;
@ -971,7 +986,7 @@ void ReductionCodeGen::emitInitialization(
      SharedType, SharedAddresses[N].first.getBaseInfo());
  if (isa<OMPArraySectionExpr>(ClausesData[N].Ref) ||
      CGF.getContext().getAsArrayType(PrivateVD->getType())) {
-    emitAggregateInitialization(CGF, N, PrivateAddr, SharedLVal);
+    emitAggregateInitialization(CGF, N, PrivateAddr, SharedLVal, DRD);
  } else if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) {
    emitInitWithReductionInitializer(CGF, DRD, ClausesData[N].ReductionOp,
                                     PrivateAddr, SharedLVal.getAddress(),
@ -1091,6 +1106,11 @@ Address ReductionCodeGen::adjustPrivateAddress(CodeGenFunction &CGF, unsigned N,
  return PrivateAddr;
 }

+bool ReductionCodeGen::usesReductionInitializer(unsigned N) const {
+  auto *DRD = getReductionInit(ClausesData[N].ReductionOp);
+  return DRD && DRD->getInitializer();
+}
+
 LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) {
  return CGF.EmitLoadOfPointerLValue(
      CGF.GetAddrOfLocalVar(getThreadIDVariable()),
@ -1948,6 +1968,26 @@ CGOpenMPRuntime::createRuntimeFunction(unsigned Function) {
    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_wait");
    break;
  }
+  case OMPRTL__kmpc_task_reduction_init: {
+    // Build void *__kmpc_task_reduction_init(int gtid, int num_data, void
+    // *data);
+    llvm::Type *TypeParams[] = {CGM.IntTy, CGM.IntTy, CGM.VoidPtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
+    RTLFn =
+        CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_task_reduction_init");
+    break;
+  }
+  case OMPRTL__kmpc_task_reduction_get_th_data: {
+    // Build void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void
+    // *d);
+    llvm::Type *TypeParams[] = {CGM.IntTy, CGM.VoidPtrTy, CGM.VoidPtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_task_reduction_get_th_data");
+    break;
+  }
  case OMPRTL__tgt_target: {
    // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
    // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
@ -2298,6 +2338,27 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
  return nullptr;
 }

+Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF,
+                                                          QualType VarType,
+                                                          StringRef Name) {
+  llvm::Twine VarName(Name, ".artificial.");
+  llvm::Type *VarLVType = CGF.ConvertTypeForMem(VarType);
+  llvm::Value *GAddr = getOrCreateInternalVariable(VarLVType, VarName);
+  llvm::Value *Args[] = {
+      emitUpdateLocation(CGF, SourceLocation()),
+      getThreadID(CGF, SourceLocation()),
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(GAddr, CGM.VoidPtrTy),
+      CGF.Builder.CreateIntCast(CGF.getTypeSize(VarType), CGM.SizeTy,
+                                /*IsSigned=*/false),
+      getOrCreateInternalVariable(CGM.VoidPtrPtrTy, VarName + ".cache.")};
+  return Address(
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+          CGF.EmitRuntimeCall(
+              createRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), Args),
+          VarLVType->getPointerTo(/*AddrSpace=*/0)),
+      CGM.getPointerAlign());
+}
+
 /// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
 /// function. Here is the logic:
 /// if (Cond) {
@ -3093,6 +3154,8 @@ enum KmpTaskTFields {
  KmpTaskTStride,
  /// (Taskloops only) Is last iteration flag.
  KmpTaskTLastIter,
+  /// (Taskloops only) Reduction data.
+  KmpTaskTReductions,
 };
 } // anonymous namespace

@ -3644,6 +3707,7 @@ createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind,
  //         kmp_uint64          ub;
  //         kmp_int64           st;
  //         kmp_int32           liter;
+  //         void *              reductions;
  //       };
  auto *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
  UD->startDefinition();
@ -3667,6 +3731,7 @@ createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind,
    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
    addFieldToRecordDecl(C, RD, KmpInt64Ty);
    addFieldToRecordDecl(C, RD, KmpInt32Ty);
+    addFieldToRecordDecl(C, RD, C.VoidPtrTy);
  }
  RD->completeDefinition();
  return RD;
@ -3697,7 +3762,7 @@ createKmpTaskTWithPrivatesRecordDecl(CodeGenModule &CGM, QualType KmpTaskTQTy,
 ///   TaskFunction(gtid, tt->part_id, &tt->privates, task_privates_map, tt,
 ///   For taskloops:
 ///   tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
-///   tt->shareds);
+///   tt->reductions, tt->shareds);
 ///   return 0;
 /// }
 /// \endcode
@ -3783,10 +3848,14 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
    auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
    auto LILVal = CGF.EmitLValueForField(Base, *LIFI);
    auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal();
+    auto RFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTReductions);
+    auto RLVal = CGF.EmitLValueForField(Base, *RFI);
+    auto *RParam = CGF.EmitLoadOfLValue(RLVal, Loc).getScalarVal();
    CallArgs.push_back(LBParam);
    CallArgs.push_back(UBParam);
    CallArgs.push_back(StParam);
    CallArgs.push_back(LIParam);
+    CallArgs.push_back(RParam);
  }
  CallArgs.push_back(SharedsParam);

@ -4549,6 +4618,16 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
      cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
  CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
                       /*IsInitializer=*/true);
+  // Store reductions address.
+  LValue RedLVal = CGF.EmitLValueForField(
+      Result.TDBase,
+      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTReductions));
+  if (Data.Reductions)
+    CGF.EmitStoreOfScalar(Data.Reductions, RedLVal);
+  else {
+    CGF.EmitNullInitialization(RedLVal.getAddress(),
+                               CGF.getContext().VoidPtrTy);
+  }
  enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
  llvm::Value *TaskArgs[] = {
      UpLoc,
@ -5074,6 +5153,353 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
 }

+/// Generates unique name for artificial threadprivate variables.
+/// Format is: <Prefix> "." <Loc_raw_encoding> "_" <N>
+static std::string generateUniqueName(StringRef Prefix, SourceLocation Loc,
+                                      unsigned N) {
+  SmallString<256> Buffer;
+  llvm::raw_svector_ostream Out(Buffer);
+  Out << Prefix << "." << Loc.getRawEncoding() << "_" << N;
+  return Out.str();
+}
+
+/// Emits reduction initializer function:
+/// \code
+/// void @.red_init(void* %arg) {
+/// %0 = bitcast void* %arg to <type>*
+/// store <type> <init>, <type>* %0
+/// ret void
+/// }
+/// \endcode
+static llvm::Value *emitReduceInitFunction(CodeGenModule &CGM,
+                                           SourceLocation Loc,
+                                           ReductionCodeGen &RCG, unsigned N) {
+  auto &C = CGM.getContext();
+  FunctionArgList Args;
+  ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  Args.emplace_back(&Param);
+  auto &FnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
+  auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage,
+                                    ".red_init.", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
+  CodeGenFunction CGF(CGM);
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  Address PrivateAddr = CGF.EmitLoadOfPointer(
+      CGF.GetAddrOfLocalVar(&Param),
+      C.getPointerType(C.VoidPtrTy).castAs<PointerType>());
+  llvm::Value *Size = nullptr;
+  // If the size of the reduction item is non-constant, load it from global
+  // threadprivate variable.
+  if (RCG.getSizes(N).second) {
+    Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().getSizeType(),
+        generateUniqueName("reduction_size", Loc, N));
+    Size =
+        CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
+                             CGM.getContext().getSizeType(), SourceLocation());
+  }
+  RCG.emitAggregateType(CGF, N, Size);
+  LValue SharedLVal;
+  // If initializer uses initializer from declare reduction construct, emit a
+  // pointer to the address of the original reduction item (reuired by reduction
+  // initializer)
+  if (RCG.usesReductionInitializer(N)) {
+    Address SharedAddr =
+        CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
+            CGF, CGM.getContext().VoidPtrTy,
+            generateUniqueName("reduction", Loc, N));
+    SharedLVal = CGF.MakeAddrLValue(SharedAddr, CGM.getContext().VoidPtrTy);
+  } else {
+    SharedLVal = CGF.MakeNaturalAlignAddrLValue(
+        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+        CGM.getContext().VoidPtrTy);
+  }
+  // Emit the initializer:
+  // %0 = bitcast void* %arg to <type>*
+  // store <type> <init>, <type>* %0
+  RCG.emitInitialization(CGF, N, PrivateAddr, SharedLVal,
+                         [](CodeGenFunction &) { return false; });
+  CGF.FinishFunction();
+  return Fn;
+}
+
+/// Emits reduction combiner function:
+/// \code
+/// void @.red_comb(void* %arg0, void* %arg1) {
+/// %lhs = bitcast void* %arg0 to <type>*
+/// %rhs = bitcast void* %arg1 to <type>*
+/// %2 = <ReductionOp>(<type>* %lhs, <type>* %rhs)
+/// store <type> %2, <type>* %lhs
+/// ret void
+/// }
+/// \endcode
+static llvm::Value *emitReduceCombFunction(CodeGenModule &CGM,
+                                           SourceLocation Loc,
+                                           ReductionCodeGen &RCG, unsigned N,
+                                           const Expr *ReductionOp,
+                                           const Expr *LHS, const Expr *RHS,
+                                           const Expr *PrivateRef) {
+  auto &C = CGM.getContext();
+  auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(LHS)->getDecl());
+  auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(RHS)->getDecl());
+  FunctionArgList Args;
+  ImplicitParamDecl ParamInOut(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  ImplicitParamDecl ParamIn(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  Args.emplace_back(&ParamInOut);
+  Args.emplace_back(&ParamIn);
+  auto &FnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
+  auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage,
+                                    ".red_comb.", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
+  CodeGenFunction CGF(CGM);
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  llvm::Value *Size = nullptr;
+  // If the size of the reduction item is non-constant, load it from global
+  // threadprivate variable.
+  if (RCG.getSizes(N).second) {
+    Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().getSizeType(),
+        generateUniqueName("reduction_size", Loc, N));
+    Size =
+        CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
+                             CGM.getContext().getSizeType(), SourceLocation());
+  }
+  RCG.emitAggregateType(CGF, N, Size);
+  // Remap lhs and rhs variables to the addresses of the function arguments.
+  // %lhs = bitcast void* %arg0 to <type>*
+  // %rhs = bitcast void* %arg1 to <type>*
+  CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
+  PrivateScope.addPrivate(LHSVD, [&C, &CGF, &ParamInOut, LHSVD]() -> Address {
+    // Pull out the pointer to the variable.
+    Address PtrAddr = CGF.EmitLoadOfPointer(
+        CGF.GetAddrOfLocalVar(&ParamInOut),
+        C.getPointerType(C.VoidPtrTy).castAs<PointerType>());
+    return CGF.Builder.CreateElementBitCast(
+        PtrAddr, CGF.ConvertTypeForMem(LHSVD->getType()));
+  });
+  PrivateScope.addPrivate(RHSVD, [&C, &CGF, &ParamIn, RHSVD]() -> Address {
+    // Pull out the pointer to the variable.
+    Address PtrAddr = CGF.EmitLoadOfPointer(
+        CGF.GetAddrOfLocalVar(&ParamIn),
+        C.getPointerType(C.VoidPtrTy).castAs<PointerType>());
+    return CGF.Builder.CreateElementBitCast(
+        PtrAddr, CGF.ConvertTypeForMem(RHSVD->getType()));
+  });
+  PrivateScope.Privatize();
+  // Emit the combiner body:
+  // %2 = <ReductionOp>(<type> *%lhs, <type> *%rhs)
+  // store <type> %2, <type>* %lhs
+  CGM.getOpenMPRuntime().emitSingleReductionCombiner(
+      CGF, ReductionOp, PrivateRef, cast<DeclRefExpr>(LHS),
+      cast<DeclRefExpr>(RHS));
+  CGF.FinishFunction();
+  return Fn;
+}
+
+/// Emits reduction finalizer function:
+/// \code
+/// void @.red_fini(void* %arg) {
+/// %0 = bitcast void* %arg to <type>*
+/// <destroy>(<type>* %0)
+/// ret void
+/// }
+/// \endcode
+static llvm::Value *emitReduceFiniFunction(CodeGenModule &CGM,
+                                           SourceLocation Loc,
+                                           ReductionCodeGen &RCG, unsigned N) {
+  if (!RCG.needCleanups(N))
+    return nullptr;
+  auto &C = CGM.getContext();
+  FunctionArgList Args;
+  ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  Args.emplace_back(&Param);
+  auto &FnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
+  auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage,
+                                    ".red_fini.", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
+  CodeGenFunction CGF(CGM);
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  Address PrivateAddr = CGF.EmitLoadOfPointer(
+      CGF.GetAddrOfLocalVar(&Param),
+      C.getPointerType(C.VoidPtrTy).castAs<PointerType>());
+  llvm::Value *Size = nullptr;
+  // If the size of the reduction item is non-constant, load it from global
+  // threadprivate variable.
+  if (RCG.getSizes(N).second) {
+    Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().getSizeType(),
+        generateUniqueName("reduction_size", Loc, N));
+    Size =
+        CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
+                             CGM.getContext().getSizeType(), SourceLocation());
+  }
+  RCG.emitAggregateType(CGF, N, Size);
+  // Emit the finalizer body:
+  // <destroy>(<type>* %0)
+  RCG.emitCleanups(CGF, N, PrivateAddr);
+  CGF.FinishFunction();
+  return Fn;
+}
+
+llvm::Value *CGOpenMPRuntime::emitTaskReductionInit(
+    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> LHSExprs,
+    ArrayRef<const Expr *> RHSExprs, const OMPTaskDataTy &Data) {
+  if (!CGF.HaveInsertPoint() || Data.ReductionVars.empty())
+    return nullptr;
+
+  // Build typedef struct:
+  // kmp_task_red_input {
+  //   void *reduce_shar; // shared reduction item
+  //   size_t reduce_size; // size of data item
+  //   void *reduce_init; // data initialization routine
+  //   void *reduce_fini; // data finalization routine
+  //   void *reduce_comb; // data combiner routine
+  //   kmp_task_red_flags_t flags; // flags for additional info from compiler
+  // } kmp_task_red_input_t;
+  ASTContext &C = CGM.getContext();
+  auto *RD = C.buildImplicitRecord("kmp_task_red_input_t");
+  RD->startDefinition();
+  const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType());
+  const FieldDecl *InitFD  = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FlagsFD = addFieldToRecordDecl(
+      C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false));
+  RD->completeDefinition();
+  QualType RDType = C.getRecordType(RD);
+  unsigned Size = Data.ReductionVars.size();
+  llvm::APInt ArraySize(/*numBits=*/64, Size);
+  QualType ArrayRDType = C.getConstantArrayType(
+      RDType, ArraySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
+  // kmp_task_red_input_t .rd_input.[Size];
+  Address TaskRedInput = CGF.CreateMemTemp(ArrayRDType, ".rd_input.");
+  ReductionCodeGen RCG(Data.ReductionVars, Data.ReductionCopies,
+                       Data.ReductionOps);
+  for (unsigned Cnt = 0; Cnt < Size; ++Cnt) {
+    // kmp_task_red_input_t &ElemLVal = .rd_input.[Cnt];
+    llvm::Value *Idxs[] = {llvm::ConstantInt::get(CGM.SizeTy, /*V=*/0),
+                           llvm::ConstantInt::get(CGM.SizeTy, Cnt)};
+    llvm::Value *GEP = CGF.EmitCheckedInBoundsGEP(
+        TaskRedInput.getPointer(), Idxs,
+        /*SignedIndices=*/false, /*IsSubtraction=*/false, Loc,
+        ".rd_input.gep.");
+    LValue ElemLVal = CGF.MakeNaturalAlignAddrLValue(GEP, RDType);
+    // ElemLVal.reduce_shar = &Shareds[Cnt];
+    LValue SharedLVal = CGF.EmitLValueForField(ElemLVal, SharedFD);
+    RCG.emitSharedLValue(CGF, Cnt);
+    llvm::Value *CastedShared =
+        CGF.EmitCastToVoidPtr(RCG.getSharedLValue(Cnt).getPointer());
+    CGF.EmitStoreOfScalar(CastedShared, SharedLVal);
+    RCG.emitAggregateType(CGF, Cnt);
+    llvm::Value *SizeValInChars;
+    llvm::Value *SizeVal;
+    std::tie(SizeValInChars, SizeVal) = RCG.getSizes(Cnt);
+    // We use delayed creation/initialization for VLAs, array sections and
+    // custom reduction initializations. It is required because runtime does not
+    // provide the way to pass the sizes of VLAs/array sections to
+    // initializer/combiner/finalizer functions and does not pass the pointer to
+    // original reduction item to the initializer. Instead threadprivate global
+    // variables are used to store these values and use them in the functions.
+    bool DelayedCreation = !!SizeVal;
+    SizeValInChars = CGF.Builder.CreateIntCast(SizeValInChars, CGM.SizeTy,
+                                               /*isSigned=*/false);
+    LValue SizeLVal = CGF.EmitLValueForField(ElemLVal, SizeFD);
+    CGF.EmitStoreOfScalar(SizeValInChars, SizeLVal);
+    // ElemLVal.reduce_init = init;
+    LValue InitLVal = CGF.EmitLValueForField(ElemLVal, InitFD);
+    llvm::Value *InitAddr =
+        CGF.EmitCastToVoidPtr(emitReduceInitFunction(CGM, Loc, RCG, Cnt));
+    CGF.EmitStoreOfScalar(InitAddr, InitLVal);
+    DelayedCreation = DelayedCreation || RCG.usesReductionInitializer(Cnt);
+    // ElemLVal.reduce_fini = fini;
+    LValue FiniLVal = CGF.EmitLValueForField(ElemLVal, FiniFD);
+    llvm::Value *Fini = emitReduceFiniFunction(CGM, Loc, RCG, Cnt);
+    llvm::Value *FiniAddr = Fini
+                                ? CGF.EmitCastToVoidPtr(Fini)
+                                : llvm::ConstantPointerNull::get(CGM.VoidPtrTy);
+    CGF.EmitStoreOfScalar(FiniAddr, FiniLVal);
+    // ElemLVal.reduce_comb = comb;
+    LValue CombLVal = CGF.EmitLValueForField(ElemLVal, CombFD);
+    llvm::Value *CombAddr = CGF.EmitCastToVoidPtr(emitReduceCombFunction(
+        CGM, Loc, RCG, Cnt, Data.ReductionOps[Cnt], LHSExprs[Cnt],
+        RHSExprs[Cnt], Data.ReductionCopies[Cnt]));
+    CGF.EmitStoreOfScalar(CombAddr, CombLVal);
+    // ElemLVal.flags = 0;
+    LValue FlagsLVal = CGF.EmitLValueForField(ElemLVal, FlagsFD);
+    if (DelayedCreation) {
+      CGF.EmitStoreOfScalar(
+          llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1, /*IsSigned=*/true),
+          FlagsLVal);
+    } else
+      CGF.EmitNullInitialization(FlagsLVal.getAddress(), FlagsLVal.getType());
+  }
+  // Build call void *__kmpc_task_reduction_init(int gtid, int num_data, void
+  // *data);
+  llvm::Value *Args[] = {
+      CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), CGM.IntTy,
+                                /*isSigned=*/true),
+      llvm::ConstantInt::get(CGM.IntTy, Size, /*isSigned=*/true),
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TaskRedInput.getPointer(),
+                                                      CGM.VoidPtrTy)};
+  return CGF.EmitRuntimeCall(
+      createRuntimeFunction(OMPRTL__kmpc_task_reduction_init), Args);
+}
+
+void CGOpenMPRuntime::emitTaskReductionFixups(CodeGenFunction &CGF,
+                                              SourceLocation Loc,
+                                              ReductionCodeGen &RCG,
+                                              unsigned N) {
+  auto Sizes = RCG.getSizes(N);
+  // Emit threadprivate global variable if the type is non-constant
+  // (Sizes.second = nullptr).
+  if (Sizes.second) {
+    llvm::Value *SizeVal = CGF.Builder.CreateIntCast(Sizes.second, CGM.SizeTy,
+                                                     /*isSigned=*/false);
+    Address SizeAddr = getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().getSizeType(),
+        generateUniqueName("reduction_size", Loc, N));
+    CGF.Builder.CreateStore(SizeVal, SizeAddr, /*IsVolatile=*/false);
+  }
+  // Store address of the original reduction item if custom initializer is used.
+  if (RCG.usesReductionInitializer(N)) {
+    Address SharedAddr = getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().VoidPtrTy,
+        generateUniqueName("reduction", Loc, N));
+    CGF.Builder.CreateStore(
+        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+            RCG.getSharedLValue(N).getPointer(), CGM.VoidPtrTy),
+        SharedAddr, /*IsVolatile=*/false);
+  }
+}
+
+Address CGOpenMPRuntime::getTaskReductionItem(CodeGenFunction &CGF,
+                                              SourceLocation Loc,
+                                              llvm::Value *ReductionsPtr,
+                                              LValue SharedLVal) {
+  // Build call void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void
+  // *d);
+  llvm::Value *Args[] = {
+      CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), CGM.IntTy,
+                                /*isSigned=*/true),
+      ReductionsPtr,
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(SharedLVal.getPointer(),
+                                                      CGM.VoidPtrTy)};
+  return Address(
+      CGF.EmitRuntimeCall(
+          createRuntimeFunction(OMPRTL__kmpc_task_reduction_get_th_data), Args),
+      SharedLVal.getAlignment());
+}
+
 void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF,
                                       SourceLocation Loc) {
  if (!CGF.HaveInsertPoint())
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@ -96,10 +96,14 @@ struct OMPTaskDataTy final {
  SmallVector<const Expr *, 4> FirstprivateInits;
  SmallVector<const Expr *, 4> LastprivateVars;
  SmallVector<const Expr *, 4> LastprivateCopies;
+  SmallVector<const Expr *, 4> ReductionVars;
+  SmallVector<const Expr *, 4> ReductionCopies;
+  SmallVector<const Expr *, 4> ReductionOps;
  SmallVector<std::pair<OpenMPDependClauseKind, const Expr *>, 4> Dependences;
  llvm::PointerIntPair<llvm::Value *, 1, bool> Final;
  llvm::PointerIntPair<llvm::Value *, 1, bool> Schedule;
  llvm::PointerIntPair<llvm::Value *, 1, bool> Priority;
+  llvm::Value *Reductions = nullptr;
  unsigned NumberOfParts = 0;
  bool Tied = true;
  bool Nogroup = false;
@ -125,9 +129,10 @@ private:
  /// List of addresses of original shared variables/expressions.
  SmallVector<std::pair<LValue, LValue>, 4> SharedAddresses;
  /// Sizes of the reduction items in chars.
-  SmallVector<llvm::Value *, 4> Sizes;
+  SmallVector<std::pair<llvm::Value *, llvm::Value *>, 4> Sizes;
  /// Base declarations for the reduction items.
  SmallVector<const VarDecl *, 4> BaseDecls;
+
  /// Emits lvalue for shared expresion.
  LValue emitSharedLValue(CodeGenFunction &CGF, const Expr *E);
  /// Emits upper bound for shared expression (if array section).
@ -135,9 +140,11 @@ private:
  /// Performs aggregate initialization.
  /// \param N Number of reduction item in the common list.
  /// \param PrivateAddr Address of the corresponding private item.
-  /// \param SharedLVal Addreiss of the original shared variable.
+  /// \param SharedLVal Address of the original shared variable.
+  /// \param DRD Declare reduction construct used for reduction item.
  void emitAggregateInitialization(CodeGenFunction &CGF, unsigned N,
-                                   Address PrivateAddr, LValue SharedLVal);
+                                   Address PrivateAddr, LValue SharedLVal,
+                                   const OMPDeclareReductionDecl *DRD);

 public:
  ReductionCodeGen(ArrayRef<const Expr *> Shareds,
@ -177,11 +184,16 @@ public:
                               Address PrivateAddr);
  /// Returns LValue for the reduction item.
  LValue getSharedLValue(unsigned N) const { return SharedAddresses[N].first; }
-  /// Returns the size of the reduction item in chars, or nullptr, if the size
-  /// is a constant.
-  llvm::Value *getSizeInChars(unsigned N) const { return Sizes[N]; }
+  /// Returns the size of the reduction item (in chars and total number of
+  /// elements in the item), or nullptr, if the size is a constant.
+  std::pair<llvm::Value *, llvm::Value *> getSizes(unsigned N) const {
+    return Sizes[N];
+  }
  /// Returns the base declaration of the reduction item.
  const VarDecl *getBaseDecl(unsigned N) const { return BaseDecls[N]; }
+  /// Returns true if the initialization of the reduction item uses initializer
+  /// from declare reduction construct.
+  bool usesReductionInitializer(unsigned N) const;
 };

 class CGOpenMPRuntime {
@ -923,6 +935,14 @@ public:
                                 SourceLocation Loc, bool PerformInit,
                                 CodeGenFunction *CGF = nullptr);

+  /// Creates artificial threadprivate variable with name \p Name and type \p
+  /// VarType.
+  /// \param VarType Type of the artificial threadprivate variable.
+  /// \param Name Name of the artificial threadprivate variable.
+  virtual Address getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF,
+                                                   QualType VarType,
+                                                   StringRef Name);
+
  /// \brief Emit flush of the variables specified in 'omp flush' directive.
  /// \param Vars List of variables to flush.
  virtual void emitFlush(CodeGenFunction &CGF, ArrayRef<const Expr *> Vars,
@ -1081,6 +1101,51 @@ public:
                             ArrayRef<const Expr *> ReductionOps,
                             ReductionOptionsTy Options);

+  /// Emit a code for initialization of task reduction clause. Next code
+  /// should be emitted for reduction:
+  /// \code
+  ///
+  /// _task_red_item_t red_data[n];
+  /// ...
+  /// red_data[i].shar = &origs[i];
+  /// red_data[i].size = sizeof(origs[i]);
+  /// red_data[i].f_init = (void*)RedInit<i>;
+  /// red_data[i].f_fini = (void*)RedDest<i>;
+  /// red_data[i].f_comb = (void*)RedOp<i>;
+  /// red_data[i].flags = <Flag_i>;
+  /// ...
+  /// void* tg1 = __kmpc_task_reduction_init(gtid, n, red_data);
+  /// \endcode
+  ///
+  /// \param LHSExprs List of LHS in \a Data.ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a Data.ReductionOps reduction operations.
+  /// \param Data Additional data for task generation like tiedness, final
+  /// state, list of privates, reductions etc.
+  virtual llvm::Value *emitTaskReductionInit(CodeGenFunction &CGF,
+                                             SourceLocation Loc,
+                                             ArrayRef<const Expr *> LHSExprs,
+                                             ArrayRef<const Expr *> RHSExprs,
+                                             const OMPTaskDataTy &Data);
+
+  /// Required to resolve existing problems in the runtime. Emits threadprivate
+  /// variables to store the size of the VLAs/array sections for
+  /// initializer/combiner/finalizer functions + emits threadprivate variable to
+  /// store the pointer to the original reduction item for the custom
+  /// initializer defined by declare reduction construct.
+  /// \param RCG Allows to reuse an existing data for the reductions.
+  /// \param N Reduction item for which fixups must be emitted.
+  virtual void emitTaskReductionFixups(CodeGenFunction &CGF, SourceLocation Loc,
+                                       ReductionCodeGen &RCG, unsigned N);
+
+  /// Get the address of `void *` type of the privatue copy of the reduction
+  /// item specified by the \p SharedLVal.
+  /// \param ReductionsPtr Pointer to the reduction data returned by the
+  /// emitTaskReductionInit function.
+  /// \param SharedLVal Address of the original reduction item.
+  virtual Address getTaskReductionItem(CodeGenFunction &CGF, SourceLocation Loc,
+                                       llvm::Value *ReductionsPtr,
+                                       LValue SharedLVal);
+
  /// \brief Emit code for 'taskwait' directive.
  virtual void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc);

--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@ -2697,11 +2697,32 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
      ++ID;
    }
  }
+  SmallVector<const Expr *, 4> LHSs;
+  SmallVector<const Expr *, 4> RHSs;
+  for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
+    auto IPriv = C->privates().begin();
+    auto IRed = C->reduction_ops().begin();
+    auto ILHS = C->lhs_exprs().begin();
+    auto IRHS = C->rhs_exprs().begin();
+    for (const auto *Ref : C->varlists()) {
+      Data.ReductionVars.emplace_back(Ref);
+      Data.ReductionCopies.emplace_back(*IPriv);
+      Data.ReductionOps.emplace_back(*IRed);
+      LHSs.emplace_back(*ILHS);
+      RHSs.emplace_back(*IRHS);
+      std::advance(IPriv, 1);
+      std::advance(IRed, 1);
+      std::advance(ILHS, 1);
+      std::advance(IRHS, 1);
+    }
+  }
+  Data.Reductions = CGM.getOpenMPRuntime().emitTaskReductionInit(
+      *this, S.getLocStart(), LHSs, RHSs, Data);
  // Build list of dependences.
  for (const auto *C : S.getClausesOfKind<OMPDependClause>())
    for (auto *IRef : C->varlists())
      Data.Dependences.push_back(std::make_pair(C->getDependencyKind(), IRef));
-  auto &&CodeGen = [&Data, CS, &BodyGen, &LastprivateDstsOrigs](
+  auto &&CodeGen = [&Data, &S, CS, &BodyGen, &LastprivateDstsOrigs](
      CodeGenFunction &CGF, PrePostActionTy &Action) {
    // Set proper addresses for generated private copies.
    OMPPrivateScope Scope(CGF);
@ -2756,6 +2777,34 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
        Scope.addPrivate(Pair.first, [Replacement]() { return Replacement; });
      }
    }
+    if (Data.Reductions) {
+      OMPLexicalScope LexScope(CGF, S, /*AsInlined=*/true);
+      ReductionCodeGen RedCG(Data.ReductionVars, Data.ReductionCopies,
+                             Data.ReductionOps);
+      llvm::Value *ReductionsPtr = CGF.Builder.CreateLoad(
+          CGF.GetAddrOfLocalVar(CS->getCapturedDecl()->getParam(9)));
+      for (unsigned Cnt = 0, E = Data.ReductionVars.size(); Cnt < E; ++Cnt) {
+        RedCG.emitSharedLValue(CGF, Cnt);
+        RedCG.emitAggregateType(CGF, Cnt);
+        Address Replacement = CGF.CGM.getOpenMPRuntime().getTaskReductionItem(
+            CGF, S.getLocStart(), ReductionsPtr, RedCG.getSharedLValue(Cnt));
+        Replacement =
+            Address(CGF.EmitScalarConversion(
+                        Replacement.getPointer(), CGF.getContext().VoidPtrTy,
+                        CGF.getContext().getPointerType(
+                            Data.ReductionCopies[Cnt]->getType()),
+                        SourceLocation()),
+                    Replacement.getAlignment());
+        Replacement = RedCG.adjustPrivateAddress(CGF, Cnt, Replacement);
+        Scope.addPrivate(RedCG.getBaseDecl(Cnt),
+                         [Replacement]() { return Replacement; });
+        // FIXME: This must removed once the runtime library is fixed.
+        // Emit required threadprivate variables for
+        // initilizer/combiner/finalizer.
+        CGF.CGM.getOpenMPRuntime().emitTaskReductionFixups(CGF, S.getLocStart(),
+                                                           RedCG, Cnt);
+      }
+    }
    (void)Scope.Privatize();

    Action.Enter(CGF);
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@ -1807,6 +1807,8 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
        std::make_pair(".lb.", KmpUInt64Ty),
        std::make_pair(".ub.", KmpUInt64Ty), std::make_pair(".st.", KmpInt64Ty),
        std::make_pair(".liter.", KmpInt32Ty),
+        std::make_pair(".reductions.",
+                       Context.VoidPtrTy.withConst().withRestrict()),
        std::make_pair(StringRef(), QualType()) // __context with shared vars
    };
    ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
--- a/clang/test/OpenMP/for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen.cpp
@ -701,7 +701,7 @@ int main() {

 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],

-// CHECK: [[ARR_SIZE:%.+]] = mul nuw i64 %{{.+}}, 4
+// CHECK: [[ARR_SIZE:%.+]] = udiv exact i64
 // CHECK: call i8* @llvm.stacksave()
 // CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],

--- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
@ -505,7 +505,7 @@ int main() {

 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],

-// CHECK: [[ARR_SIZE:%.+]] = mul nuw i64 %{{.+}}, 4
+// CHECK: [[ARR_SIZE:%.+]] = udiv exact i64
 // CHECK: call i8* @llvm.stacksave()
 // CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],

--- a/clang/test/OpenMP/taskloop_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_codegen.cpp
@ -9,7 +9,7 @@
 int main(int argc, char **argv) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
 // CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 80, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
@ -24,7 +24,7 @@ int main(int argc, char **argv) {
 #pragma omp taskloop priority(argc)
  for (int i = 0; i < 10; ++i)
    ;
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
@ -40,7 +40,7 @@ int main(int argc, char **argv) {
  for (int i = 0; i < 10; ++i)
    ;
 // CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0
@ -144,7 +144,7 @@ struct S {
  int a;
  S(int c) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
--- a/clang/test/OpenMP/taskloop_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_firstprivate_codegen.cpp
@ -23,7 +23,7 @@ struct S {

 volatile double g;

-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
@ -57,7 +57,7 @@ int main() {
  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
  [&]() {
  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
 // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
 // LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
@ -101,7 +101,7 @@ int main() {
  // BLOCKS: call void {{%.+}}(i8
  ^{
  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
  // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
@ -190,7 +190,7 @@ int main() {
 //         [[KMP_TASK_T]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 120, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*

 // Fill kmp_task_t->shareds by copying from original capture argument.
--- a/clang/test/OpenMP/taskloop_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_lastprivate_codegen.cpp
@ -23,7 +23,7 @@ struct S {

 volatile double g;

-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
@ -57,7 +57,7 @@ int main() {
  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
  [&]() {
  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1

 // LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
@ -98,7 +98,7 @@ int main() {
  // BLOCKS: call void {{%.+}}(i8
  ^{
  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
  // BLOCKS: ret
@ -182,7 +182,7 @@ int main() {
 //         [[KMP_TASK_T]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 120, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*

 // Fill kmp_task_t->shareds by copying from original capture argument.
--- a/clang/test/OpenMP/taskloop_private_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_private_codegen.cpp
@ -23,7 +23,7 @@ struct S {

 volatile double g;

-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
@ -55,7 +55,7 @@ int main() {
  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
  [&]() {
  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
 // LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
 // LAMBDA: ret
@ -91,7 +91,7 @@ int main() {
  // BLOCKS: call void {{%.+}}(i8
  ^{
  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
  // BLOCKS: ret
@ -157,7 +157,7 @@ int main() {
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 120, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*

 // CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
--- a/clang/test/OpenMP/taskloop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_reduction_codegen.cpp
@ -0,0 +1,235 @@
+// RUN: %clang_cc1 -fopenmp -x c++ %s -verify -debug-info-kind=limited -emit-llvm -o - -triple powerpc64le-unknown-linux-gnu | FileCheck %s
+// expected-no-diagnostics
+
+struct S {
+  float a;
+  S() : a(0.0f) {}
+  ~S() {}
+};
+
+#pragma omp declare reduction(+:S:omp_out.a += omp_in.a) initializer(omp_priv = omp_orig)
+
+float g;
+
+int a;
+#pragma omp threadprivate(a)
+int main (int argc, char *argv[])
+{
+int   i, n;
+float a[100], b[100], sum, e[argc + 100];
+S c[100];
+float &d = g;
+
+/* Some initializations */
+n = 100;
+for (i=0; i < n; i++)
+  a[i] = b[i] = i * 1.0;
+sum = 0.0;
+
+#pragma omp taskloop reduction(+:sum, c[:n], d, e)
+  for (i=0; i < n; i++) {
+    sum = sum + (a[i] * b[i]);
+    c[i].a = i*i;
+    d += i*i;
+    e[i] = i;
+  }
+
+}
+
+// CHECK-LABEL: @main(
+// CHECK:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK:    [[ARGV_ADDR:%.*]] = alloca i8**, align 8
+// CHECK:    [[I:%.*]] = alloca i32, align 4
+// CHECK:    [[N:%.*]] = alloca i32, align 4
+// CHECK:    [[A:%.*]] = alloca [100 x float], align 4
+// CHECK:    [[B:%.*]] = alloca [100 x float], align 4
+// CHECK:    [[SUM:%.*]] = alloca float, align 4
+// CHECK:    [[SAVED_STACK:%.*]] = alloca i8*, align 8
+// CHECK:    [[C:%.*]] = alloca [100 x %struct.S], align 4
+// CHECK:    [[D:%.*]] = alloca float*, align 8
+// CHECK:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%ident_t*
+// CHECK:    [[DOTRD_INPUT_:%.*]] = alloca [4 x %struct.kmp_task_red_input_t], align 8
+// CHECK:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK:    [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK:    store i32 0, i32* [[RETVAL]], align 4
+// CHECK:    store i32 [[ARGC:%.*]], i32* [[ARGC_ADDR]], align 4
+// CHECK:    store i8** [[ARGV:%.*]], i8*** [[ARGV_ADDR]], align 8
+// CHECK:    [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 100
+// CHECK:    [[TMP2:%.*]] = zext i32 [[ADD]] to i64
+// CHECK:    [[VLA:%.+]] = alloca float, i64 %
+
+// CHECK:    call void @__kmpc_taskgroup(%ident_t*
+// CHECK:    [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 0
+// CHECK:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T:%.*]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 0
+// CHECK:    [[TMP21:%.*]] = bitcast float* [[SUM]] to i8*
+// CHECK:    store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 1
+// CHECK:    store i64 4, i64* [[TMP22]], align 8
+// CHECK:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT1:@.+]] to i8*), i8** [[TMP23]], align 8
+// CHECK:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 3
+// CHECK:    store i8* null, i8** [[TMP24]], align 8
+// CHECK:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB1:@.+]] to i8*), i8** [[TMP25]], align 8
+// CHECK:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 5
+// CHECK:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to i8*
+// CHECK:    call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK:    [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 1
+// CHECK:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 0
+// CHECK:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 0
+// CHECK:    [[TMP29:%.*]] = load i32, i32* [[N]], align 4
+// CHECK:    [[TMP30:%.*]] = sext i32 [[TMP29]] to i64
+// CHECK:    [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP30]]
+// CHECK:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 [[LB_ADD_LEN]]
+// CHECK:    [[TMP31:%.*]] = bitcast %struct.S* [[ARRAYIDX5]] to i8*
+// CHECK:    store i8* [[TMP31]], i8** [[TMP28]], align 8
+// CHECK:    [[TMP32:%.*]] = ptrtoint %struct.S* [[ARRAYIDX6]] to i64
+// CHECK:    [[TMP33:%.*]] = ptrtoint %struct.S* [[ARRAYIDX5]] to i64
+// CHECK:    [[TMP34:%.*]] = sub i64 [[TMP32]], [[TMP33]]
+// CHECK:    [[TMP35:%.*]] = sdiv exact i64 [[TMP34]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK:    [[TMP36:%.*]] = add nuw i64 [[TMP35]], 1
+// CHECK:    [[TMP37:%.*]] = mul nuw i64 [[TMP36]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 1
+// CHECK:    store i64 [[TMP37]], i64* [[TMP38]], align 8
+// CHECK:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT2:@.+]] to i8*), i8** [[TMP39]], align 8
+// CHECK:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 3
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_FINI2:@.+]] to i8*), i8** [[TMP40]], align 8
+// CHECK:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB2:@.+]] to i8*), i8** [[TMP41]], align 8
+// CHECK:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 5
+// CHECK:    store i32 1, i32* [[TMP42]], align 8
+// CHECK:    [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 2
+// CHECK:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 0
+// CHECK:    [[TMP44:%.*]] = load float*, float** [[D]], align 8
+// CHECK:    [[TMP45:%.*]] = bitcast float* [[TMP44]] to i8*
+// CHECK:    store i8* [[TMP45]], i8** [[TMP43]], align 8
+// CHECK:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 1
+// CHECK:    store i64 4, i64* [[TMP46]], align 8
+// CHECK:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT3:@.+]] to i8*), i8** [[TMP47]], align 8
+// CHECK:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 3
+// CHECK:    store i8* null, i8** [[TMP48]], align 8
+// CHECK:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB3:@.+]] to i8*), i8** [[TMP49]], align 8
+// CHECK:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 5
+// CHECK:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to i8*
+// CHECK:    call void @llvm.memset.p0i8.i64(i8* [[TMP51]], i8 0, i64 4, i32 8, i1 false)
+// CHECK:    [[DOTRD_INPUT_GEP_8:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 3
+// CHECK:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 0
+// CHECK:    [[TMP53:%.*]] = bitcast float* [[VLA]] to i8*
+// CHECK:    store i8* [[TMP53]], i8** [[TMP52]], align 8
+// CHECK:    [[TMP54:%.*]] = mul nuw i64 [[TMP2]], 4
+// CHECK:    [[TMP55:%.*]] = udiv exact i64 [[TMP54]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 1
+// CHECK:    store i64 [[TMP54]], i64* [[TMP56]], align 8
+// CHECK:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT4:@.+]] to i8*), i8** [[TMP57]], align 8
+// CHECK:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 3
+// CHECK:    store i8* null, i8** [[TMP58]], align 8
+// CHECK:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB4:@.+]] to i8*), i8** [[TMP59]], align 8
+// CHECK:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 5
+// CHECK:    store i32 1, i32* [[TMP60]], align 8
+// CHECK:    [[TMP61:%.*]] = bitcast [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]] to i8*
+// CHECK:    [[TMP62:%.*]] = call i8* @__kmpc_task_reduction_init(i32 [[TMP0]], i32 4, i8* [[TMP61]])
+// CHECK:    [[TMP63:%.*]] = load i32, i32* [[N]], align 4
+// CHECK:    store i32 [[TMP63]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK:    [[TMP64:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK:    [[SUB:%.*]] = sub nsw i32 [[TMP64]], 0
+// CHECK:    [[SUB10:%.*]] = sub nsw i32 [[SUB]], 1
+// CHECK:    [[ADD11:%.*]] = add nsw i32 [[SUB10]], 1
+// CHECK:    [[DIV:%.*]] = sdiv i32 [[ADD11]], 1
+// CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 72, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @{{.+}} to i32 (i32, i8*)*))
+// CHECK:    [[TMP66:%.*]] = bitcast i8* [[TMP65]] to %struct.kmp_task_t_with_privates*
+// CHECK:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP66]], i32 0, i32 0
+// CHECK:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 0
+// CHECK:    [[TMP69:%.*]] = load i8*, i8** [[TMP68]], align 8
+// CHECK:    [[TMP70:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
+// CHECK:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP69]], i8* [[TMP70]], i64 72, i32 8, i1 false)
+// CHECK:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP66]], i32 0, i32 1
+// CHECK:    [[TMP72:%.*]] = bitcast i8* [[TMP69]] to %struct.anon*
+// CHECK:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 0
+// CHECK:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP72]], i32 0, i32 1
+// CHECK:    [[REF:%.*]] = load i32*, i32** [[TMP74]], align 8
+// CHECK:    [[TMP75:%.*]] = load i32, i32* [[REF]], align 4
+// CHECK:    store i32 [[TMP75]], i32* [[TMP73]], align 8
+// CHECK:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 1
+// CHECK:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP72]], i32 0, i32 3
+// CHECK:    [[REF13:%.*]] = load [100 x float]*, [100 x float]** [[TMP77]], align 8
+// CHECK:    [[TMP78:%.*]] = bitcast [100 x float]* [[TMP76]] to i8*
+// CHECK:    [[TMP79:%.*]] = bitcast [100 x float]* [[REF13]] to i8*
+// CHECK:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP78]], i8* [[TMP79]], i64 400, i32 4, i1 false)
+// CHECK:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 2
+// CHECK:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP72]], i32 0, i32 4
+// CHECK:    [[REF14:%.*]] = load [100 x float]*, [100 x float]** [[TMP81]], align 8
+// CHECK:    [[TMP82:%.*]] = bitcast [100 x float]* [[TMP80]] to i8*
+// CHECK:    [[TMP83:%.*]] = bitcast [100 x float]* [[REF14]] to i8*
+// CHECK:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP82]], i8* [[TMP83]], i64 400, i32 4, i1 false)
+// CHECK:    [[TMP84:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 5
+// CHECK:    store i64 0, i64* [[TMP84]], align 8
+// CHECK:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 6
+// CHECK:    [[TMP86:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK:    [[CONV15:%.*]] = sext i32 [[TMP86]] to i64
+// CHECK:    store i64 [[CONV15]], i64* [[TMP85]], align 8
+// CHECK:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 7
+// CHECK:    store i64 1, i64* [[TMP87]], align 8
+// CHECK:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 9
+// CHECK:    store i8* [[TMP62]], i8** [[TMP88]], align 8
+// CHECK:    [[TMP89:%.*]] = load i64, i64* [[TMP87]], align 8
+// CHECK:    call void @__kmpc_taskloop(%ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* [[TMP84]], i64* [[TMP85]], i64 [[TMP89]], i32 0, i32 0, i64 0, i8* null)
+// CHECK:    call void @__kmpc_end_taskgroup(%ident_t*
+
+// CHECK:    ret i32
+
+// CHECK: define internal void [[RED_INIT1]](i8*)
+// CHECK: store float 0.000000e+00, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB1]](i8*, i8*)
+// CHECK: fadd float %6, %7
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_INIT2]](i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_FINI2]](i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: call void @
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB2]](i8*, i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: fadd float %
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_INIT3]](i8*)
+// CHECK: store float 0.000000e+00, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB3]](i8*, i8*)
+// CHECK: fadd float %
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_INIT4]](i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: store float 0.000000e+00, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB4]](i8*, i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: fadd float %
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+
--- a/clang/test/OpenMP/taskloop_simd_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_codegen.cpp
@ -9,7 +9,7 @@
 int main(int argc, char **argv) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
 // CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 80, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
@ -24,7 +24,7 @@ int main(int argc, char **argv) {
 #pragma omp taskloop simd priority(argc)
  for (int i = 0; i < 10; ++i)
    ;
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
@ -40,7 +40,7 @@ int main(int argc, char **argv) {
  for (int i = 0; i < 10; ++i)
    ;
 // CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0
@ -146,7 +146,7 @@ struct S {
  int a;
  S(int c) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
-// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
 // CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
--- a/clang/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
@ -23,7 +23,7 @@ struct S {

 volatile double g;

-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
@ -57,7 +57,7 @@ int main() {
  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
  [&]() {
  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
 // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
 // LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
@ -101,7 +101,7 @@ int main() {
  // BLOCKS: call void {{%.+}}(i8
  ^{
  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
  // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
@ -190,7 +190,7 @@ int main() {
 //         [[KMP_TASK_T]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 120, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*

 // Fill kmp_task_t->shareds by copying from original capture argument.
--- a/clang/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
@ -23,7 +23,7 @@ struct S {

 volatile double g;

-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
@ -57,7 +57,7 @@ int main() {
  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
  [&]() {
  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1

 // LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
@ -98,7 +98,7 @@ int main() {
  // BLOCKS: call void {{%.+}}(i8
  ^{
  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
  // BLOCKS: ret
@ -182,7 +182,7 @@ int main() {
 //         [[KMP_TASK_T]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 120, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*

 // Fill kmp_task_t->shareds by copying from original capture argument.
--- a/clang/test/OpenMP/taskloop_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_private_codegen.cpp
@ -23,7 +23,7 @@ struct S {

 volatile double g;

-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32, i8* }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
@ -55,7 +55,7 @@ int main() {
  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
  [&]() {
  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
 // LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
 // LAMBDA: ret
@ -91,7 +91,7 @@ int main() {
  // BLOCKS: call void {{%.+}}(i8
  ^{
  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 96, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
  // BLOCKS: ret
@ -157,7 +157,7 @@ int main() {
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 120, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*

 // CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
--- a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp
@ -0,0 +1,235 @@
+// RUN: %clang_cc1 -fopenmp -x c++ %s -verify -debug-info-kind=limited -emit-llvm -o - -triple powerpc64le-unknown-linux-gnu | FileCheck %s
+// expected-no-diagnostics
+
+struct S {
+  float a;
+  S() : a(0.0f) {}
+  ~S() {}
+};
+
+#pragma omp declare reduction(+:S:omp_out.a += omp_in.a) initializer(omp_priv = omp_orig)
+
+float g;
+
+int a;
+#pragma omp threadprivate(a)
+int main (int argc, char *argv[])
+{
+int   i, n;
+float a[100], b[100], sum, e[argc + 100];
+S c[100];
+float &d = g;
+
+/* Some initializations */
+n = 100;
+for (i=0; i < n; i++)
+  a[i] = b[i] = i * 1.0;
+sum = 0.0;
+
+#pragma omp taskloop simd reduction(+:sum, c[:n], d, e)
+  for (i=0; i < n; i++) {
+    sum = sum + (a[i] * b[i]);
+    c[i].a = i*i;
+    d += i*i;
+    e[i] = i;
+  }
+
+}
+
+// CHECK-LABEL: @main(
+// CHECK:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK:    [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK:    [[ARGV_ADDR:%.*]] = alloca i8**, align 8
+// CHECK:    [[I:%.*]] = alloca i32, align 4
+// CHECK:    [[N:%.*]] = alloca i32, align 4
+// CHECK:    [[A:%.*]] = alloca [100 x float], align 4
+// CHECK:    [[B:%.*]] = alloca [100 x float], align 4
+// CHECK:    [[SUM:%.*]] = alloca float, align 4
+// CHECK:    [[SAVED_STACK:%.*]] = alloca i8*, align 8
+// CHECK:    [[C:%.*]] = alloca [100 x %struct.S], align 4
+// CHECK:    [[D:%.*]] = alloca float*, align 8
+// CHECK:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%ident_t*
+// CHECK:    [[DOTRD_INPUT_:%.*]] = alloca [4 x %struct.kmp_task_red_input_t], align 8
+// CHECK:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK:    [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK:    store i32 0, i32* [[RETVAL]], align 4
+// CHECK:    store i32 [[ARGC:%.*]], i32* [[ARGC_ADDR]], align 4
+// CHECK:    store i8** [[ARGV:%.*]], i8*** [[ARGV_ADDR]], align 8
+// CHECK:    [[TMP1:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 100
+// CHECK:    [[TMP2:%.*]] = zext i32 [[ADD]] to i64
+// CHECK:    [[VLA:%.+]] = alloca float, i64 %
+
+// CHECK:    call void @__kmpc_taskgroup(%ident_t*
+// CHECK:    [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 0
+// CHECK:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T:%.*]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 0
+// CHECK:    [[TMP21:%.*]] = bitcast float* [[SUM]] to i8*
+// CHECK:    store i8* [[TMP21]], i8** [[TMP20]], align 8
+// CHECK:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 1
+// CHECK:    store i64 4, i64* [[TMP22]], align 8
+// CHECK:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT1:@.+]] to i8*), i8** [[TMP23]], align 8
+// CHECK:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 3
+// CHECK:    store i8* null, i8** [[TMP24]], align 8
+// CHECK:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB1:@.+]] to i8*), i8** [[TMP25]], align 8
+// CHECK:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 5
+// CHECK:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to i8*
+// CHECK:    call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK:    [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 1
+// CHECK:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 0
+// CHECK:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 0
+// CHECK:    [[TMP29:%.*]] = load i32, i32* [[N]], align 4
+// CHECK:    [[TMP30:%.*]] = sext i32 [[TMP29]] to i64
+// CHECK:    [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP30]]
+// CHECK:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 [[LB_ADD_LEN]]
+// CHECK:    [[TMP31:%.*]] = bitcast %struct.S* [[ARRAYIDX5]] to i8*
+// CHECK:    store i8* [[TMP31]], i8** [[TMP28]], align 8
+// CHECK:    [[TMP32:%.*]] = ptrtoint %struct.S* [[ARRAYIDX6]] to i64
+// CHECK:    [[TMP33:%.*]] = ptrtoint %struct.S* [[ARRAYIDX5]] to i64
+// CHECK:    [[TMP34:%.*]] = sub i64 [[TMP32]], [[TMP33]]
+// CHECK:    [[TMP35:%.*]] = sdiv exact i64 [[TMP34]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK:    [[TMP36:%.*]] = add nuw i64 [[TMP35]], 1
+// CHECK:    [[TMP37:%.*]] = mul nuw i64 [[TMP36]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 1
+// CHECK:    store i64 [[TMP37]], i64* [[TMP38]], align 8
+// CHECK:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT2:@.+]] to i8*), i8** [[TMP39]], align 8
+// CHECK:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 3
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_FINI2:@.+]] to i8*), i8** [[TMP40]], align 8
+// CHECK:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB2:@.+]] to i8*), i8** [[TMP41]], align 8
+// CHECK:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_4]], i32 0, i32 5
+// CHECK:    store i32 1, i32* [[TMP42]], align 8
+// CHECK:    [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 2
+// CHECK:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 0
+// CHECK:    [[TMP44:%.*]] = load float*, float** [[D]], align 8
+// CHECK:    [[TMP45:%.*]] = bitcast float* [[TMP44]] to i8*
+// CHECK:    store i8* [[TMP45]], i8** [[TMP43]], align 8
+// CHECK:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 1
+// CHECK:    store i64 4, i64* [[TMP46]], align 8
+// CHECK:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT3:@.+]] to i8*), i8** [[TMP47]], align 8
+// CHECK:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 3
+// CHECK:    store i8* null, i8** [[TMP48]], align 8
+// CHECK:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB3:@.+]] to i8*), i8** [[TMP49]], align 8
+// CHECK:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 5
+// CHECK:    [[TMP51:%.*]] = bitcast i32* [[TMP50]] to i8*
+// CHECK:    call void @llvm.memset.p0i8.i64(i8* [[TMP51]], i8 0, i64 4, i32 8, i1 false)
+// CHECK:    [[DOTRD_INPUT_GEP_8:%.*]] = getelementptr inbounds [4 x %struct.kmp_task_red_input_t], [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]], i64 0, i64 3
+// CHECK:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 0
+// CHECK:    [[TMP53:%.*]] = bitcast float* [[VLA]] to i8*
+// CHECK:    store i8* [[TMP53]], i8** [[TMP52]], align 8
+// CHECK:    [[TMP54:%.*]] = mul nuw i64 [[TMP2]], 4
+// CHECK:    [[TMP55:%.*]] = udiv exact i64 [[TMP54]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 1
+// CHECK:    store i64 [[TMP54]], i64* [[TMP56]], align 8
+// CHECK:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 2
+// CHECK:    store i8* bitcast (void (i8*)* [[RED_INIT4:@.+]] to i8*), i8** [[TMP57]], align 8
+// CHECK:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 3
+// CHECK:    store i8* null, i8** [[TMP58]], align 8
+// CHECK:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 4
+// CHECK:    store i8* bitcast (void (i8*, i8*)* [[RED_COMB4:@.+]] to i8*), i8** [[TMP59]], align 8
+// CHECK:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8]], i32 0, i32 5
+// CHECK:    store i32 1, i32* [[TMP60]], align 8
+// CHECK:    [[TMP61:%.*]] = bitcast [4 x %struct.kmp_task_red_input_t]* [[DOTRD_INPUT_]] to i8*
+// CHECK:    [[TMP62:%.*]] = call i8* @__kmpc_task_reduction_init(i32 [[TMP0]], i32 4, i8* [[TMP61]])
+// CHECK:    [[TMP63:%.*]] = load i32, i32* [[N]], align 4
+// CHECK:    store i32 [[TMP63]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK:    [[TMP64:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK:    [[SUB:%.*]] = sub nsw i32 [[TMP64]], 0
+// CHECK:    [[SUB10:%.*]] = sub nsw i32 [[SUB]], 1
+// CHECK:    [[ADD11:%.*]] = add nsw i32 [[SUB10]], 1
+// CHECK:    [[DIV:%.*]] = sdiv i32 [[ADD11]], 1
+// CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 72, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @{{.+}} to i32 (i32, i8*)*))
+// CHECK:    [[TMP66:%.*]] = bitcast i8* [[TMP65]] to %struct.kmp_task_t_with_privates*
+// CHECK:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP66]], i32 0, i32 0
+// CHECK:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 0
+// CHECK:    [[TMP69:%.*]] = load i8*, i8** [[TMP68]], align 8
+// CHECK:    [[TMP70:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
+// CHECK:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP69]], i8* [[TMP70]], i64 72, i32 8, i1 false)
+// CHECK:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], %struct.kmp_task_t_with_privates* [[TMP66]], i32 0, i32 1
+// CHECK:    [[TMP72:%.*]] = bitcast i8* [[TMP69]] to %struct.anon*
+// CHECK:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 0
+// CHECK:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP72]], i32 0, i32 1
+// CHECK:    [[REF:%.*]] = load i32*, i32** [[TMP74]], align 8
+// CHECK:    [[TMP75:%.*]] = load i32, i32* [[REF]], align 4
+// CHECK:    store i32 [[TMP75]], i32* [[TMP73]], align 8
+// CHECK:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 1
+// CHECK:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP72]], i32 0, i32 3
+// CHECK:    [[REF13:%.*]] = load [100 x float]*, [100 x float]** [[TMP77]], align 8
+// CHECK:    [[TMP78:%.*]] = bitcast [100 x float]* [[TMP76]] to i8*
+// CHECK:    [[TMP79:%.*]] = bitcast [100 x float]* [[REF13]] to i8*
+// CHECK:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP78]], i8* [[TMP79]], i64 400, i32 4, i1 false)
+// CHECK:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 2
+// CHECK:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP72]], i32 0, i32 4
+// CHECK:    [[REF14:%.*]] = load [100 x float]*, [100 x float]** [[TMP81]], align 8
+// CHECK:    [[TMP82:%.*]] = bitcast [100 x float]* [[TMP80]] to i8*
+// CHECK:    [[TMP83:%.*]] = bitcast [100 x float]* [[REF14]] to i8*
+// CHECK:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP82]], i8* [[TMP83]], i64 400, i32 4, i1 false)
+// CHECK:    [[TMP84:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 5
+// CHECK:    store i64 0, i64* [[TMP84]], align 8
+// CHECK:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 6
+// CHECK:    [[TMP86:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK:    [[CONV15:%.*]] = sext i32 [[TMP86]] to i64
+// CHECK:    store i64 [[CONV15]], i64* [[TMP85]], align 8
+// CHECK:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 7
+// CHECK:    store i64 1, i64* [[TMP87]], align 8
+// CHECK:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 9
+// CHECK:    store i8* [[TMP62]], i8** [[TMP88]], align 8
+// CHECK:    [[TMP89:%.*]] = load i64, i64* [[TMP87]], align 8
+// CHECK:    call void @__kmpc_taskloop(%ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* [[TMP84]], i64* [[TMP85]], i64 [[TMP89]], i32 0, i32 0, i64 0, i8* null)
+// CHECK:    call void @__kmpc_end_taskgroup(%ident_t*
+
+// CHECK:    ret i32
+
+// CHECK: define internal void [[RED_INIT1]](i8*)
+// CHECK: store float 0.000000e+00, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB1]](i8*, i8*)
+// CHECK: fadd float %6, %7
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_INIT2]](i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_FINI2]](i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: call void @
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB2]](i8*, i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: fadd float %
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_INIT3]](i8*)
+// CHECK: store float 0.000000e+00, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB3]](i8*, i8*)
+// CHECK: fadd float %
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_INIT4]](i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: store float 0.000000e+00, float* %
+// CHECK: ret void
+
+// CHECK: define internal void [[RED_COMB4]](i8*, i8*)
+// CHECK: call i8* @__kmpc_threadprivate_cached(
+// CHECK: fadd float %
+// CHECK: store float %{{.+}}, float* %
+// CHECK: ret void
+