From 4cb5461daed8ee14f7317cc4c48171b251e18d97 Mon Sep 17 00:00:00 2001
From: Tobias Grosser
Date: Thu, 12 Apr 2012 10:46:55 +0000
Subject: [PATCH] CodeGen: Generate scalar code if vector instructions cannot
 be generated

This fixes two crashes that appeared in case of:

- A load of a non-vectorizable type (e.g. float**)
- An instruction that is not vectorizable (e.g. call)

llvm-svn: 154586
---
 polly/lib/CodeGen/CodeGeneration.cpp        | 75 +++++++++++++++++++--
 polly/test/CodeGen/simple_vec_call.ll       | 43 ++++++++++++
 polly/test/CodeGen/simple_vec_call_2.ll     | 45 +++++++++++++
 polly/test/CodeGen/simple_vec_impossible.ll | 38 +++++++++++
 4 files changed, 194 insertions(+), 7 deletions(-)
 create mode 100644 polly/test/CodeGen/simple_vec_call.ll
 create mode 100644 polly/test/CodeGen/simple_vec_call_2.ll
 create mode 100644 polly/test/CodeGen/simple_vec_impossible.ll

diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp
index 903ed382b78b..e0fc8840774a 100644
--- a/polly/lib/CodeGen/CodeGeneration.cpp
+++ b/polly/lib/CodeGen/CodeGeneration.cpp
@@ -569,6 +569,12 @@ private:
   void copyStore(const StoreInst *Store, ValueMapT &VectorMap,
                  VectorValueMapT &ScalarMaps);
 
+  void copyInstScalarized(const Instruction *Inst, ValueMapT &VectorMap,
+                          VectorValueMapT &ScalarMaps);
+
+  bool extractScalarValues(const Instruction *Inst, ValueMapT &VectorMap,
+                           VectorValueMapT &ScalarMaps);
+
   bool hasVectorOperands(const Instruction *Inst, ValueMapT &VectorMap);
 
   void copyInstruction(const Instruction *Inst, ValueMapT &VectorMap,
@@ -680,18 +686,16 @@ Value *VectorBlockGenerator::generateUnknownStrideLoad(const LoadInst *Load,
 void VectorBlockGenerator::generateLoad(const LoadInst *Load,
                                         ValueMapT &VectorMap,
                                         VectorValueMapT &ScalarMaps) {
-  Value *NewLoad;
-
-  if (GroupedUnrolling) {
+  if (GroupedUnrolling || !VectorType::isValidElementType(Load->getType())) {
     for (int i = 0; i < getVectorWidth(); i++)
       ScalarMaps[i][Load] = generateScalarLoad(Load, ScalarMaps[i],
                                                GlobalMaps[i]);
-
     return;
   }
 
   MemoryAccess &Access = Statement.getAccessFor(Load);
+  Value *NewLoad;
 
   if (Access.isStrideZero(isl_set_copy(Domain)))
     NewLoad = generateStrideZeroLoad(Load, ScalarMaps[0]);
   else if (Access.isStrideOne(isl_set_copy(Domain)))
@@ -772,6 +776,63 @@ bool VectorBlockGenerator::hasVectorOperands(const Instruction *Inst,
   return false;
 }
 
+bool VectorBlockGenerator::extractScalarValues(const Instruction *Inst,
+                                               ValueMapT &VectorMap,
+                                               VectorValueMapT &ScalarMaps) {
+  bool HasVectorOperand = false;
+  int VectorWidth = getVectorWidth();
+
+  for (Instruction::const_op_iterator OI = Inst->op_begin(),
+       OE = Inst->op_end(); OI != OE; ++OI) {
+    ValueMapT::iterator VecOp = VectorMap.find(*OI);
+
+    if (VecOp == VectorMap.end())
+      continue;
+
+    HasVectorOperand = true;
+    Value *NewVector = VecOp->second;
+
+    for (int i = 0; i < VectorWidth; ++i) {
+      ValueMapT &SM = ScalarMaps[i];
+
+      // If there is one scalar extracted, all scalar elements should have
+      // already been extracted by the code here. So no need to check for the
+      // existence of all of them.
+      if (SM.count(*OI))
+        break;
+
+      SM[*OI] = Builder.CreateExtractElement(NewVector, Builder.getInt32(i));
+    }
+  }
+
+  return HasVectorOperand;
+}
+
+void VectorBlockGenerator::copyInstScalarized(const Instruction *Inst,
+                                              ValueMapT &VectorMap,
+                                              VectorValueMapT &ScalarMaps) {
+  bool HasVectorOperand;
+  int VectorWidth = getVectorWidth();
+
+  HasVectorOperand = extractScalarValues(Inst, VectorMap, ScalarMaps);
+
+  for (int VectorLane = 0; VectorLane < getVectorWidth(); VectorLane++)
+    copyInstScalar(Inst, ScalarMaps[VectorLane], GlobalMaps[VectorLane]);
+
+  if (!VectorType::isValidElementType(Inst->getType()) || !HasVectorOperand)
+    return;
+
+  // Make the result available as vector value.
+  VectorType *VectorType = VectorType::get(Inst->getType(), VectorWidth);
+  Value *Vector = UndefValue::get(VectorType);
+
+  for (int i = 0; i < VectorWidth; i++)
+    Vector = Builder.CreateInsertElement(Vector, ScalarMaps[i][Inst],
+                                         Builder.getInt32(i));
+
+  VectorMap[Inst] = Vector;
+}
+
 int VectorBlockGenerator::getVectorWidth() {
   return GlobalMaps.size();
 }
@@ -805,11 +866,11 @@ void VectorBlockGenerator::copyInstruction(const Instruction *Inst,
       return;
     }
 
-    llvm_unreachable("Cannot issue vector code for this instruction");
+    // Fallthrough: We generate scalar instructions if we don't know how to
+    // generate vector code.
   }
 
-  for (int VectorLane = 0; VectorLane < getVectorWidth(); VectorLane++)
-    copyInstScalar(Inst, ScalarMaps[VectorLane], GlobalMaps[VectorLane]);
+  copyInstScalarized(Inst, VectorMap, ScalarMaps);
 }
 
 void VectorBlockGenerator::copyBB() {
diff --git a/polly/test/CodeGen/simple_vec_call.ll b/polly/test/CodeGen/simple_vec_call.ll
new file mode 100644
index 000000000000..af340d427c74
--- /dev/null
+++ b/polly/test/CodeGen/simple_vec_call.ll
@@ -0,0 +1,43 @@
+; RUN: opt %loadPolly -basicaa -polly-codegen -enable-polly-vector -S %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [1024 x float] zeroinitializer, align 16
+@B = common global [1024 x float] zeroinitializer, align 16
+
+declare float @foo(float) readnone
+
+define void @simple_vec_call() nounwind {
+entry:
+  br label %body
+
+body:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar_next, %body ]
+  %scevgep = getelementptr [1024 x float]* @B, i64 0, i64 %indvar
+  %value = load float* getelementptr inbounds ([1024 x float]* @A, i64 0, i64 0), align 16
+  %result = tail call float @foo(float %value) nounwind
+  store float %result, float* %scevgep, align 4
+  %indvar_next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar_next, 4
+  br i1 %exitcond, label %return, label %body
+
+return:
+  ret void
+}
+
+; CHECK: %value_p_splat_one = load <1 x float>* bitcast ([1024 x float]* @A to <1 x float>*), align 8
+; CHECK: %value_p_splat = shufflevector <1 x float> %value_p_splat_one, <1 x float> %value_p_splat_one, <4 x i32> zeroinitializer
+; CHECK: %0 = extractelement <4 x float> %value_p_splat, i32 0
+; CHECK: %1 = extractelement <4 x float> %value_p_splat, i32 1
+; CHECK: %2 = extractelement <4 x float> %value_p_splat, i32 2
+; CHECK: %3 = extractelement <4 x float> %value_p_splat, i32 3
+; CHECK: %p_result = tail call float @foo(float %0) nounwind
+; CHECK: %p_result4 = tail call float @foo(float %1) nounwind
+; CHECK: %p_result5 = tail call float @foo(float %2) nounwind
+; CHECK: %p_result6 = tail call float @foo(float %3) nounwind
+; CHECK: %4 = insertelement <4 x float> undef, float %p_result, i32 0
+; CHECK: %5 = insertelement <4 x float> %4, float %p_result4, i32 1
+; CHECK: %6 = insertelement <4 x float> %5, float %p_result5, i32 2
+; CHECK: %7 = insertelement <4 x float> %6, float %p_result6, i32 3
+; CHECK: %vector_ptr = bitcast float* %p_scevgep to <4 x float>*
+; CHECK: store <4 x float> %7, <4 x float>* %vector_ptr, align 8
diff --git a/polly/test/CodeGen/simple_vec_call_2.ll b/polly/test/CodeGen/simple_vec_call_2.ll
new file mode 100644
index 000000000000..7b8b59817d58
--- /dev/null
+++ b/polly/test/CodeGen/simple_vec_call_2.ll
@@ -0,0 +1,45 @@
+; RUN: opt %loadPolly -basicaa -polly-codegen -enable-polly-vector -dce -S %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [1024 x float] zeroinitializer, align 16
+@B = common global [1024 x float**] zeroinitializer, align 16
+
+declare float** @foo(float) readnone
+
+define void @simple_vec_call() nounwind {
+entry:
+  br label %body
+
+body:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar_next, %body ]
+  %scevgep = getelementptr [1024 x float**]* @B, i64 0, i64 %indvar
+  %value = load float* getelementptr inbounds ([1024 x float]* @A, i64 0, i64 0), align 16
+  %result = tail call float** @foo(float %value) nounwind
+  store float** %result, float*** %scevgep, align 4
+  %indvar_next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar_next, 4
+  br i1 %exitcond, label %return, label %body
+
+return:
+  ret void
+}
+
+; CHECK: %p_scevgep = getelementptr [1024 x float**]* @B, i64 0, i64 0
+; CHECK: %p_scevgep1 = getelementptr [1024 x float**]* @B, i64 0, i64 1
+; CHECK: %p_scevgep2 = getelementptr [1024 x float**]* @B, i64 0, i64 2
+; CHECK: %p_scevgep3 = getelementptr [1024 x float**]* @B, i64 0, i64 3
+; CHECK: %value_p_splat_one = load <1 x float>* bitcast ([1024 x float]* @A to <1 x float>*), align 8
+; CHECK: %value_p_splat = shufflevector <1 x float> %value_p_splat_one, <1 x float> %value_p_splat_one, <4 x i32> zeroinitializer
+; CHECK: %0 = extractelement <4 x float> %value_p_splat, i32 0
+; CHECK: %1 = extractelement <4 x float> %value_p_splat, i32 1
+; CHECK: %2 = extractelement <4 x float> %value_p_splat, i32 2
+; CHECK: %3 = extractelement <4 x float> %value_p_splat, i32 3
+; CHECK: %p_result = tail call float** @foo(float %0) nounwind
+; CHECK: %p_result4 = tail call float** @foo(float %1) nounwind
+; CHECK: %p_result5 = tail call float** @foo(float %2) nounwind
+; CHECK: %p_result6 = tail call float** @foo(float %3) nounwind
+; CHECK: store float** %p_result, float*** %p_scevgep, align 4
+; CHECK: store float** %p_result4, float*** %p_scevgep1, align 4
+; CHECK: store float** %p_result5, float*** %p_scevgep2, align 4
+; CHECK: store float** %p_result6, float*** %p_scevgep3, align 4
diff --git a/polly/test/CodeGen/simple_vec_impossible.ll b/polly/test/CodeGen/simple_vec_impossible.ll
new file mode 100644
index 000000000000..7b59a50fe07a
--- /dev/null
+++ b/polly/test/CodeGen/simple_vec_impossible.ll
@@ -0,0 +1,38 @@
+; RUN: opt %loadPolly -basicaa -polly-codegen -enable-polly-vector -S %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [1024 x float**] zeroinitializer, align 16
+@B = common global [1024 x float**] zeroinitializer, align 16
+
+declare float @foo(float) readnone
+
+define void @simple_vec_call() nounwind {
+entry:
+  br label %body
+
+body:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar_next, %body ]
+  %scevgep = getelementptr [1024 x float**]* @B, i64 0, i64 %indvar
+  %value = load float*** getelementptr inbounds ([1024 x float**]* @A, i64 0, i64 0), align 16
+  store float** %value, float*** %scevgep, align 4
+  %indvar_next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar_next, 4
+  br i1 %exitcond, label %return, label %body
+
+return:
+  ret void
+}
+
+; CHECK: %p_scevgep = getelementptr [1024 x float**]* @B, i64 0, i64 0
+; CHECK: %p_scevgep1 = getelementptr [1024 x float**]* @B, i64 0, i64 1
+; CHECK: %p_scevgep2 = getelementptr [1024 x float**]* @B, i64 0, i64 2
+; CHECK: %p_scevgep3 = getelementptr [1024 x float**]* @B, i64 0, i64 3
+; CHECK: %value_p_scalar_ = load float*** getelementptr inbounds ([1024 x float**]* @A, i64 0, i64 0)
+; CHECK: %value_p_scalar_4 = load float*** getelementptr inbounds ([1024 x float**]* @A, i64 0, i64 0)
+; CHECK: %value_p_scalar_5 = load float*** getelementptr inbounds ([1024 x float**]* @A, i64 0, i64 0)
+; CHECK: %value_p_scalar_6 = load float*** getelementptr inbounds ([1024 x float**]* @A, i64 0, i64 0)
+; CHECK: store float** %value_p_scalar_, float*** %p_scevgep, align 4
+; CHECK: store float** %value_p_scalar_4, float*** %p_scevgep1, align 4
+; CHECK: store float** %value_p_scalar_5, float*** %p_scevgep2, align 4
+; CHECK: store float** %value_p_scalar_6, float*** %p_scevgep3, align 4
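
Editor's note: below is a rough C++-level sketch of the two loop shapes the new tests exercise. It only illustrates the two crash scenarios named in the commit message; the array and function names are made up for this sketch and do not appear in the patch, whose actual tests are the LLVM IR files above.

// Hypothetical source-level equivalents of the new test cases. All names are
// illustrative; the patch itself only contains the .ll tests above.
extern float A[1024];
extern float B[1024];
extern float **C[1024];
extern float **D[1024];

float foo(float);   // opaque function: it has no vector counterpart

// Mirrors simple_vec_call.ll: the call cannot be issued as a vector
// instruction, so with this patch each lane becomes a separate scalar call
// (whose results are re-packed into a <4 x float> when a vector user needs
// them) instead of hitting llvm_unreachable.
void call_in_loop() {
  for (int i = 0; i < 4; ++i)
    B[i] = foo(A[0]);
}

// Mirrors simple_vec_impossible.ll: float** is not a valid vector element
// type, so the loads and stores are now emitted as per-lane scalar accesses.
void pointer_copy_in_loop() {
  for (int i = 0; i < 4; ++i)
    D[i] = C[0];
}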