From d96e427eacdd4d48da3a1f93a64008ef4dadcc8a Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Sun, 5 May 2013 01:54:48 +0000
Subject: [PATCH] LoopVectorize: Add support for floating point min/max
 reductions

Add support for min/max reductions when the "no-nans-fp-math" function
attribute is enabled. This allows us to assume we have ordered floating point
math and treat ordered and unordered predicates equally.

radar://13723044

llvm-svn: 181144
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  91 +++-
 .../LoopVectorize/minmax_reduction.ll         | 480 ++++++++++++++++++
 2 files changed, 549 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9f9058755144..c7ff7c3fceb1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -335,7 +335,7 @@ public:
                             DominatorTree *DT, TargetTransformInfo* TTI,
                             AliasAnalysis *AA, TargetLibraryInfo *TLI)
       : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
-        Induction(0) {}
+        Induction(0), HasFunNoNaNAttr(false) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -347,7 +347,8 @@ public:
     RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
     RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
     RK_FloatAdd,    ///< Sum of floats.
-    RK_FloatMult    ///< Product of floats.
+    RK_FloatMult,   ///< Product of floats.
+    RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
   };
 
   /// This enum represents the kinds of inductions that we support.
@@ -365,7 +366,9 @@ public:
     MRK_UIntMin,
     MRK_UIntMax,
     MRK_SIntMin,
-    MRK_SIntMax
+    MRK_SIntMax,
+    MRK_FloatMin,
+    MRK_FloatMax
   };
 
   /// This POD struct holds information about reduction variables.
@@ -586,6 +589,8 @@ private:
   /// We need to check that all of the pointers in this list are disjoint
   /// at runtime.
   RuntimePointerCheck PtrRtCheck;
+  /// Whether we can assume the absence of NaNs ("no-nans-fp-math" is "true").
+  bool HasFunNoNaNAttr;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1648,6 +1653,8 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
       return Instruction::FAdd;
     case LoopVectorizationLegality::RK_IntegerMinMax:
       return Instruction::ICmp;
+    case LoopVectorizationLegality::RK_FloatMinMax:
+      return Instruction::FCmp;
     default:
       llvm_unreachable("Unknown reduction operation");
   }
@@ -1672,8 +1679,21 @@ Value *createMinMaxOp(IRBuilder<> &Builder,
     break;
   case LoopVectorizationLegality::MRK_SIntMax:
     P = CmpInst::ICMP_SGT;
+    break;
+  case LoopVectorizationLegality::MRK_FloatMin:
+    P = CmpInst::FCMP_OLT;
+    break;
+  case LoopVectorizationLegality::MRK_FloatMax:
+    P = CmpInst::FCMP_OGT;
+    break;
   }
-  Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+
+  Value *Cmp;
+  if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax)
+    Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
+  else
+    Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+
   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
   return Select;
 }
@@ -1743,11 +1763,12 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // one for multiplication, -1 for And.
     Value *Identity;
     Value *VectorStart;
-    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax)
+    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
+        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
       // MinMax reduction have the start value as their identify.
       VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
                                                          "minmax.ident");
-    else {
+    } else {
       Constant *Iden =
         LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
                                                         VecTy->getScalarType());
@@ -1801,7 +1822,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *ReducedPartRdx = RdxParts[0];
     unsigned Op = getReductionBinOp(RdxDesc.Kind);
     for (unsigned part = 1; part < UF; ++part) {
-      if (Op != Instruction::ICmp)
+      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
         ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
                                              RdxParts[part], ReducedPartRdx,
                                              "bin.rdx");
@@ -1832,7 +1853,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                     ConstantVector::get(ShuffleMask),
                                     "rdx.shuf");
 
-      if (Op != Instruction::ICmp)
+      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
         TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
                                      "bin.rdx");
       else
@@ -2363,6 +2384,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
     return false;
   }
 
+  // Look for the attribute signaling the absence of NaNs.
+  Function &F = *Header->getParent();
+  if (F.hasFnAttribute("no-nans-fp-math"))
+    HasFunNoNaNAttr = F.getAttributes().getAttribute(
+      AttributeSet::FunctionIndex,
+      "no-nans-fp-math").getValueAsString() == "true";
+
   // For each block in the loop.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
@@ -2444,6 +2472,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
           continue;
         }
+        if (AddReductionVar(Phi, RK_FloatMinMax)) {
+          DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
 
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
@@ -2869,7 +2901,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // such that we don't stop when we see the phi has two uses (one by the select
   // and one by the icmp) and to make sure we only see exactly the two
   // instructions.
-  unsigned NumICmpSelectPatternInst = 0;
+  unsigned NumCmpSelectPatternInst = 0;
   ReductionInstDesc ReduxDesc(false, 0);
 
   // Avoid cycles in the chain.
@@ -2918,7 +2950,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 
       // We can't have multiple inside users except for a combination of
       // icmp/select both using the phi.
-      if (FoundInBlockUser && !NumICmpSelectPatternInst)
+      if (FoundInBlockUser && !NumCmpSelectPatternInst)
         return false;
       FoundInBlockUser = true;
 
@@ -2927,14 +2959,15 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       if (!ReduxDesc.IsReduction)
         return false;
 
-      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) ||
-                                       isa<SelectInst>(U)))
-          ++NumICmpSelectPatternInst;
+      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U)))
+          ++NumCmpSelectPatternInst;
+      if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U)))
+          ++NumCmpSelectPatternInst;
 
       // Reductions of instructions such as Div, and Sub is only
       // possible if the LHS is the reduction variable.
       if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
-          !isa<ICmpInst>(U) && U->getOperand(0) != Iter)
+          !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter)
         return false;
 
       Iter = ReduxDesc.PatternLastInst;
@@ -2942,7 +2975,8 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 
     // This means we have seen one but not the other instruction of the
     // pattern or more than just a select and cmp.
-    if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2)
+    if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
+        NumCmpSelectPatternInst != 2)
       return false;
 
     // We found a reduction var if we have reached the original
@@ -2968,16 +3002,17 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
 /// pattern corresponding to a min(X, Y) or max(X, Y).
 LoopVectorizationLegality::ReductionInstDesc
-LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) {
+LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
+                                                    ReductionInstDesc &Prev) {
 
-  assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) &&
+  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
          "Expect a select instruction");
-  ICmpInst *Cmp = 0;
+  Instruction *Cmp = 0;
   SelectInst *Select = 0;
 
   // We must handle the select(cmp()) as a single instruction. Advance to the
   // select.
-  if ((Cmp = dyn_cast<ICmpInst>(I))) {
+  if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
     if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
       return ReductionInstDesc(false, I);
     return ReductionInstDesc(Select, Prev.MinMaxKind);
@@ -2986,7 +3021,8 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns
   // Only handle single use cases for now.
   if (!(Select = dyn_cast<SelectInst>(I)))
     return ReductionInstDesc(false, I);
-  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
+  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
+      !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
     return ReductionInstDesc(false, I);
   if (!Cmp->hasOneUse())
     return ReductionInstDesc(false, I);
@@ -3003,6 +3039,14 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns
     return ReductionInstDesc(Select, MRK_SIntMax);
   else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
     return ReductionInstDesc(Select, MRK_SIntMin);
+  else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMin);
+  else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMax);
+  else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMin);
+  else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMax);
 
   return ReductionInstDesc(false, I);
 }
@@ -3017,7 +3061,8 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
   default:
     return ReductionInstDesc(false, I);
   case Instruction::PHI:
-      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
+      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
+                 Kind != RK_FloatMinMax))
         return ReductionInstDesc(false, I);
     return ReductionInstDesc(I, Prev.MinMaxKind);
   case Instruction::Sub:
@@ -3035,9 +3080,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
   case Instruction::FAdd:
     return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
-    if (Kind != RK_IntegerMinMax)
+    if (Kind != RK_IntegerMinMax &&
+        (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
       return ReductionInstDesc(false, I);
     return isMinMaxSelectCmpPattern(I, Prev);
   }
diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
index 36a8758e2cc4..502fd8b9383b 100644
--- a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -3,6 +3,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 @A = common global [1024 x i32] zeroinitializer, align 16
+@fA = common global [1024 x float] zeroinitializer, align 16
+@dA = common global [1024 x double] zeroinitializer, align 16
 
 ; Signed tests.
 
@@ -403,3 +405,481 @@ for.body:
 for.end:
   ret i32 %max.red.0
 }
+
+; Float tests.
+
+; Maximum.
+
+; Turn this into a max reduction in the presence of a no-nans-fp-math attribute.
+; CHECK: @max_red_float
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ogt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @max_red_float_ge
+; CHECK: fcmp oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float_ge(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp oge float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_max_red_float
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp olt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_max_red_float_le
+; CHECK: fcmp ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float_le(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ole float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @unordered_max_red
+; CHECK: fcmp ugt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ugt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @unordered_max_red_float_ge
+; CHECK: fcmp uge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float_ge(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp uge float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_unordered_max_red
+; CHECK: fcmp ult <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ult float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_unordered_max_red_float_le
+; CHECK: fcmp ule <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float_le(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ule float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; Minimum.
+
+; Turn this into a min reduction in the presence of a no-nans-fp-math attribute.
+; CHECK: @min_red_float
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp olt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @min_red_float_le
+; CHECK: fcmp ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float_le(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ole float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_min_red_float
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ogt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_min_red_float_ge
+; CHECK: fcmp oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float_ge(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp oge float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @unordered_min_red
+; CHECK: fcmp ult <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ult float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @unordered_min_red_float_le
+; CHECK: fcmp ule <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float_le(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ule float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_unordered_min_red
+; CHECK: fcmp ugt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ugt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_unordered_min_red_float_ge
+; CHECK: fcmp uge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float_ge(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp uge float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; Make sure we handle doubles, too.
+; CHECK: @min_red_double
+; CHECK: fcmp olt <2 x double>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x double>
+; CHECK: select <2 x i1>
+
+define double @min_red_double(double %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi double [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x double]* @dA, i64 0, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 4
+  %cmp3 = fcmp olt double %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, double %0, double %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret double %min.red.0
+}
+
+
+; Don't turn this into a max reduction. The no-nans-fp-math attribute is missing.
+; CHECK: @max_red_float_nans
+; CHECK-NOT: <2 x float>
+
+define float @max_red_float_nans(float %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ogt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+
+attributes #0 = { "no-nans-fp-math"="true" }