From 0c4b230b32ba04494deeabc4415f7248aa455068 Mon Sep 17 00:00:00 2001 From: Chandler Carruth <chandlerc@gmail.com> Date: Sun, 19 Oct 2014 19:13:49 +0000 Subject: [PATCH] [complex] Teach the complex math IR gen to emit direct math and a NaN-test prior to the call to the library function. This should automatically make fastmath (including just non-NaNs) able to avoid the expensive libcalls and also open the door to more advanced folding in LLVM based on the rules for complex math. Two important notes to remember: first is that this isn't yet a proper limited range mode, it's still just improving the unlimited range mode. Also, it isn't really perfect w.r.t. what an unlimited range mode should be doing because it isn't quite handling the flags produced by all the operations in the way desirable for that mode, but then neither is compiler-rt's libcall. When the compiler-rt libcall is improved to carefully manage flags, the code emitted here should be improved correspondingly. And it is still a long-term desirable thing to add a limited range mode to Clang that would be able to use direct math without library calls here. Special thanks to Steve Canon for the careful review on this patch and teaching me about these issues. 
Differential Revision: http://reviews.llvm.org/D5756 llvm-svn: 220167 --- clang/lib/CodeGen/CGExprComplex.cpp | 100 +++++++++++++++++++++++----- clang/test/CodeGen/complex-math.c | 48 +++++++++++-- 2 files changed, 127 insertions(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index b1cc1efa58e5..e957256ab5d7 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -15,9 +15,13 @@ #include "CodeGenModule.h" #include "clang/AST/ASTContext.h" #include "clang/AST/StmtVisitor.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include <algorithm> using namespace clang; using namespace CodeGen; @@ -587,11 +591,31 @@ ComplexPairTy ComplexExprEmitter::EmitComplexBinOpLibCall(StringRef LibCallName, return CGF.EmitCall(FuncInfo, Func, ReturnValueSlot(), Args).getComplexVal(); } +/// \brief Lookup the libcall name for a given floating point type complex +/// multiply. +static StringRef getComplexMultiplyLibCallName(llvm::Type *Ty) { + switch (Ty->getTypeID()) { + default: + llvm_unreachable("Unsupported floating point type!"); + case llvm::Type::HalfTyID: + return "__mulhc3"; + case llvm::Type::FloatTyID: + return "__mulsc3"; + case llvm::Type::DoubleTyID: + return "__muldc3"; + case llvm::Type::PPC_FP128TyID: + return "__multc3"; + case llvm::Type::X86_FP80TyID: + return "__mulxc3"; + } +} + // See C11 Annex G.5.1 for the semantics of multiplicative operators on complex // typed values. 
ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) { using llvm::Value; Value *ResR, *ResI; + llvm::MDBuilder MDHelper(CGF.getLLVMContext()); if (Op.LHS.first->getType()->isFloatingPointTy()) { // The general formulation is: @@ -603,23 +627,65 @@ ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) { // still more of this within the type system. if (Op.LHS.second && Op.RHS.second) { - // If both operands are complex, delegate to a libcall which works to - // prevent underflow and overflow. - StringRef LibCallName; - switch (Op.LHS.first->getType()->getTypeID()) { - default: - llvm_unreachable("Unsupported floating point type!"); - case llvm::Type::HalfTyID: - return EmitComplexBinOpLibCall("__mulhc3", Op); - case llvm::Type::FloatTyID: - return EmitComplexBinOpLibCall("__mulsc3", Op); - case llvm::Type::DoubleTyID: - return EmitComplexBinOpLibCall("__muldc3", Op); - case llvm::Type::PPC_FP128TyID: - return EmitComplexBinOpLibCall("__multc3", Op); - case llvm::Type::X86_FP80TyID: - return EmitComplexBinOpLibCall("__mulxc3", Op); - } + // If both operands are complex, emit the core math directly, and then + // test for NaNs. If we find NaNs in the result, we delegate to a libcall + // to carefully re-compute the correct infinity representation if + // possible. The expectation is that the presence of NaNs here is + // *extremely* rare, and so the cost of the libcall is almost irrelevant. + // This is good, because the libcall re-computes the core multiplication + // exactly the same as we do here and re-tests for NaNs in order to be + // a generic complex*complex libcall. + + // First compute the four products. 
+ Value *AC = Builder.CreateFMul(Op.LHS.first, Op.RHS.first, "mul_ac"); + Value *BD = Builder.CreateFMul(Op.LHS.second, Op.RHS.second, "mul_bd"); + Value *AD = Builder.CreateFMul(Op.LHS.first, Op.RHS.second, "mul_ad"); + Value *BC = Builder.CreateFMul(Op.LHS.second, Op.RHS.first, "mul_bc"); + + // The real part is the difference of the first two, the imaginary part is + // the sum of the second. + ResR = Builder.CreateFSub(AC, BD, "mul_r"); + ResI = Builder.CreateFAdd(AD, BC, "mul_i"); + + // Emit the test for the real part becoming NaN and create a branch to + // handle it. We test for NaN by comparing the number to itself. + Value *IsRNaN = Builder.CreateFCmpUNO(ResR, ResR, "isnan_cmp"); + llvm::BasicBlock *ContBB = CGF.createBasicBlock("complex_mul_cont"); + llvm::BasicBlock *INaNBB = CGF.createBasicBlock("complex_mul_imag_nan"); + llvm::Instruction *Branch = Builder.CreateCondBr(IsRNaN, INaNBB, ContBB); + llvm::BasicBlock *OrigBB = Branch->getParent(); + + // Give hint that we very much don't expect to see NaNs. + // Value chosen to match UR_NONTAKEN_WEIGHT, see BranchProbabilityInfo.cpp + llvm::MDNode *BrWeight = MDHelper.createBranchWeights(1, (1U << 20) - 1); + Branch->setMetadata(llvm::LLVMContext::MD_prof, BrWeight); + + // Now test the imaginary part and create its branch. + CGF.EmitBlock(INaNBB); + Value *IsINaN = Builder.CreateFCmpUNO(ResI, ResI, "isnan_cmp"); + llvm::BasicBlock *LibCallBB = CGF.createBasicBlock("complex_mul_libcall"); + Branch = Builder.CreateCondBr(IsINaN, LibCallBB, ContBB); + Branch->setMetadata(llvm::LLVMContext::MD_prof, BrWeight); + + // Now emit the libcall on this slowest of the slow paths. + CGF.EmitBlock(LibCallBB); + Value *LibCallR, *LibCallI; + std::tie(LibCallR, LibCallI) = EmitComplexBinOpLibCall( + getComplexMultiplyLibCallName(Op.LHS.first->getType()), Op); + Builder.CreateBr(ContBB); + + // Finally continue execution by phi-ing together the different + // computation paths. 
+ CGF.EmitBlock(ContBB); + llvm::PHINode *RealPHI = Builder.CreatePHI(ResR->getType(), 3, "real_mul_phi"); + RealPHI->addIncoming(ResR, OrigBB); + RealPHI->addIncoming(ResR, INaNBB); + RealPHI->addIncoming(LibCallR, LibCallBB); + llvm::PHINode *ImagPHI = Builder.CreatePHI(ResI->getType(), 3, "imag_mul_phi"); + ImagPHI->addIncoming(ResI, OrigBB); + ImagPHI->addIncoming(ResI, INaNBB); + ImagPHI->addIncoming(LibCallI, LibCallBB); + return ComplexPairTy(RealPHI, ImagPHI); } assert((Op.LHS.second || Op.RHS.second) && "At least one operand must be complex!"); diff --git a/clang/test/CodeGen/complex-math.c b/clang/test/CodeGen/complex-math.c index 2bdf02b71998..29172fa6d77f 100644 --- a/clang/test/CodeGen/complex-math.c +++ b/clang/test/CodeGen/complex-math.c @@ -89,7 +89,17 @@ float _Complex mul_float_rc(float a, float _Complex b) { } float _Complex mul_float_cc(float _Complex a, float _Complex b) { // X86-LABEL: @mul_float_cc( - // X86-NOT: fmul + // X86: %[[AC:[^ ]+]] = fmul + // X86: %[[BD:[^ ]+]] = fmul + // X86: %[[AD:[^ ]+]] = fmul + // X86: %[[BC:[^ ]+]] = fmul + // X86: %[[RR:[^ ]+]] = fsub float %[[AC]], %[[BD]] + // X86: %[[RI:[^ ]+]] = fadd float + // X86-DAG: %[[AD]] + // X86-DAG: , + // X86-DAG: %[[BC]] + // X86: fcmp uno float %[[RR]] + // X86: fcmp uno float %[[RI]] // X86: call {{.*}} @__mulsc3( // X86: ret return a * b; @@ -211,7 +221,17 @@ double _Complex mul_double_rc(double a, double _Complex b) { } double _Complex mul_double_cc(double _Complex a, double _Complex b) { // X86-LABEL: @mul_double_cc( - // X86-NOT: fmul + // X86: %[[AC:[^ ]+]] = fmul + // X86: %[[BD:[^ ]+]] = fmul + // X86: %[[AD:[^ ]+]] = fmul + // X86: %[[BC:[^ ]+]] = fmul + // X86: %[[RR:[^ ]+]] = fsub double %[[AC]], %[[BD]] + // X86: %[[RI:[^ ]+]] = fadd double + // X86-DAG: %[[AD]] + // X86-DAG: , + // X86-DAG: %[[BC]] + // X86: fcmp uno double %[[RR]] + // X86: fcmp uno double %[[RI]] // X86: call {{.*}} @__muldc3( // X86: ret return a * b; @@ -333,11 +353,31 @@ long double 
_Complex mul_long_double_rc(long double a, long double _Complex b) { } long double _Complex mul_long_double_cc(long double _Complex a, long double _Complex b) { // X86-LABEL: @mul_long_double_cc( - // X86-NOT: fmul + // X86: %[[AC:[^ ]+]] = fmul + // X86: %[[BD:[^ ]+]] = fmul + // X86: %[[AD:[^ ]+]] = fmul + // X86: %[[BC:[^ ]+]] = fmul + // X86: %[[RR:[^ ]+]] = fsub x86_fp80 %[[AC]], %[[BD]] + // X86: %[[RI:[^ ]+]] = fadd x86_fp80 + // X86-DAG: %[[AD]] + // X86-DAG: , + // X86-DAG: %[[BC]] + // X86: fcmp uno x86_fp80 %[[RR]] + // X86: fcmp uno x86_fp80 %[[RI]] // X86: call {{.*}} @__mulxc3( // X86: ret // PPC-LABEL: @mul_long_double_cc( - // PPC-NOT: fmul + // PPC: %[[AC:[^ ]+]] = fmul + // PPC: %[[BD:[^ ]+]] = fmul + // PPC: %[[AD:[^ ]+]] = fmul + // PPC: %[[BC:[^ ]+]] = fmul + // PPC: %[[RR:[^ ]+]] = fsub ppc_fp128 %[[AC]], %[[BD]] + // PPC: %[[RI:[^ ]+]] = fadd ppc_fp128 + // PPC-DAG: %[[AD]] + // PPC-DAG: , + // PPC-DAG: %[[BC]] + // PPC: fcmp uno ppc_fp128 %[[RR]] + // PPC: fcmp uno ppc_fp128 %[[RI]] // PPC: call {{.*}} @__multc3( // PPC: ret return a * b;