From 0c4b230b32ba04494deeabc4415f7248aa455068 Mon Sep 17 00:00:00 2001 From: Chandler Carruth <chandlerc@gmail.com> Date: Sun, 19 Oct 2014 19:13:49 +0000 Subject: [PATCH] [complex] Teach the complex math IR gen to emit direct math and a NaN-test prior to the call to the library function. This should automatically make fastmath (including just non-NaNs) able to avoid the expensive libcalls and also open the door to more advanced folding in LLVM based on the rules for complex math. Two important notes to remember: first is that this isn't yet a proper limited range mode, it's still just improving the unlimited range mode. Also, it isn't really perfect w.r.t. what an unlimited range mode should be doing because it isn't quite handling the flags produced by all the operations in the way desirable for that mode, but then neither is compiler-rt's libcall. When the compiler-rt libcall is improved to carefully manage flags, the code emitted here should be improved correspondingly. And it is still a long-term desirable thing to add a limited range mode to Clang that would be able to use direct math without library calls here. Special thanks to Steve Canon for the careful review on this patch and teaching me about these issues. 
Differential Revision: http://reviews.llvm.org/D5756 llvm-svn: 220167 --- clang/lib/CodeGen/CGExprComplex.cpp | 100 +++++++++++++++++++++++----- clang/test/CodeGen/complex-math.c | 48 +++++++++++-- 2 files changed, 127 insertions(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index b1cc1efa58e5..e957256ab5d7 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -15,9 +15,13 @@ #include "CodeGenModule.h" #include "clang/AST/ASTContext.h" #include "clang/AST/StmtVisitor.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include <algorithm> using namespace clang; using namespace CodeGen; @@ -587,11 +591,31 @@ ComplexPairTy ComplexExprEmitter::EmitComplexBinOpLibCall(StringRef LibCallName, return CGF.EmitCall(FuncInfo, Func, ReturnValueSlot(), Args).getComplexVal(); } +/// \brief Lookup the libcall name for a given floating point type complex +/// multiply. +static StringRef getComplexMultiplyLibCallName(llvm::Type *Ty) { + switch (Ty->getTypeID()) { + default: + llvm_unreachable("Unsupported floating point type!"); + case llvm::Type::HalfTyID: + return "__mulhc3"; + case llvm::Type::FloatTyID: + return "__mulsc3"; + case llvm::Type::DoubleTyID: + return "__muldc3"; + case llvm::Type::PPC_FP128TyID: + return "__multc3"; + case llvm::Type::X86_FP80TyID: + return "__mulxc3"; + } +} + // See C11 Annex G.5.1 for the semantics of multiplicative operators on complex // typed values. 
ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) { using llvm::Value; Value *ResR, *ResI; + llvm::MDBuilder MDHelper(CGF.getLLVMContext()); if (Op.LHS.first->getType()->isFloatingPointTy()) { // The general formulation is: @@ -603,23 +627,65 @@ ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) { // still more of this within the type system. if (Op.LHS.second && Op.RHS.second) { - // If both operands are complex, delegate to a libcall which works to - // prevent underflow and overflow. - StringRef LibCallName; - switch (Op.LHS.first->getType()->getTypeID()) { - default: - llvm_unreachable("Unsupported floating point type!"); - case llvm::Type::HalfTyID: - return EmitComplexBinOpLibCall("__mulhc3", Op); - case llvm::Type::FloatTyID: - return EmitComplexBinOpLibCall("__mulsc3", Op); - case llvm::Type::DoubleTyID: - return EmitComplexBinOpLibCall("__muldc3", Op); - case llvm::Type::PPC_FP128TyID: - return EmitComplexBinOpLibCall("__multc3", Op); - case llvm::Type::X86_FP80TyID: - return EmitComplexBinOpLibCall("__mulxc3", Op); - } + // If both operands are complex, emit the core math directly, and then + // test for NaNs. If we find NaNs in the result, we delegate to a libcall + // to carefully re-compute the correct infinity representation if + // possible. The expectation is that the presence of NaNs here is + // *extremely* rare, and so the cost of the libcall is almost irrelevant. + // This is good, because the libcall re-computes the core multiplication + // exactly the same as we do here and re-tests for NaNs in order to be + // a generic complex*complex libcall. + + // First compute the four products. 
+ Value *AC = Builder.CreateFMul(Op.LHS.first, Op.RHS.first, "mul_ac"); + Value *BD = Builder.CreateFMul(Op.LHS.second, Op.RHS.second, "mul_bd"); + Value *AD = Builder.CreateFMul(Op.LHS.first, Op.RHS.second, "mul_ad"); + Value *BC = Builder.CreateFMul(Op.LHS.second, Op.RHS.first, "mul_bc"); + + // The real part is the difference of the first two, the imaginary part is + // the sum of the second. + ResR = Builder.CreateFSub(AC, BD, "mul_r"); + ResI = Builder.CreateFAdd(AD, BC, "mul_i"); + + // Emit the test for the real part becoming NaN and create a branch to + // handle it. We test for NaN by comparing the number to itself. + Value *IsRNaN = Builder.CreateFCmpUNO(ResR, ResR, "isnan_cmp"); + llvm::BasicBlock *ContBB = CGF.createBasicBlock("complex_mul_cont"); + llvm::BasicBlock *INaNBB = CGF.createBasicBlock("complex_mul_imag_nan"); + llvm::Instruction *Branch = Builder.CreateCondBr(IsRNaN, INaNBB, ContBB); + llvm::BasicBlock *OrigBB = Branch->getParent(); + + // Give hint that we very much don't expect to see NaNs. + // Value chosen to match UR_NONTAKEN_WEIGHT, see BranchProbabilityInfo.cpp + llvm::MDNode *BrWeight = MDHelper.createBranchWeights(1, (1U << 20) - 1); + Branch->setMetadata(llvm::LLVMContext::MD_prof, BrWeight); + + // Now test the imaginary part and create its branch. + CGF.EmitBlock(INaNBB); + Value *IsINaN = Builder.CreateFCmpUNO(ResI, ResI, "isnan_cmp"); + llvm::BasicBlock *LibCallBB = CGF.createBasicBlock("complex_mul_libcall"); + Branch = Builder.CreateCondBr(IsINaN, LibCallBB, ContBB); + Branch->setMetadata(llvm::LLVMContext::MD_prof, BrWeight); + + // Now emit the libcall on this slowest of the slow paths. + CGF.EmitBlock(LibCallBB); + Value *LibCallR, *LibCallI; + std::tie(LibCallR, LibCallI) = EmitComplexBinOpLibCall( + getComplexMultiplyLibCallName(Op.LHS.first->getType()), Op); + Builder.CreateBr(ContBB); + + // Finally continue execution by phi-ing together the different + // computation paths. 
+ CGF.EmitBlock(ContBB); + llvm::PHINode *RealPHI = Builder.CreatePHI(ResR->getType(), 3, "real_mul_phi"); + RealPHI->addIncoming(ResR, OrigBB); + RealPHI->addIncoming(ResR, INaNBB); + RealPHI->addIncoming(LibCallR, LibCallBB); + llvm::PHINode *ImagPHI = Builder.CreatePHI(ResI->getType(), 3, "imag_mul_phi"); + ImagPHI->addIncoming(ResI, OrigBB); + ImagPHI->addIncoming(ResI, INaNBB); + ImagPHI->addIncoming(LibCallI, LibCallBB); + return ComplexPairTy(RealPHI, ImagPHI); } assert((Op.LHS.second || Op.RHS.second) && "At least one operand must be complex!"); diff --git a/clang/test/CodeGen/complex-math.c b/clang/test/CodeGen/complex-math.c index 2bdf02b71998..29172fa6d77f 100644 --- a/clang/test/CodeGen/complex-math.c +++ b/clang/test/CodeGen/complex-math.c @@ -89,7 +89,17 @@ float _Complex mul_float_rc(float a, float _Complex b) { } float _Complex mul_float_cc(float _Complex a, float _Complex b) { // X86-LABEL: @mul_float_cc( - // X86-NOT: fmul + // X86: %[[AC:[^ ]+]] = fmul + // X86: %[[BD:[^ ]+]] = fmul + // X86: %[[AD:[^ ]+]] = fmul + // X86: %[[BC:[^ ]+]] = fmul + // X86: %[[RR:[^ ]+]] = fsub float %[[AC]], %[[BD]] + // X86: %[[RI:[^ ]+]] = fadd float + // X86-DAG: %[[AD]] + // X86-DAG: , + // X86-DAG: %[[BC]] + // X86: fcmp uno float %[[RR]] + // X86: fcmp uno float %[[RI]] // X86: call {{.*}} @__mulsc3( // X86: ret return a * b; @@ -211,7 +221,17 @@ double _Complex mul_double_rc(double a, double _Complex b) { } double _Complex mul_double_cc(double _Complex a, double _Complex b) { // X86-LABEL: @mul_double_cc( - // X86-NOT: fmul + // X86: %[[AC:[^ ]+]] = fmul + // X86: %[[BD:[^ ]+]] = fmul + // X86: %[[AD:[^ ]+]] = fmul + // X86: %[[BC:[^ ]+]] = fmul + // X86: %[[RR:[^ ]+]] = fsub double %[[AC]], %[[BD]] + // X86: %[[RI:[^ ]+]] = fadd double + // X86-DAG: %[[AD]] + // X86-DAG: , + // X86-DAG: %[[BC]] + // X86: fcmp uno double %[[RR]] + // X86: fcmp uno double %[[RI]] // X86: call {{.*}} @__muldc3( // X86: ret return a * b; @@ -333,11 +353,31 @@ long double 
_Complex mul_long_double_rc(long double a, long double _Complex b) { } long double _Complex mul_long_double_cc(long double _Complex a, long double _Complex b) { // X86-LABEL: @mul_long_double_cc( - // X86-NOT: fmul + // X86: %[[AC:[^ ]+]] = fmul + // X86: %[[BD:[^ ]+]] = fmul + // X86: %[[AD:[^ ]+]] = fmul + // X86: %[[BC:[^ ]+]] = fmul + // X86: %[[RR:[^ ]+]] = fsub x86_fp80 %[[AC]], %[[BD]] + // X86: %[[RI:[^ ]+]] = fadd x86_fp80 + // X86-DAG: %[[AD]] + // X86-DAG: , + // X86-DAG: %[[BC]] + // X86: fcmp uno x86_fp80 %[[RR]] + // X86: fcmp uno x86_fp80 %[[RI]] // X86: call {{.*}} @__mulxc3( // X86: ret // PPC-LABEL: @mul_long_double_cc( - // PPC-NOT: fmul + // PPC: %[[AC:[^ ]+]] = fmul + // PPC: %[[BD:[^ ]+]] = fmul + // PPC: %[[AD:[^ ]+]] = fmul + // PPC: %[[BC:[^ ]+]] = fmul + // PPC: %[[RR:[^ ]+]] = fsub ppc_fp128 %[[AC]], %[[BD]] + // PPC: %[[RI:[^ ]+]] = fadd ppc_fp128 + // PPC-DAG: %[[AD]] + // PPC-DAG: , + // PPC-DAG: %[[BC]] + // PPC: fcmp uno ppc_fp128 %[[RR]] + // PPC: fcmp uno ppc_fp128 %[[RI]] // PPC: call {{.*}} @__multc3( // PPC: ret return a * b;