From a91f77eaac346e52a5ccc38c1ba02319a2aa86ac Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Thu, 24 Jan 2008 08:07:48 +0000 Subject: [PATCH] Significantly simplify and improve handling of FP function results on x86-32. This case returns the value in ST(0) and then has to convert it to an SSE register. This causes significant codegen ugliness in some cases. For example in the trivial fp-stack-direct-ret.ll testcase we used to generate: _bar: subl $28, %esp call L_foo$stub fstpl 16(%esp) movsd 16(%esp), %xmm0 movsd %xmm0, 8(%esp) fldl 8(%esp) addl $28, %esp ret because we move the result of foo() into an XMM register, then have to move it back for the return of bar. Instead of hacking ever-more special cases into the call result lowering code we take a much simpler approach: on x86-32, fp return is modeled as always returning into an f80 register which is then truncated to f32 or f64 as needed. Similarly for a result, we model it as an extension to f80 + return. This exposes the truncate and extensions to the dag combiner, allowing target independent code to hack on them, eliminating them in this case. This gives us this code for the example above: _bar: subl $12, %esp call L_foo$stub addl $12, %esp ret The nasty aspect of this is that these conversions are not legal, but we want the second pass of dag combiner (post-legalize) to be able to hack on them. To handle this, we lie to legalize and say they are legal, then custom expand them on entry to the isel pass (PreprocessForFPConvert). This is gross, but less gross than the code it is replacing :) This also allows us to generate better code in several other cases. For example on fp-stack-ret-conv.ll, we now generate: _test: subl $12, %esp call L_foo$stub fstps 8(%esp) movl 16(%esp), %eax cvtss2sd 8(%esp), %xmm0 movsd %xmm0, (%eax) addl $12, %esp ret where before we produced (incidentally, the old bad code is identical to what gcc produces): _test: subl $12, %esp call L_foo$stub fstpl (%esp) cvtsd2ss (%esp), %xmm0 cvtss2sd %xmm0, %xmm0 movl 16(%esp), %eax movsd %xmm0, (%eax) addl $12, %esp ret Note that we generate slightly worse code on pr1505b.ll due to a scheduling deficiency that is unrelated to this patch. llvm-svn: 46307 --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 77 ++++++++- llvm/lib/Target/X86/X86ISelLowering.cpp | 170 ++++++------------- llvm/lib/Target/X86/X86InstrSSE.td | 8 + llvm/test/CodeGen/X86/fp-stack-direct-ret.ll | 11 ++ llvm/test/CodeGen/X86/fp-stack-ret-conv.ll | 17 ++ llvm/test/CodeGen/X86/pr1505b.ll | 2 +- 6 files changed, 157 insertions(+), 128 deletions(-) create mode 100644 llvm/test/CodeGen/X86/fp-stack-direct-ret.ll create mode 100644 llvm/test/CodeGen/X86/fp-stack-ret-conv.ll diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 9887bcccef41..53cd6121d954 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -156,7 +156,8 @@ namespace { bool TryFoldLoad(SDOperand P, SDOperand N, SDOperand &Base, SDOperand &Scale, SDOperand &Index, SDOperand &Disp); - void InstructionSelectPreprocess(SelectionDAG &DAG); + void PreprocessForRMW(SelectionDAG &DAG); + void PreprocessForFPConvert(SelectionDAG &DAG); /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
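A hypothetical companion testcase (in the same spirit as the fp-stack-direct-ret.ll test added at the end of this patch, but not itself part of it) shows what the new modeling exposes when the caller immediately truncates the call result: the FP_ROUND created for the f80 call result is flagged as value-preserving, and the caller's fptrunc is just another FP_ROUND, so both sit in the dag where target-independent combines can simplify them instead of being buried inside call-result lowering.

declare double @foo()

define float @bar2() {
entry:
  ; The call result is modeled as an f80 FP_GET_RESULT plus an FP_ROUND to
  ; f64; the fptrunc below adds a second FP_ROUND to f32.  Both rounds are
  ; ordinary dag nodes now, so the combiner is free to merge them.
  %tmp = tail call double @foo()
  %tmp1 = fptrunc double %tmp to float
  ret float %tmp1
}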
@@ -350,9 +351,10 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load, Store.getOperand(2), Store.getOperand(3)); } -/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction -/// selector to pick more load-modify-store instructions. This is a common -/// case: +/// PreprocessForRMW - Preprocess the DAG to make instruction selection better. +/// This is only run if not in -fast mode (aka -O0). +/// This allows the instruction selector to pick more read-modify-write +/// instructions. This is a common case: /// /// [Load chain] /// ^ @@ -389,7 +391,7 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load, /// \ / /// \ / /// [Store] -void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) { +void X86DAGToDAGISel::PreprocessForRMW(SelectionDAG &DAG) { for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) { if (!ISD::isNON_TRUNCStore(I)) @@ -459,6 +461,66 @@ void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) { } } + +/// PreprocessForFPConvert - Walk over the dag lowering fpround and fpextend +/// nodes that target the FP stack to be store and load to the stack. This is a +/// gross hack. We would like to simply mark these as being illegal, but when +/// we do that, legalize produces these when it expands calls, then expands +/// these in the same legalize pass. We would like dag combine to be able to +/// hack on these between the call expansion and the node legalization. As such +/// this pass basically does "really late" legalization of these inline with the +/// X86 isel pass. +void X86DAGToDAGISel::PreprocessForFPConvert(SelectionDAG &DAG) { + for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), + E = DAG.allnodes_end(); I != E; ) { + SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) + continue; + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. + MVT::ValueType SrcVT = N->getOperand(0).getValueType(); + MVT::ValueType DstVT = N->getValueType(0); + bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) + continue; + + // If this is an FPStack extension (but not a truncation), it is a noop. + if (!SrcIsSSE && !DstIsSSE && N->getOpcode() == ISD::FP_EXTEND) + continue; + + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + MVT::ValueType MemVT; + if (N->getOpcode() == ISD::FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? SrcVT : DstVT; + + SDOperand MemTmp = DAG.CreateStackTemporary(MemVT); + + // FIXME: optimize the case where the src/dest is a load or store? + SDOperand Store = DAG.getTruncStore(DAG.getEntryNode(), N->getOperand(0), + MemTmp, NULL, 0, MemVT); + SDOperand Result = DAG.getExtLoad(ISD::EXTLOAD, DstVT, Store, MemTmp, + NULL, 0, MemVT); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. 
+ --I; + DAG.ReplaceAllUsesOfValueWith(SDOperand(N, 0), Result); + + // Now that we did that, the node is dead. Increment the iterator to the + // next node to process, then delete N. + ++I; + DAG.DeleteNode(N); + } +} + /// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel /// when it has created a SelectionDAG for us to codegen. void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) { @@ -466,7 +528,10 @@ void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) { MachineFunction::iterator FirstMBB = BB; if (!FastISel) - InstructionSelectPreprocess(DAG); + PreprocessForRMW(DAG); + + // FIXME: This should only happen when not -fast. + PreprocessForFPConvert(DAG); // Codegen the basic block. #ifndef NDEBUG diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9140539e3828..9ca99fe6dd02 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -47,6 +47,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) X86ScalarSSEf32 = Subtarget->hasSSE1(); X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; + bool Fast = false; RegInfo = TM.getRegisterInfo(); @@ -355,13 +356,15 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - // Conversions to long double (in X87) go through memory. - setConvertAction(MVT::f32, MVT::f80, Expand); - setConvertAction(MVT::f64, MVT::f80, Expand); - - // Conversions from long double (in X87) go through memory. - setConvertAction(MVT::f80, MVT::f32, Expand); - setConvertAction(MVT::f80, MVT::f64, Expand); + // Floating truncations from f80 and extensions to f80 go through memory. + // If optimizing, we lie about this though and handle it in + // InstructionSelectPreprocess so that dagcombine2 can hack on these. + if (Fast) { + setConvertAction(MVT::f32, MVT::f80, Expand); + setConvertAction(MVT::f64, MVT::f80, Expand); + setConvertAction(MVT::f80, MVT::f32, Expand); + setConvertAction(MVT::f80, MVT::f64, Expand); + } } else if (X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. @@ -395,15 +398,17 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS - // SSE->x87 conversions go through memory. - setConvertAction(MVT::f32, MVT::f64, Expand); - setConvertAction(MVT::f32, MVT::f80, Expand); - - // x87->SSE truncations need to go through memory. - setConvertAction(MVT::f80, MVT::f32, Expand); - setConvertAction(MVT::f64, MVT::f32, Expand); - // And x87->x87 truncations also. - setConvertAction(MVT::f80, MVT::f64, Expand); + // SSE <-> X87 conversions go through memory. If optimizing, we lie about + // this though and handle it in InstructionSelectPreprocess so that + // dagcombine2 can hack on these. + if (Fast) { + setConvertAction(MVT::f32, MVT::f64, Expand); + setConvertAction(MVT::f32, MVT::f80, Expand); + setConvertAction(MVT::f80, MVT::f32, Expand); + setConvertAction(MVT::f64, MVT::f32, Expand); + // And x87->x87 truncations also. 
+ setConvertAction(MVT::f80, MVT::f64, Expand); + } if (!UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); @@ -420,10 +425,14 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - // Floating truncations need to go through memory. - setConvertAction(MVT::f80, MVT::f32, Expand); - setConvertAction(MVT::f64, MVT::f32, Expand); - setConvertAction(MVT::f80, MVT::f64, Expand); + // Floating truncations go through memory. If optimizing, we lie about + // this though and handle it in InstructionSelectPreprocess so that + // dagcombine2 can hack on these. + if (Fast) { + setConvertAction(MVT::f80, MVT::f32, Expand); + setConvertAction(MVT::f64, MVT::f32, Expand); + setConvertAction(MVT::f80, MVT::f64, Expand); + } if (!UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); @@ -647,7 +656,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) } setTruncStoreAction(MVT::f64, MVT::f32, Expand); - + // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::LOAD, MVT::v2i64, Legal); @@ -808,30 +817,10 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) { // a register. SDOperand Value = Op.getOperand(1); - // If this is an FP return with ScalarSSE, we need to move the value from - // an XMM register onto the fp-stack. - if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) { - SDOperand MemLoc; - - // If this is a load into a scalarsse value, don't store the loaded value - // back to the stack, only to reload it: just replace the scalar-sse load. - if (ISD::isNON_EXTLoad(Value.Val) && - Chain.reachesChainWithoutSideEffects(Value.getOperand(0))) { - Chain = Value.getOperand(0); - MemLoc = Value.getOperand(1); - } else { - // Spill the value to memory and reload it into top of stack. - unsigned Size = MVT::getSizeInBits(RVLocs[0].getValVT())/8; - MachineFunction &MF = DAG.getMachineFunction(); - int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); - MemLoc = DAG.getFrameIndex(SSFI, getPointerTy()); - Chain = DAG.getStore(Op.getOperand(0), Value, MemLoc, NULL, 0); - } - SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other); - SDOperand Ops[] = {Chain, MemLoc, DAG.getValueType(RVLocs[0].getValVT())}; - Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3); - Chain = Value.getValue(1); - } + // an XMM register onto the fp-stack. Do this with an FP_EXTEND to f80. + // This will get legalized into a load/store if it can't get optimized away. + if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) + Value = DAG.getNode(ISD::FP_EXTEND, MVT::f80, Value); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); SDOperand Ops[] = { Chain, Value }; @@ -876,87 +865,26 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, // Copies from the FP stack are special, as ST0 isn't a valid register // before the fp stackifier runs. - // Copy ST0 into an RFP register with FP_GET_RESULT. - SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other, MVT::Flag); + // Copy ST0 into an RFP register with FP_GET_RESULT. If this will end up + // in an SSE register, copy it out as F80 and do a truncate, otherwise use + // the specified value type. 
+ MVT::ValueType GetResultTy = RVLocs[0].getValVT(); + if (isScalarFPTypeInSSEReg(GetResultTy)) + GetResultTy = MVT::f80; + SDVTList Tys = DAG.getVTList(GetResultTy, MVT::Other, MVT::Flag); + SDOperand GROps[] = { Chain, InFlag }; SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2); Chain = RetVal.getValue(1); InFlag = RetVal.getValue(2); - - // If we are using ScalarSSE, store ST(0) to the stack and reload it into - // an XMM register. - if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) { - SDOperand StoreLoc; - const Value *SrcVal = 0; - int SrcValOffset = 0; - MVT::ValueType RetStoreVT = RVLocs[0].getValVT(); - - // Determine where to store the value. If the call result is directly - // used by a store, see if we can store directly into the location. In - // this case, we'll end up producing a fst + movss[load] + movss[store] to - // the same location, and the two movss's will be nuked as dead. This - // optimizes common things like "*D = atof(..)" to not need an - // intermediate stack slot. - if (SDOperand(TheCall, 0).hasOneUse() && - SDOperand(TheCall, 1).hasOneUse()) { - // In addition to direct uses, we also support a FP_ROUND that uses the - // value, if it is directly stored somewhere. - SDNode *User = *TheCall->use_begin(); - if (User->getOpcode() == ISD::FP_ROUND && User->hasOneUse()) - User = *User->use_begin(); - - // Ok, we have one use of the value and one use of the chain. See if - // they are the same node: a store. - if (StoreSDNode *N = dyn_cast(User)) { - // Verify that the value being stored is either the call or a - // truncation of the call. - SDNode *StoreVal = N->getValue().Val; - if (StoreVal == TheCall) - ; // ok. - else if (StoreVal->getOpcode() == ISD::FP_ROUND && - StoreVal->hasOneUse() && - StoreVal->getOperand(0).Val == TheCall) - ; // ok. - else - N = 0; // not ok. - - if (N && N->getChain().Val == TheCall && - !N->isVolatile() && !N->isTruncatingStore() && - N->getAddressingMode() == ISD::UNINDEXED) { - StoreLoc = N->getBasePtr(); - SrcVal = N->getSrcValue(); - SrcValOffset = N->getSrcValueOffset(); - RetStoreVT = N->getValue().getValueType(); - } - } - } - // If we weren't able to optimize the result, just create a temporary - // stack slot. - if (StoreLoc.Val == 0) { - MachineFunction &MF = DAG.getMachineFunction(); - int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); - StoreLoc = DAG.getFrameIndex(SSFI, getPointerTy()); - } - - // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This - // shouldn't be necessary except that RFP cannot be live across - // multiple blocks (which could happen if a select gets lowered into - // multiple blocks and scheduled in between them). When stackifier is - // fixed, they can be uncoupled. - SDOperand Ops[] = { - Chain, RetVal, StoreLoc, DAG.getValueType(RetStoreVT), InFlag - }; - Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5); - RetVal = DAG.getLoad(RetStoreVT, Chain, - StoreLoc, SrcVal, SrcValOffset); - Chain = RetVal.getValue(1); - - // If we optimized a truncate, then extend the result back to its desired - // type. - if (RVLocs[0].getValVT() != RetStoreVT) - RetVal = DAG.getNode(ISD::FP_EXTEND, RVLocs[0].getValVT(), RetVal); - } + // If we want the result in an SSE register, use an FP_TRUNCATE to get it + // there. + if (GetResultTy != RVLocs[0].getValVT()) + RetVal = DAG.getNode(ISD::FP_ROUND, RVLocs[0].getValVT(), RetVal, + // This truncation won't change the value. 
+ DAG.getIntPtrConstant(1)); + ResultVals.push_back(RetVal); } diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index a73d5afc2fcc..30d088c4ad59 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2734,6 +2734,14 @@ def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; +// extload f32 -> f64. This matches load+fextend because we have a hack in +// the isel (PreprocessForFPConvert) that can introduce loads after dag combine. +// Since these loads aren't folded into the fextend, we have to match it +// explicitly here. +let Predicates = [HasSSE2] in + def : Pat<(fextend (loadf32 addr:$src)), + (CVTSS2SDrm addr:$src)>; + // Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or // 16-bits matter. def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>, diff --git a/llvm/test/CodeGen/X86/fp-stack-direct-ret.ll b/llvm/test/CodeGen/X86/fp-stack-direct-ret.ll new file mode 100644 index 000000000000..78be2a39defb --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-stack-direct-ret.ll @@ -0,0 +1,11 @@ +; RUN: llvm-as < %s | llc -march=x86 | not grep fstp +; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | not grep movsd + +declare double @foo() + +define double @bar() { +entry: + %tmp5 = tail call double @foo() + ret double %tmp5 +} + diff --git a/llvm/test/CodeGen/X86/fp-stack-ret-conv.ll b/llvm/test/CodeGen/X86/fp-stack-ret-conv.ll new file mode 100644 index 000000000000..5254e1c89f61 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-stack-ret-conv.ll @@ -0,0 +1,17 @@ +; RUN: llvm-as < %s | llc -mcpu=yonah | grep cvtss2sd +; RUN: llvm-as < %s | llc -mcpu=yonah | grep fstps +; RUN: llvm-as < %s | llc -mcpu=yonah | not grep cvtsd2ss + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64" +target triple = "i686-apple-darwin8" + +define void @test(double *%b) { +entry: + %tmp13 = tail call double @foo() + %tmp1314 = fptrunc double %tmp13 to float ; [#uses=1] + %tmp3940 = fpext float %tmp1314 to double ; [#uses=1] + volatile store double %tmp3940, double* %b + ret void +} + +declare double @foo() diff --git a/llvm/test/CodeGen/X86/pr1505b.ll b/llvm/test/CodeGen/X86/pr1505b.ll index d5e3507583a5..73cb23e7ed46 100644 --- a/llvm/test/CodeGen/X86/pr1505b.ll +++ b/llvm/test/CodeGen/X86/pr1505b.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstpl | count 3 +; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstpl | count 4 ; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstps | count 3 ; PR1505
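A small companion example for the new X86InstrSSE.td pattern (hypothetical, not one of the tests added above): a source-level float load feeding an fpext is normally folded by dag combine into an extending load, which the existing cvtss2sd-from-memory handling already matches. The explicit (fextend (loadf32 addr:$src)) Pat covers the same case for loads that PreprocessForFPConvert introduces after the last dag combine run, such as the reload behind the fstps/cvtss2sd pair shown for fp-stack-ret-conv.ll above; either way the goal is a single cvtss2sd with a memory operand on SSE2 targets.

define double @load_extend(float* %p) {
entry:
  ; On an SSE2 target (e.g. -mcpu=yonah, as in the tests above) this load +
  ; fpext pair should become one cvtss2sd from memory rather than a separate
  ; movss load followed by a register-to-register conversion.
  %tmp = load float* %p
  %tmp1 = fpext float %tmp to double
  ret double %tmp1
}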