R600/SI: Custom lower fround

This fixes it for SI. It also removes the pattern
used previously for Evergreen for f32. I'm not sure
if the the new R600 output is better or not, but it uses
1 fewer instructions if BFI is available.

llvm-svn: 226682
This commit is contained in:
Matt Arsenault 2015-01-21 18:18:25 +00:00
parent 94269db8ba
commit b00554886f
6 changed files with 240 additions and 50 deletions

View File

@ -127,9 +127,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FABS, MVT::f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Custom);
setOperationAction(ISD::FROUND, MVT::f64, Custom);
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
@ -610,6 +612,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
case ISD::FRINT: return LowerFRINT(Op, DAG);
case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
@ -1917,6 +1920,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
Hi,
DAG.getConstant(FractBits - 32, MVT::i32),
DAG.getConstant(ExpBits, MVT::i32));
SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
DAG.getConstant(1023, MVT::i32));
return Exp;
}
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@ -1932,16 +1949,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
// exponent.
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
SDValue Exp = extractF64Exponent(Hi, SL, DAG);
// Extract the exponent.
SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
Hi,
DAG.getConstant(FractBits - 32, MVT::i32),
DAG.getConstant(ExpBits, MVT::i32));
SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
DAG.getConstant(1023, MVT::i32));
const unsigned FractBits = 52;
// Extract the sign bit.
const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
@ -2004,6 +2014,99 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
}
// XXX - May require not supporting f32 denormals?
SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
}
SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
const SDValue Zero = DAG.getConstant(0, MVT::i32);
const SDValue One = DAG.getConstant(1, MVT::i32);
const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
SDValue Exp = extractF64Exponent(Hi, SL, DAG);
const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);
SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
Exp);
SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
DAG.getConstant(0, MVT::i64), Tmp0,
ISD::SETNE);
SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
D, DAG.getConstant(0, MVT::i64));
SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
ExpEqNegOne,
DAG.getConstantFP(1.0, MVT::f64),
DAG.getConstantFP(0.0, MVT::f64));
SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
return K;
}
SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
if (VT == MVT::f32)
return LowerFROUND32(Op, DAG);
if (VT == MVT::f64)
return LowerFROUND64(Op, DAG);
llvm_unreachable("unhandled type");
}
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);

View File

@ -49,6 +49,10 @@ private:
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;

View File

@ -590,8 +590,6 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
// SHA-256 Patterns
def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
def : FROUNDPat <CNDGE_eg, CNDGT_eg>;
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
let Word1{20} = 0; // VALID_PIXEL_MODE

View File

@ -1142,16 +1142,6 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie
(exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
>;
// FROUND pattern
class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat <
(AMDGPUround f32:$x),
(CNDGE $x,
(CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)),
(CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
)
>;
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@ -1195,8 +1185,6 @@ let Predicates = [isR600] in {
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
let Word1{21} = eop;

View File

@ -0,0 +1,74 @@
; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}round_f64:
; SI: s_endpgm
define void @round_f64(double addrspace(1)* %out, double %x) #0 {
%result = call double @llvm.round.f64(double %x) #1
store double %result, double addrspace(1)* %out
ret void
}
; This is a pretty large function, so just test a few of the
; instructions that are necessary.
; FUNC-LABEL: {{^}}v_round_f64:
; SI: buffer_load_dwordx2
; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
; SI: v_not_b32_e32
; SI: v_not_b32_e32
; SI: v_cmp_eq_i32
; SI: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
; SI: v_cmp_lt_i32_e64
; SI: v_cmp_gt_i32_e64
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep = getelementptr double addrspace(1)* %in, i32 %tid
%out.gep = getelementptr double addrspace(1)* %out, i32 %tid
%x = load double addrspace(1)* %gep
%result = call double @llvm.round.f64(double %x) #1
store double %result, double addrspace(1)* %out.gep
ret void
}
; FUNC-LABEL: {{^}}round_v2f64:
; SI: s_endpgm
define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
%result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
store <2 x double> %result, <2 x double> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}round_v4f64:
; SI: s_endpgm
define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
%result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
store <4 x double> %result, <4 x double> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}round_v8f64:
; SI: s_endpgm
define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
%result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
store <8 x double> %result, <8 x double> addrspace(1)* %out
ret void
}
declare i32 @llvm.r600.read.tidig.x() #1
declare double @llvm.round.f64(double) #1
declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

View File

@ -1,17 +1,27 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}f32:
; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
; R600-DAG: ADD {{.*}}, -0.5
; R600-DAG: CEIL {{.*}} [[ARG]]
; R600-DAG: FLOOR {{.*}} [[ARG]]
; R600-DAG: CNDGE
; R600-DAG: CNDGT
; R600: CNDGE {{[^,]+}}, [[ARG]]
define void @f32(float addrspace(1)* %out, float %in) {
entry:
%0 = call float @llvm.round.f32(float %in)
store float %0, float addrspace(1)* %out
; FUNC-LABEL: {{^}}round_f32:
; SI-DAG: s_load_dword [[SX:s[0-9]+]]
; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
; SI: v_cmp_ge_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SUB]]|, 0.5
; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
; SI: buffer_store_dword [[RESULT]]
; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
; R600-DAG: ADD {{.*}},
; R600-DAG: BFI_INT
; R600-DAG: SETGE
; R600-DAG: CNDE
; R600-DAG: ADD
define void @round_f32(float addrspace(1)* %out, float %x) #0 {
%result = call float @llvm.round.f32(float %x) #1
store float %result, float addrspace(1)* %out
ret void
}
@ -20,24 +30,37 @@ entry:
; a test for the scalar case, so the vector tests just check that the
; compiler doesn't crash.
; FUNC-LABEL: v2f32
; FUNC-LABEL: {{^}}round_v2f32:
; SI: s_endpgm
; R600: CF_END
define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
%0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in)
store <2 x float> %0, <2 x float> addrspace(1)* %out
define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
%result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
; FUNC-LABEL: v4f32
; FUNC-LABEL: {{^}}round_v4f32:
; SI: s_endpgm
; R600: CF_END
define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
%0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in)
store <4 x float> %0, <4 x float> addrspace(1)* %out
define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
%result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
declare float @llvm.round.f32(float)
declare <2 x float> @llvm.round.v2f32(<2 x float>)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
; FUNC-LABEL: {{^}}round_v8f32:
; SI: s_endpgm
; R600: CF_END
define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
%result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
store <8 x float> %result, <8 x float> addrspace(1)* %out
ret void
}
declare float @llvm.round.f32(float) #1
declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }