AMDGPU: Partially fix handling of packed amdgpu_ps arguments

Fixes annoying limitations when writing tests.
Also remove more leftover code for manually scalarizing arguments
and return values.

llvm-svn: 338618
Matt Arsenault 2018-08-01 19:57:34 +00:00
parent b3724b7169
commit 55ab9213d3
3 changed files with 95 additions and 69 deletions
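In short: the CC_SI and RetCC_SI_Shader tables below gain v2i16/v2f16 entries, so a packed 16-bit vector argument or return value of an amdgpu_ps entry point can be assigned to a single 32-bit register instead of being scalarized by hand in the lowering code. A minimal IR sketch of the kind of signature this enables (the function name is hypothetical; it mirrors the tests added at the end of this change):

; The <2 x half> argument is expected to arrive packed in one VGPR, and the
; <2 x half> return value to leave packed in one VGPR (hypothetical example).
define amdgpu_ps <2 x half> @packed_ps_arg_example(<2 x half> %arg0) {
  %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
  ret <2 x half> %add
}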

View File

@@ -19,7 +19,7 @@ class CCIfExtend<CCAction A>
// Calling convention for SI
def CC_SI : CallingConv<[
-  CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -33,7 +33,7 @@ def CC_SI : CallingConv<[
CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
-  CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -64,7 +64,7 @@ def RetCC_SI_Shader : CallingConv<[
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
-  CCIfType<[f32, f16] , CCAssignToReg<[
+  CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,

View File

@@ -1349,7 +1349,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
-    assert(!Arg->VT.isVector() && "vector type argument should have been split");
+    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
+           "vector type argument should have been split");
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
@@ -1951,29 +1952,6 @@ SDValue SITargetLowering::LowerFormalArguments(
llvm_unreachable("Unknown loc info!");
}
-    if (IsShader && Arg.VT.isVector()) {
-      // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-      SmallVector<SDValue, 4> Regs;
-      Regs.push_back(Val);
-      for (unsigned j = 1; j != NumElements; ++j) {
-        Reg = ArgLocs[ArgIdx++].getLocReg();
-        Reg = MF.addLiveIn(Reg, RC);
-        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-        Regs.push_back(Copy);
-      }
-      // Fill up the missing vector elements
-      NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      Regs.append(NumElements, DAG.getUNDEF(VT));
-      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
-      continue;
-    }
InVals.push_back(Val);
}
@@ -2037,48 +2015,19 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsShader = AMDGPU::isShader(CallConv);
-  Info->setIfReturnsVoid(Outs.size() == 0);
+  Info->setIfReturnsVoid(Outs.empty());
bool IsWaveEnd = Info->returnsVoid() && IsShader;
-  SmallVector<ISD::OutputArg, 48> Splits;
-  SmallVector<SDValue, 48> SplitVals;
-  // Split vectors into their elements.
-  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-    const ISD::OutputArg &Out = Outs[i];
-    if (IsShader && Out.VT.isVector()) {
-      MVT VT = Out.VT.getVectorElementType();
-      ISD::OutputArg NewOut = Out;
-      NewOut.Flags.setSplit();
-      NewOut.VT = VT;
-      // We want the original number of vector elements here, e.g.
-      // three or five, not four or eight.
-      unsigned NumElements = Out.ArgVT.getVectorNumElements();
-      for (unsigned j = 0; j != NumElements; ++j) {
-        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
-                                   DAG.getConstant(j, DL, MVT::i32));
-        SplitVals.push_back(Elem);
-        Splits.push_back(NewOut);
-        NewOut.PartOffset += NewOut.VT.getStoreSize();
-      }
-    } else {
-      SplitVals.push_back(OutVals[i]);
-      Splits.push_back(Out);
-    }
-  }
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 48> RVLocs;
+  SmallVector<ISD::OutputArg, 48> Splits;
// CCState - Info about the registers and stack slots.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze outgoing return values.
-  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 48> RetOps;
@@ -2103,14 +2052,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// Copy the result values into the output registers.
-  for (unsigned i = 0, realRVLocIdx = 0;
-       i != RVLocs.size();
-       ++i, ++realRVLocIdx) {
-    CCValAssign &VA = RVLocs[i];
+  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
+       ++I, ++RealRVLocIdx) {
+    CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// TODO: Partially return in registers if return values don't fit.
-    SDValue Arg = SplitVals[realRVLocIdx];
+    SDValue Arg = OutVals[RealRVLocIdx];
// Copied from other backends.
switch (VA.getLocInfo()) {

View File

@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; Make sure we don't crash or assert on spir_kernel calling convention.
@@ -88,8 +88,8 @@ define amdgpu_cs half @cs_mesa(half %arg0) {
; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config
; GCN-LABEL: .AMDGPU.config
; GCN: .long 45096
-; GCN-LABEL: {{^}}ps_mesa:
-define amdgpu_ps half @ps_mesa(half %arg0) {
+; GCN-LABEL: {{^}}ps_mesa_f16:
+define amdgpu_ps half @ps_mesa_f16(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -121,4 +121,83 @@ define amdgpu_hs half @hs_mesa(half %arg0) {
ret half %add
}
+; FIXME: Inconsistent ABI between targets
+; GCN-LABEL: {{^}}ps_mesa_v2f16:
+; VI: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: ; return
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], v0
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], v1
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
+; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
+; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
+; SI: ; return to shader part epilog
+define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
+  %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
+  ret <2 x half> %add
+}
+; GCN-LABEL: {{^}}ps_mesa_inreg_v2f16:
+; VI: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v1, s0, 1.0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: ; return to shader part epilog
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], s0
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], s1
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
+; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
+; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
+; SI: ; return to shader part epilog
+define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
+  %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
+  ret <2 x half> %add
+}
+; GCN-LABEL: {{^}}ps_mesa_v2i16:
+; VI: v_mov_b32_e32 v2, 1
+; VI: v_add_u16_e32 v1, 1, v0
+; VI: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_or_b32_e32 v0, v1, v0
+; SI: v_lshlrev_b32_e32 v1, 16, v1
+; SI: v_add_i32_e32 v0, vcc, 1, v0
+; SI: v_add_i32_e32 v1, vcc, 0x10000, v1
+; SI: v_and_b32
+; SI: v_or_b32
+define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
+  %add = add <2 x i16> %arg0, <i16 1, i16 1>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* undef
+  ret void
+}
+; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16:
+; VI: s_lshr_b32 s1, s0, 16
+; VI: s_add_i32 s1, s1, 1
+; VI: s_add_i32 s0, s0, 1
+; VI: s_and_b32 s0, s0, 0xffff
+; VI: s_lshl_b32 s1, s1, 16
+; VI: s_or_b32 s0, s0, s1
+; VI: v_mov_b32_e32 v0, s0
+; SI: s_lshl_b32 s1, s1, 16
+; SI: s_add_i32 s0, s0, 1
+; SI: s_add_i32 s1, s1, 0x10000
+; SI: s_and_b32 s0, s0, 0xffff
+; SI: s_or_b32 s0, s0, s1
+define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
+  %add = add <2 x i16> %arg0, <i16 1, i16 1>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* undef
+  ret void
+}
attributes #0 = { nounwind noinline }