[AMDGPU] Fixed occupancy calculation for gfx10
Differential Revision: https://reviews.llvm.org/D65010
llvm-svn: 366616
This commit is contained in:
parent
34da8dfba0
commit
7b5a54e369
|
@ -591,25 +591,12 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
|
|||
}
|
||||
|
||||
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
|
||||
- if (VGPRs <= 24)
|
||||
- return 10;
|
||||
- if (VGPRs <= 28)
|
||||
- return 9;
|
||||
- if (VGPRs <= 32)
|
||||
- return 8;
|
||||
- if (VGPRs <= 36)
|
||||
- return 7;
|
||||
- if (VGPRs <= 40)
|
||||
- return 6;
|
||||
- if (VGPRs <= 48)
|
||||
- return 5;
|
||||
- if (VGPRs <= 64)
|
||||
- return 4;
|
||||
- if (VGPRs <= 84)
|
||||
- return 3;
|
||||
- if (VGPRs <= 128)
|
||||
- return 2;
|
||||
- return 1;
|
||||
+ unsigned MaxWaves = getMaxWavesPerEU();
|
||||
+ unsigned Granule = getVGPRAllocGranule();
|
||||
+ if (VGPRs < Granule)
|
||||
+ return MaxWaves;
|
||||
+ unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
|
||||
+ return std::min(getTotalNumVGPRs() / RoundedRegs, MaxWaves);
|
||||
}
|
||||
|
||||
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
|
||||
|
|
|
@ -884,7 +884,7 @@ public:
|
|||
/// \returns Maximum number of waves per execution unit supported by the
|
||||
/// subtarget without any kind of limitation.
|
||||
unsigned getMaxWavesPerEU() const {
|
||||
- return AMDGPU::IsaInfo::getMaxWavesPerEU();
|
||||
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(this);
|
||||
}
|
||||
|
||||
/// \returns Number of waves per work group supported by the subtarget and
|
||||
|
|
|
@ -241,7 +241,7 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
|
|||
}
|
||||
|
||||
unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
|
||||
- return getMaxWavesPerEU() * getEUsPerCU(STI);
|
||||
+ return getMaxWavesPerEU(STI) * getEUsPerCU(STI);
|
||||
}
|
||||
|
||||
unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
|
||||
|
@ -253,9 +253,11 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
- unsigned getMaxWavesPerEU() {
|
||||
+ unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
|
||||
// FIXME: Need to take scratch memory into account.
|
||||
- return 10;
|
||||
+ if (!isGFX10(*STI))
|
||||
+ return 10;
|
||||
+ return 20;
|
||||
}
|
||||
|
||||
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
|
||||
|
@ -317,7 +319,7 @@ unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
|
|||
if (Version.Major >= 10)
|
||||
return 0;
|
||||
|
||||
- if (WavesPerEU >= getMaxWavesPerEU())
|
||||
+ if (WavesPerEU >= getMaxWavesPerEU(STI))
|
||||
return 0;
|
||||
|
||||
unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1);
|
||||
|
@ -394,17 +396,19 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
|
|||
}
|
||||
|
||||
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
|
||||
- return 256;
|
||||
+ if (!isGFX10(*STI))
|
||||
+ return 256;
|
||||
+ return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
|
||||
}
|
||||
|
||||
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
|
||||
- return getTotalNumVGPRs(STI);
|
||||
+ return 256;
|
||||
}
|
||||
|
||||
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
|
||||
assert(WavesPerEU != 0);
|
||||
|
||||
- if (WavesPerEU >= getMaxWavesPerEU())
|
||||
+ if (WavesPerEU >= getMaxWavesPerEU(STI))
|
||||
return 0;
|
||||
unsigned MinNumVGPRs =
|
||||
alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
|
||||
|
|
|
@ -94,7 +94,7 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
|
|||
|
||||
/// \returns Maximum number of waves per execution unit for given subtarget \p
|
||||
/// STI without any kind of limitation.
|
||||
unsigned getMaxWavesPerEU();
|
||||
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI);
|
||||
|
||||
/// \returns Maximum number of waves per execution unit for given subtarget \p
|
||||
/// STI and limited by given \p FlatWorkGroupSize.
|
||||
|
|
|
@ -74,7 +74,10 @@ entry:
|
|||
|
||||
; CHECK: .name: num_spilled_vgprs
|
||||
; CHECK: .symbol: num_spilled_vgprs.kd
|
||||
- ; CHECK: .vgpr_spill_count: 14
|
||||
+ ; GFX700: .vgpr_spill_count: 14
|
||||
+ ; GFX803: .vgpr_spill_count: 14
|
||||
+ ; GFX900: .vgpr_spill_count: 14
|
||||
+ ; GFX1010: .vgpr_spill_count: 0
|
||||
define amdgpu_kernel void @num_spilled_vgprs() #1 {
|
||||
%val0 = load volatile float, float addrspace(1)* @var
|
||||
%val1 = load volatile float, float addrspace(1)* @var
|
||||
|
|
|
@ -2356,9 +2356,9 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
|
|||
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s6
|
||||
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v22, v11
|
||||
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s0
|
||||
- ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v20, 12, s8
|
||||
+ ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s8
|
||||
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v23, v23, v10
|
||||
- ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v21, 12, s1
|
||||
+ ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v27, 12, s1
|
||||
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s5
|
||||
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s9
|
||||
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s7
|
||||
|
@ -2368,8 +2368,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
|
|||
; GFX10-DL-NEXT: v_and_b32_e32 v12, v16, v2
|
||||
; GFX10-DL-NEXT: v_and_b32_e32 v13, v17, v2
|
||||
; GFX10-DL-NEXT: v_and_b32_e32 v15, v19, v2
|
||||
- ; GFX10-DL-NEXT: v_and_b32_e32 v10, v21, v2
|
||||
- ; GFX10-DL-NEXT: v_and_b32_e32 v14, v20, v2
|
||||
+ ; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2
|
||||
+ ; GFX10-DL-NEXT: v_and_b32_e32 v14, v31, v2
|
||||
; GFX10-DL-NEXT: v_and_b32_sdwa v6, v23, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
|
|
@ -21,8 +21,8 @@ main_body:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}sample_contig_nsa_10vgprs:
|
||||
- ; GCN-DAG: image_sample_c_l v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}],
|
||||
- ; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}],
|
||||
+ ; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9:]+}}],
|
||||
+ ; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9:]+}}],
|
||||
define amdgpu_ps <2 x float> @sample_contig_nsa_10vgprs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) #0 {
|
||||
main_body:
|
||||
%zcompare.1 = fadd float %zcompare, 1.0
|
||||
|
|
|
@ -217,6 +217,12 @@ registers:
|
|||
- { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
|
||||
- { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
|
||||
- { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
|
||||
+ - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
|
||||
+ - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
|
||||
+ - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
|
||||
+ - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
|
||||
+ - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
|
||||
+ - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
|
||||
body: |
|
||||
bb.0:
|
||||
%0 = IMPLICIT_DEF
|
||||
|
@ -228,6 +234,12 @@ body: |
|
|||
%7 = IMPLICIT_DEF
|
||||
%8 = IMPLICIT_DEF
|
||||
%9 = IMPLICIT_DEF
|
||||
+ %10 = IMPLICIT_DEF
|
||||
+ %11 = IMPLICIT_DEF
|
||||
+ %12 = IMPLICIT_DEF
|
||||
+ %13 = IMPLICIT_DEF
|
||||
+ %14 = IMPLICIT_DEF
|
||||
+ %15 = IMPLICIT_DEF
|
||||
%2 = V_AND_B32_e32 %1, %0, implicit $exec
|
||||
GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec
|
||||
GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec
|
||||
|
|
|
@ -920,7 +920,7 @@ main_body:
|
|||
|
||||
; GCN-LABEL: {{^}}test_vgprblocks_w64_attr:
|
||||
; Test that the wave size can be overridden in function attributes and that the block size is correct as a result
|
||||
- ; GFX10DEFWAVE: ; VGPRBlocks: 2
|
||||
+ ; GFX10DEFWAVE: ; VGPRBlocks: 11
|
||||
define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e,
|
||||
float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 {
|
||||
main_body:
|
||||
|
|
Loading…
Reference in New Issue