From 5dea6451389b97186760db15c3b1c3fc033d5ad1 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle
Date: Mon, 24 Apr 2017 17:17:36 +0000
Subject: [PATCH] AMDGPU: Move v_readlane lane select from VGPR to SGPR

Summary:
Fix a compiler bug when the lane select happens to end up in a VGPR.

Clarify the semantics of the corresponding intrinsic to be those of the
corresponding GLSL built-in: the lane select must be uniform across a
wavefront; otherwise, the results are undefined.

Reviewers: arsenm

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D32343

llvm-svn: 301197
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td        |  2 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp          | 13 +++++++++++++
 .../test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 17 +++++++++++++++++
 3 files changed, 32 insertions(+)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 5415c6b0d151..21d8a15e7e7a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -629,6 +629,8 @@ def int_amdgcn_readfirstlane :
   GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
+// The lane argument must be uniform across the currently active threads of the
+// current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
   GCCBuiltin<"__builtin_amdgcn_readlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d51110bcbd60..c5af8a1ad925 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2640,6 +2640,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
     return;
 
+  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
+  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
+  // select is uniform.
+  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
+      RI.isVGPR(MRI, Src1.getReg())) {
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    const DebugLoc &DL = MI.getDebugLoc();
+    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+        .add(Src1);
+    Src1.ChangeToRegister(Reg, false);
+    return;
+  }
+
   // We do not use commuteInstruction here because it is too aggressive and will
   // commute if it is possible. We only want to commute here if it improves
   // legality. This can be called a fairly large number of times so don't waste
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 5e892fad3741..cbd8f0a9c23a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -19,6 +19,20 @@ define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %s
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readlane_vregs:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+  %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
+  %value = extractelement <2 x i32> %args, i32 0
+  %lane = extractelement <2 x i32> %args, i32 1
+  %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
+  store i32 %readlane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; TODO: m0 should be folded.
 ; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
 ; CHECK: s_mov_b32 m0, -1
@@ -40,5 +54,8 @@ define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0)
   ret void
 }
 
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
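
--
Usage sketch (not part of the patch; a minimal example under the semantics
clarified above): a lane select that is already uniform needs no
v_readfirstlane fixup. The kernel below is hypothetical; it assumes the
intrinsic signature from the test file and that scalar amdgpu_kernel
arguments such as %lane are passed in SGPRs, so V_READLANE_B32 is legal as
emitted.

declare i32 @llvm.amdgcn.readlane(i32, i32) #0

define amdgpu_kernel void @readlane_uniform_arg(i32 addrspace(1)* %out, i32 %src0, i32 %lane) {
  ; %lane is a kernel argument, hence uniform across the wavefront and
  ; already assigned to an SGPR; no v_readfirstlane_b32 is inserted.
  %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %lane)
  store i32 %readlane, i32 addrspace(1)* %out, align 4
  ret void
}

attributes #0 = { nounwind readnone convergent }

Note that the v_readfirstlane fixup in the patch is only correct because of
the uniformity requirement: when the lane select is uniform, reading it from
the first active lane yields the same value every active lane would supply.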