diff --git a/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 3c41638b7c9a..5132d453781b 100644
--- a/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -203,13 +203,14 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     N->setNodeId(-1);
     return NULL;   // Already selected.
   }
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
   switch (Opc) {
   default: break;
   // We are selecting i64 ADD here instead of custom lower it during
   // DAG legalization, so we can fold some i64 ADDs used for address
   // calculation into the LOAD and STORE instructions.
   case ISD::ADD: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (N->getValueType(0) != MVT::i64 ||
         ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
       break;
@@ -255,7 +256,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   case ISD::BUILD_VECTOR: {
     unsigned RegClassID;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     const AMDGPURegisterInfo *TRI =
                    static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
     const SIRegisterInfo *SIRI =
@@ -342,7 +342,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
   case ISD::BUILD_PAIR: {
     SDValue RC, SubReg0, SubReg1;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
       break;
     }
@@ -393,7 +392,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
 
   case AMDGPUISD::REGISTER_LOAD: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
@@ -410,7 +408,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                         Ops);
   }
   case AMDGPUISD::REGISTER_STORE: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
@@ -426,6 +423,47 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                         CurDAG->getVTList(MVT::Other),
                                         Ops);
   }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    // There is a scalar version available, but unlike the vector version which
+    // has a separate operand for the offset and width, the scalar version packs
+    // the width and offset into a single operand. Try to move to the scalar
+    // version if the offsets are constant, so that we can try to keep extended
+    // loads of kernel arguments in SGPRs.
+
+    // TODO: Technically we could try to pattern match scalar bitshifts of
+    // dynamic values, but it's probably not useful.
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+    // Transformation function, pack the offset and width of a BFE into
+    // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+    // source, bits [5:0] contain the offset and bits [22:16] the width.
+
+    uint32_t OffsetVal = Offset->getZExtValue();
+    uint32_t WidthVal = Width->getZExtValue();
+
+    uint32_t PackedVal = OffsetVal | WidthVal << 16;
+
+    SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+    return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+                                  SDLoc(N),
+                                  MVT::i32,
+                                  N->getOperand(0),
+                                  PackedOffsetWidth);
+
+  }
   }
   return SelectCode(N);
 }
diff --git a/llvm/lib/Target/R600/SIInstrInfo.cpp b/llvm/lib/Target/R600/SIInstrInfo.cpp
index 21f7a81700a6..96eeea562794 100644
--- a/llvm/lib/Target/R600/SIInstrInfo.cpp
+++ b/llvm/lib/Target/R600/SIInstrInfo.cpp
@@ -539,6 +539,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
@@ -1009,6 +1011,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
 
     addDescImplicitUseDef(NewDesc, Inst);
 
+    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+      // If we need to move this to VGPRs, we need to unpack the second operand
+      // back into the 2 separate ones for bit offset and width.
+      assert(OffsetWidthOp.isImm() &&
+             "Scalar BFE is only implemented for constant width and offset");
+      uint32_t Imm = OffsetWidthOp.getImm();
+
+      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+      Inst->RemoveOperand(2); // Remove old immediate.
+      Inst->addOperand(MachineOperand::CreateImm(Offset));
+      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+    }
+
     // Update the destination register class.
     const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
 
diff --git a/llvm/test/CodeGen/R600/sext-in-reg.ll b/llvm/test/CodeGen/R600/sext-in-reg.ll
index a47da2b25aca..b722959aad57 100644
--- a/llvm/test/CodeGen/R600/sext-in-reg.ll
+++ b/llvm/test/CodeGen/R600/sext-in-reg.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
 
 
 ; FUNC-LABEL: @sext_in_reg_i1_i32
 ; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
 ; SI: BUFFER_STORE_DWORD [[EXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
@@ -148,8 +149,8 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
 
 ; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
 ; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
 ; XSI: BUFFER_STORE_DWORD
 ; XEG: BFE_INT
 ; XEG: ASHR
@@ -204,8 +205,8 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
 
 
 ; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
 ; SI: BUFFER_STORE_DWORDX2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -221,10 +222,10 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 }
 
 ; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
 ; SI: BUFFER_STORE_DWORDX4
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -320,6 +321,34 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
   ret void
 }
 
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+  %loada = load <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+  %loada = load <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+  %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
 ; FIXME: The BFE should really be eliminated. I think it should happen
 ; when computeMaskedBitsForTargetNode is implemented for imax.
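Note, not part of the patch: the selection change above packs a BFE's constant bit offset and width into the single immediate source that S_BFE_I32 / S_BFE_U32 expect (offset in bits [5:0], width in bits [22:16]), and the moveToVALU change unpacks that immediate again when the instruction has to be rewritten as V_BFE_*. Below is a minimal standalone C++ sketch of that round trip, using the same shift and masks as the patch; the helper names are illustrative only. The packed values also explain the immediates checked in sext-in-reg.ll: offset 0 / width 1 packs to 0x10000, and offset 0 / width 8 to 524288 (0x80000).

// Standalone illustration only -- not part of the patch.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Pack offset/width the way the AMDGPUISD::BFE_I32 / BFE_U32 selection does:
// bits [5:0] hold the offset, bits [22:16] the width.
static uint32_t packOffsetWidth(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16);
}

// Unpack the way the moveToVALU change does before switching to V_BFE_*.
static void unpackOffsetWidth(uint32_t Imm, uint32_t &Offset, uint32_t &Width) {
  Offset = Imm & 0x3f;             // Extract bits [5:0].
  Width = (Imm & 0x7f0000) >> 16;  // Extract bits [22:16].
}

int main() {
  assert(packOffsetWidth(0, 1) == 0x10000); // sext_in_reg from i1
  assert(packOffsetWidth(0, 8) == 524288);  // sext_in_reg from i8

  uint32_t Offset, Width;
  unpackOffsetWidth(packOffsetWidth(0, 8), Offset, Width);
  assert(Offset == 0 && Width == 8);        // round trip is lossless
  std::printf("0x%x -> offset %u, width %u\n",
              packOffsetWidth(0, 8), Offset, Width);
  return 0;
}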