R600/SI: Try to use scalar BFE.

Use scalar BFE with constant shift and offset when possible.
This is complicated by the fact that the scalar version packs
the two operands of the vector version into one.

llvm-svn: 206558
Matt Arsenault 2014-04-18 05:19:26 +00:00
parent e56c30614f
commit 78b8670aac
3 changed files with 106 additions and 16 deletions
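The packing the commit message refers to can be illustrated with a minimal standalone sketch (not part of the patch; the helper name is made up). Per the comments in the diff below, the scalar S_BFE takes a single 32-bit source operand with the bit offset in bits [5:0] and the field width in bits [22:16]:

#include <cassert>
#include <cstdint>

// Pack a (bit offset, field width) pair into the single immediate operand
// used by the scalar S_BFE_{I32,U32}: offset in bits [5:0], width in bits [22:16].
// Hypothetical helper, for illustration only.
static uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | ((Width & 0x7f) << 16);
}

int main() {
  // A 1-bit field at offset 0 packs to 0x10000, and an 8-bit field at
  // offset 0 packs to 0x80000 (524288); these constants appear in the
  // updated test checks further down.
  assert(packBFEOperand(0, 1) == 0x10000u);
  assert(packBFEOperand(0, 8) == 0x80000u);
  return 0;
}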


@@ -203,13 +203,14 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
N->setNodeId(-1);
return NULL; // Already selected.
}
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
switch (Opc) {
default: break;
// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
case ISD::ADD: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (N->getValueType(0) != MVT::i64 ||
ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
@@ -255,7 +256,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
const AMDGPURegisterInfo *TRI =
static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
const SIRegisterInfo *SIRI =
@@ -342,7 +342,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
@@ -393,7 +392,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case AMDGPUISD::REGISTER_LOAD: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
@@ -410,7 +408,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Ops);
}
case AMDGPUISD::REGISTER_STORE: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
@@ -426,6 +423,47 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
CurDAG->getVTList(MVT::Other),
Ops);
}
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
// There is a scalar version available, but unlike the vector version which
// has a separate operand for the offset and width, the scalar version packs
// the width and offset into a single operand. Try to move to the scalar
// version if the offsets are constant, so that we can try to keep extended
// loads of kernel arguments in SGPRs.
// TODO: Technically we could try to pattern match scalar bitshifts of
// dynamic values, but it's probably not useful.
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!Offset)
break;
ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
if (!Width)
break;
bool Signed = Opc == AMDGPUISD::BFE_I32;
// Transformation function, pack the offset and width of a BFE into
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
// source, bits [5:0] contain the offset and bits [22:16] the width.
uint32_t OffsetVal = Offset->getZExtValue();
uint32_t WidthVal = Width->getZExtValue();
uint32_t PackedVal = OffsetVal | WidthVal << 16;
SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
SDLoc(N),
MVT::i32,
N->getOperand(0),
PackedOffsetWidth);
}
}
return SelectCode(N);
}


@@ -539,6 +539,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
@@ -1009,6 +1011,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
addDescImplicitUseDef(NewDesc, Inst);
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
// If we need to move this to VGPRs, we need to unpack the second operand
// back into the 2 separate ones for bit offset and width.
assert(OffsetWidthOp.isImm() &&
"Scalar BFE is only implemented for constant width and offset");
uint32_t Imm = OffsetWidthOp.getImm();
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
Inst->RemoveOperand(2); // Remove old immediate.
Inst->addOperand(MachineOperand::CreateImm(Offset));
Inst->addOperand(MachineOperand::CreateImm(BitWidth));
Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(0));
}
// Update the destination register class.
const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);

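To make the unpacking step in moveToVALU above concrete, here is a small sketch (again not from the patch; the struct and helper names are made up) of how the packed scalar BFE immediate is split back into the separate offset and width operands the VALU V_BFE form expects:

#include <cassert>
#include <cstdint>

// Result of splitting the packed scalar BFE immediate, mirroring the masks
// used in moveToVALU: bits [5:0] are the offset, bits [22:16] the width.
// Hypothetical types/helpers, for illustration only.
struct BFEOperands {
  uint32_t Offset;
  uint32_t Width;
};

static BFEOperands unpackBFEOperand(uint32_t Imm) {
  BFEOperands Ops;
  Ops.Offset = Imm & 0x3f;            // Extract bits [5:0].
  Ops.Width = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
  return Ops;
}

int main() {
  // Round-trip the 0x10000 immediate produced for a 1-bit sign extension.
  BFEOperands Ops = unpackBFEOperand(0x10000);
  assert(Ops.Offset == 0 && Ops.Width == 1);
  return 0;
}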

@@ -1,12 +1,13 @@
; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
; FUNC-LABEL: @sext_in_reg_i1_i32
; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
; SI: BUFFER_STORE_DWORD [[EXTRACT]],
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
@@ -148,8 +149,8 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
; XSI: BUFFER_STORE_DWORD
; XEG: BFE_INT
; XEG: ASHR
@@ -204,8 +205,8 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX2
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -221,10 +222,10 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
}
; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX4
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -320,6 +321,34 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
ret void
}
; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
%loada = load <4 x i32> addrspace(1)* %a, align 16
%loadb = load <4 x i32> addrspace(1)* %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
%ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
%loada = load <4 x i32> addrspace(1)* %a, align 16
%loadb = load <4 x i32> addrspace(1)* %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
%ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
ret void
}
; FIXME: The BFE should really be eliminated. I think it should happen
; when computeMaskedBitsForTargetNode is implemented for imax.