R600/SI: Implement isLegalAddressingMode

The default assumes that a 16-bit signed offset is used.
LDS instruction use a 16-bit unsigned offset, so it wasn't
being used in some cases where it was assumed a negative offset
could be used.

More should be done here, but first isLegalAddressingMode needs
to gain an addressing mode argument. For now, copy most of the rest
of the default implementation with the immediate offset change.

llvm-svn: 215732
This commit is contained in:
Matt Arsenault 2014-08-15 17:17:07 +00:00
parent 1e08577586
commit 5015a89aa5
3 changed files with 107 additions and 0 deletions

View File

@ -242,6 +242,49 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
// TargetLowering queries
//===----------------------------------------------------------------------===//
// FIXME: This really needs an address space argument. The immediate offset
// size is different for different sets of memory instruction sets.
// The single offset DS instructions have a 16-bit unsigned byte offset.
//
// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
// r + i with addr64. 32-bit has more addressing mode options. Depending on the
// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
//
// SMRD instructions have an 8-bit, dword offset.
//
bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
Type *Ty) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// Allow a 16-bit unsigned immediate field, since this is what DS instructions
// use.
if (!isUInt<16>(AM.BaseOffs))
return false;
// Only support r+r,
switch (AM.Scale) {
case 0: // "r+i" or just "i", depending on HasBaseReg.
break;
case 1:
if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
return false;
// Otherwise we have r+r or r+i.
break;
case 2:
if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
return false;
// Allow 2*r as r+r.
break;
default: // Don't allow n * r
return false;
}
return true;
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,

View File

@ -59,6 +59,10 @@ class SITargetLowering : public AMDGPUTargetLowering {
public:
SITargetLowering(TargetMachine &tm);
bool isLegalAddressingMode(const AddrMode &AM,
Type *Ty) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
bool *IsFast) const override;

View File

@ -0,0 +1,60 @@
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.r600.read.tidig.x() #0
declare void @llvm.AMDGPU.barrier.local() #1
; Function Attrs: nounwind
; SI-LABEL: @signed_ds_offset_addressing_loop
; SI: BB0_1:
; SI: V_ADD_I32_e32 [[VADDR:v[0-9]+]],
; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x0
; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x4
; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x80
; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x84
; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x100
; SI: S_ENDPGM
define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
entry:
%x.i = tail call i32 @llvm.r600.read.tidig.x() #0
%mul = shl nsw i32 %x.i, 1
br label %for.body
for.body: ; preds = %for.body, %entry
%sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
%offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
%k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
tail call void @llvm.AMDGPU.barrier.local() #1
%arrayidx = getelementptr inbounds float addrspace(3)* %lptr, i32 %offset.02
%tmp = load float addrspace(3)* %arrayidx, align 4
%add1 = add nsw i32 %offset.02, 1
%arrayidx2 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add1
%tmp1 = load float addrspace(3)* %arrayidx2, align 4
%add3 = add nsw i32 %offset.02, 32
%arrayidx4 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add3
%tmp2 = load float addrspace(3)* %arrayidx4, align 4
%add5 = add nsw i32 %offset.02, 33
%arrayidx6 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add5
%tmp3 = load float addrspace(3)* %arrayidx6, align 4
%add7 = add nsw i32 %offset.02, 64
%arrayidx8 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add7
%tmp4 = load float addrspace(3)* %arrayidx8, align 4
%add9 = fadd float %tmp, %tmp1
%add10 = fadd float %add9, %tmp2
%add11 = fadd float %add10, %tmp3
%add12 = fadd float %add11, %tmp4
%add13 = fadd float %sum.03, %add12
%inc = add nsw i32 %k.01, 1
%add14 = add nsw i32 %offset.02, 97
%exitcond = icmp eq i32 %inc, 8
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
%tmp5 = sext i32 %x.i to i64
%arrayidx15 = getelementptr inbounds float addrspace(1)* %out, i64 %tmp5
store float %add13, float addrspace(1)* %arrayidx15, align 4
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { noduplicate nounwind }
attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }