[AMDGPU] New tbuffer intrinsics

Summary:
This commit adds new intrinsics
  llvm.amdgcn.raw.tbuffer.load
  llvm.amdgcn.struct.tbuffer.load
  llvm.amdgcn.raw.tbuffer.store
  llvm.amdgcn.struct.tbuffer.store

with the following changes from the llvm.amdgcn.tbuffer.* intrinsics:

* there are separate raw and struct versions: raw does not have an index
  arg and sets idxen=0 in the instruction, and struct always sets
  idxen=1 in the instruction even if the index is 0, to allow for the
  fact that gfx9 does bounds checking differently depending on whether
  idxen is set;

* there is a combined format arg (dfmt+nfmt)

* there is a combined cachepolicy arg (glc+slc)

* there are now only two offset args: one for the offset that is
  included in bounds checking and swizzling, to be split between the
  instruction's voffset and immoffset fields, and one for the offset
  that is excluded from bounds checking and swizzling, to go into the
  instruction's soffset field.

The AMDISD::TBUFFER_* SD nodes always have an index operand, all three
offset operands, combined format operand, combined cachepolicy operand,
and an extra idxen operand.

The tbuffer pseudo- and real instructions now also have a combined
format operand.

The obsolescent llvm.amdgcn.tbuffer.* and llvm.SI.tbuffer.store
intrinsics continue to work.

V2: Separate raw and struct intrinsics.
V3: Moved extract_glc and extract_slc defs to a more sensible place.
V4: Rebased on D49995.
V5: Only two separate offset args instead of three.
V6: Pseudo- and real instructions have joint format operand.
V7: Restored optionality of dfmt and nfmt in assembler.
V8: Addressed minor review comments.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D49026

Change-Id: If22ad77e349fac3a5d2f72dda53c010377d470d4
llvm-svn: 340268
This commit is contained in:
Tim Renouf 2018-08-21 11:06:05 +00:00
parent d378a39603
commit 35484c9d50
18 changed files with 971 additions and 106 deletions

View File

@ -815,6 +815,7 @@ class AMDGPUBufferStore : Intrinsic <
def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
def int_amdgcn_buffer_store : AMDGPUBufferStore;
// Obsolescent tbuffer intrinsics.
def int_amdgcn_tbuffer_load : Intrinsic <
[llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
[llvm_v4i32_ty, // rsrc(SGPR)
@ -844,6 +845,54 @@ def int_amdgcn_tbuffer_store : Intrinsic <
[IntrWriteMem], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : Intrinsic <
[llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
[llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
[IntrReadMem], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_tbuffer_store : Intrinsic <
[],
[llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
[IntrWriteMem], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_tbuffer_load : Intrinsic <
[llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
[llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
[IntrReadMem], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_tbuffer_store : Intrinsic <
[],
[llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
[IntrWriteMem], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
class AMDGPUBufferAtomic : Intrinsic <
[llvm_i32_ty],
[llvm_i32_ty, // vdata(VGPR)

View File

@ -161,8 +161,7 @@ public:
ImmTyExpTgt,
ImmTyExpCompr,
ImmTyExpVM,
ImmTyDFMT,
ImmTyNFMT,
ImmTyFORMAT,
ImmTyHwreg,
ImmTyOff,
ImmTySendMsg,
@ -312,8 +311,7 @@ public:
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@ -666,8 +664,7 @@ public:
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyDFMT: OS << "DFMT"; break;
case ImmTyNFMT: OS << "NFMT"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
case ImmTyOModSI: OS << "OModSI"; break;
case ImmTyDppCtrl: OS << "DppCtrl"; break;
@ -1061,6 +1058,7 @@ public:
OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
OperandMatchResultTy parseDfmtNfmt(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
@ -3522,6 +3520,53 @@ AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
return MatchOperand_Success;
}
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
OperandMatchResultTy
AMDGPUAsmParser::parseDfmtNfmt(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
int64_t Dfmt = 0, Nfmt = 0;
// dfmt and nfmt can appear in either order, and each is optional.
bool GotDfmt = false, GotNfmt = false;
while (!GotDfmt || !GotNfmt) {
if (!GotDfmt) {
auto Res = parseIntWithPrefix("dfmt", Dfmt);
if (Res != MatchOperand_NoMatch) {
if (Res != MatchOperand_Success)
return Res;
if (Dfmt >= 16) {
Error(Parser.getTok().getLoc(), "out of range dfmt");
return MatchOperand_ParseFail;
}
GotDfmt = true;
Parser.Lex();
continue;
}
}
if (!GotNfmt) {
auto Res = parseIntWithPrefix("nfmt", Nfmt);
if (Res != MatchOperand_NoMatch) {
if (Res != MatchOperand_Success)
return Res;
if (Nfmt >= 8) {
Error(Parser.getTok().getLoc(), "out of range nfmt");
return MatchOperand_ParseFail;
}
GotNfmt = true;
Parser.Lex();
continue;
}
}
break;
}
if (!GotDfmt && !GotNfmt)
return MatchOperand_NoMatch;
auto Format = Dfmt | Nfmt << 4;
Operands.push_back(
AMDGPUOperand::CreateImm(this, Format, S, AMDGPUOperand::ImmTyFORMAT));
return MatchOperand_Success;
}
//===----------------------------------------------------------------------===//
// ds
//===----------------------------------------------------------------------===//
@ -4617,8 +4662,7 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOffset);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
@ -4761,8 +4805,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
{"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
{"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr},
{"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
@ -4844,6 +4887,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
Op.Type == AMDGPUOperand::ImmTyNegHi) {
res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
Op.ConvertResult);
} else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) {
res = parseDfmtNfmt(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
}

View File

@ -100,15 +100,11 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
bits<1> glc_value = 0; // the value for glc if no such operand
bits<4> dfmt_value = 1; // the value for dfmt if no such operand
bits<3> nfmt_value = 0; // the value for nfmt if no such operand
bits<1> has_srsrc = 1;
bits<1> has_soffset = 1;
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
bits<1> has_dfmt = 1;
bits<1> has_nfmt = 1;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
@ -126,14 +122,16 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
bits<4> dfmt;
bits<3> nfmt;
bits<7> format;
bits<8> vaddr;
bits<8> vdata;
bits<7> srsrc;
bits<1> slc;
bits<1> tfe;
bits<8> soffset;
bits<4> dfmt = format{3-0};
bits<3> nfmt = format{6-4};
}
class getMTBUFInsDA<list<RegisterClass> vdataList,
@ -142,16 +140,16 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe),
offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe)
offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
SLC:$slc, TFE:$tfe),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
SLC:$slc, TFE:$tfe)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
@ -169,15 +167,15 @@ class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
class getMTBUFAsmOps<int addrKind> {
string Pfx =
!if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset",
!if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $format, $soffset",
!if(!eq(addrKind, BUFAddrKind.OffEn),
"$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen",
"$vaddr, $srsrc, $format, $soffset offen",
!if(!eq(addrKind, BUFAddrKind.IdxEn),
"$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen",
"$vaddr, $srsrc, $format, $soffset idxen",
!if(!eq(addrKind, BUFAddrKind.BothEn),
"$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen",
"$vaddr, $srsrc, $format, $soffset idxen offen",
!if(!eq(addrKind, BUFAddrKind.Addr64),
"$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64",
"$vaddr, $srsrc, $format, $soffset addr64",
"")))));
string ret = Pfx # "$offset";
}
@ -217,14 +215,14 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
i1:$glc, i1:$slc, i1:$tfe)))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@ -263,13 +261,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i16:$offset, i8:$format, i1:$glc,
i1:$slc, i1:$tfe))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
i16:$offset, i8:$format, i1:$glc,
i1:$slc, i1:$tfe))]>,
MTBUFAddr64Table<1, NAME>;
@ -1030,6 +1028,14 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
// MUBUF Patterns
//===----------------------------------------------------------------------===//
def extract_glc : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
}]>;
def extract_slc : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
}]>;
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@ -1524,32 +1530,36 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
(as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
imm:$format, imm:$cachepolicy, imm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
(as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
(as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
imm:$format, imm:$cachepolicy, imm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
(as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
}
@ -1576,39 +1586,36 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $dfmt),
(as_i8imm $nfmt), (as_i1imm $glc),
(as_i1imm $slc), 0)
(as_i16imm $offset), (as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
imm:$format, imm:$cachepolicy, imm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $dfmt),
(as_i8imm $nfmt), (as_i1imm $glc),
(as_i1imm $slc), 0)
(as_i16imm $offset), (as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $dfmt),
(as_i8imm $nfmt), (as_i1imm $glc),
(as_i1imm $slc), 0)
(as_i16imm $offset), (as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
imm:$offset, imm:$format, imm:$cachepolicy, imm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
$rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
(extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
}
@ -1781,8 +1788,8 @@ class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{15} = ps.addr64;
let Inst{18-16} = op;
let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
let Inst{22-19} = dfmt;
let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@ -1811,6 +1818,7 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
//===----------------------------------------------------------------------===//
// CI
// MTBUF - GFX6, GFX7.
//===----------------------------------------------------------------------===//
class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> :
@ -2013,8 +2021,8 @@ class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{18-15} = op;
let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
let Inst{22-19} = dfmt;
let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@ -2043,8 +2051,8 @@ class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{18-15} = op;
let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
let Inst{22-19} = dfmt;
let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);

View File

@ -236,21 +236,12 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
O << " vm";
}
void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " dfmt:";
printU8ImmDecOperand(MI, OpNo, O);
}
}
void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
O << " nfmt:";
printU8ImmDecOperand(MI, OpNo, O);
void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (unsigned Val = MI->getOperand(OpNo).getImm()) {
O << " dfmt:" << (Val & 15);
O << ", nfmt:" << (Val >> 4);
}
}

View File

@ -90,10 +90,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printDFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printNFMT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,

View File

@ -5158,6 +5158,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned IdxEn = 1;
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@ -5165,10 +5172,57 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(4), // voffset
Op.getOperand(5), // soffset
Op.getOperand(6), // offset
Op.getOperand(7), // dfmt
Op.getOperand(8), // nfmt
Op.getOperand(9), // glc
Op.getOperand(10) // slc
DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT,
M->getMemOperand());
}
case Intrinsic::amdgcn_raw_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(4), // soffset
Offsets.second, // offset
Op.getOperand(5), // format
Op.getOperand(6), // cachepolicy
DAG.getConstant(0, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT,
M->getMemOperand());
}
case Intrinsic::amdgcn_struct_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy
DAG.getConstant(1, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@ -5407,6 +5461,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto Opcode = NumChannels->getZExtValue() == 3 ?
AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
SDValue Ops[] = {
Chain,
Op.getOperand(3), // vdata
@ -5415,10 +5473,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VOffset,
Op.getOperand(6), // soffset
Op.getOperand(7), // inst_offset
Op.getOperand(8), // dfmt
Op.getOperand(9), // nfmt
Op.getOperand(12), // glc
Op.getOperand(13), // slc
DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
};
assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
@ -5438,6 +5495,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
unsigned IdxEn = 1;
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Chain,
VData, // vdata
@ -5446,10 +5510,59 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(5), // voffset
Op.getOperand(6), // soffset
Op.getOperand(7), // offset
Op.getOperand(8), // dfmt
Op.getOperand(9), // nfmt
Op.getOperand(10), // glc
Op.getOperand(11) // slc
DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getConstant(IdxEn, DL, MVT::i1), // idexen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
MemSDNode *M = cast<MemSDNode>(Op);
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_struct_tbuffer_store: {
SDValue VData = Op.getOperand(2);
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
Chain,
VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
Op.getOperand(6), // soffset
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy
DAG.getConstant(1, DL, MVT::i1), // idexen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
MemSDNode *M = cast<MemSDNode>(Op);
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_raw_tbuffer_store: {
SDValue VData = Op.getOperand(2);
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Chain,
VData, // vdata
Op.getOperand(3), // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy
DAG.getConstant(0, DL, MVT::i1), // idexen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@ -5490,6 +5603,50 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
}
// The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
// (the offset that is included in bounds checking and swizzling, to be split
// between the instruction's voffset and immoffset fields) and soffset (the
// offset that is excluded from bounds checking and swizzling, to go in the
// instruction's soffset field). This function takes the first kind of offset
// and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
SDValue Offset, SelectionDAG &DAG) const {
SDLoc DL(Offset);
const unsigned MaxImm = 4095;
SDValue N0 = Offset;
ConstantSDNode *C1 = nullptr;
if (N0.getOpcode() == ISD::ADD) {
if ((C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))))
N0 = N0.getOperand(0);
} else if ((C1 = dyn_cast<ConstantSDNode>(N0)))
N0 = SDValue();
if (C1) {
unsigned ImmOffset = C1->getZExtValue();
// If the immediate value is too big for the immoffset field, put the value
// mod 4096 into the immoffset field so that the value that is copied/added
// for the voffset field is a multiple of 4096, and it stands more chance
// of being CSEd with the copy/add for another similar load/store.
unsigned Overflow = ImmOffset & ~MaxImm;
ImmOffset -= Overflow;
C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
if (Overflow) {
auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
if (!N0)
N0 = OverflowVal;
else {
SDValue Ops[] = { N0, OverflowVal };
N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
}
}
}
if (!N0)
N0 = DAG.getConstant(0, DL, MVT::i32);
if (!C1)
C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
return {N0, SDValue(C1, 0)};
}
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
ISD::LoadExtType ExtType, SDValue Op,
const SDLoc &SL, EVT VT) {

View File

@ -65,6 +65,15 @@ private:
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
// The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
// (the offset that is included in bounds checking and swizzling, to be split
// between the instruction's voffset and immoffset fields) and soffset (the
// offset that is excluded from bounds checking and swizzling, to go in the
// instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue> splitBufferOffsets(SDValue Offset,
SelectionDAG &DAG) const;
SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;

View File

@ -69,36 +69,34 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
def SDTbuffer_load : SDTypeProfile<1, 9,
def SDTtbuffer_load : SDTypeProfile<1, 8,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
SDTCisVT<2, i32>, // vindex(VGPR)
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // dfmt(imm)
SDTCisVT<7, i32>, // nfmt(imm)
SDTCisVT<8, i32>, // glc(imm)
SDTCisVT<9, i32> // slc(imm)
SDTCisVT<6, i32>, // format(imm)
SDTCisVT<7, i32>, // cachecontrol(imm)
SDTCisVT<8, i1> // idxen(imm)
]>;
def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTtbuffer_load,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
SDTbuffer_load,
SDTtbuffer_load,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
def SDTtbuffer_store : SDTypeProfile<0, 10,
def SDTtbuffer_store : SDTypeProfile<0, 9,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
SDTCisVT<2, i32>, // vindex(VGPR)
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // dfmt(imm)
SDTCisVT<7, i32>, // nfmt(imm)
SDTCisVT<8, i32>, // glc(imm)
SDTCisVT<9, i32> // slc(imm)
SDTCisVT<6, i32>, // format(imm)
SDTCisVT<7, i32>, // cachecontrol(imm)
SDTCisVT<8, i1> // idxen(imm)
]>;
def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
@ -752,8 +750,7 @@ def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>;
def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;

View File

@ -0,0 +1,42 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; GCN-LABEL: {{^}}tbuffer_load_d16_x:
; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
main_body:
%data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0)
ret half %data
}
; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
main_body:
%data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0)
%elt = extractelement <2 x half> %data, i32 1
ret half %elt
}
; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
main_body:
%data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0)
%elt = extractelement <4 x half> %data, i32 3
ret half %elt
}
declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32)
declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32)

View File

@ -0,0 +1,92 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}tbuffer_load:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0
; GCN: s_waitcnt
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 78, i32 0)
%vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 63, i32 1)
%vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 2)
%vdata_f32 = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
%vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
%vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
%r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3
ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3
}
; GCN-LABEL: {{^}}tbuffer_load_immoffs:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1
; GCN: s_waitcnt
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
%vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0)
%vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 73, i32 %soffs, i32 62, i32 0)
%vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 1, i32 %soffs, i32 77, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
%vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
%vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
ret {<4 x float>, <4 x float>, <4 x float>} %r2
}
; GCN-LABEL: {{^}}tbuffer_load_ofs:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen
define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52
define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
main_body:
%ofs = add i32 %voffs, 52
%vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %ofs, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}buffer_load_xy:
; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
%vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0)
%vdata.f = bitcast <2 x i32> %vdata to <2 x float>
ret <2 x float> %vdata.f
}
; GCN-LABEL: {{^}}buffer_load_x:
; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
%vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0)
%vdata.f = bitcast i32 %vdata to float
ret float %vdata.f
}
declare i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32)
declare <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32)

View File

@ -0,0 +1,57 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}tbuffer_store_d16_x:
; GCN: s_load_dwordx4
; GCN: s_load_dword s[[S_LO:[0-9]+]]
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
; GCN: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
main_body:
call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) {
main_body:
call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) {
main_body:
call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
}
declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32)

View File

@ -0,0 +1,75 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}tbuffer_store:
; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0
define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
main_body:
%in1 = bitcast <4 x float> %1 to <4 x i32>
%in2 = bitcast <4 x float> %2 to <4 x i32>
%in3 = bitcast <4 x float> %3 to <4 x i32>
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 44, i32 0)
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 61, i32 1)
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 2)
call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_immoffs:
; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
%in1 = bitcast <4 x float> %1 to <4 x i32>
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 0, i32 117, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 %soffset, i32 117, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_ofs:
; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %voffset, i32 0, i32 115, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_x1:
; GCN: tbuffer_store_format_x v0, off, s[0:3], dfmt:13, nfmt:7, 0
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) {
main_body:
%data.i = bitcast float %data to i32
call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 125, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_x2:
; GCN: tbuffer_store_format_xy v[0:1], off, s[0:3], dfmt:1, nfmt:2, 0
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) {
main_body:
%data.i = bitcast <2 x float> %data to <2 x i32>
call void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
}
declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i1, i1) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

View File

@ -0,0 +1,45 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; GCN-LABEL: {{^}}tbuffer_load_d16_x:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
main_body:
%data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0)
ret half %data
}
; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
main_body:
%data = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0)
%elt = extractelement <2 x half> %data, i32 1
ret half %elt
}
; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
main_body:
%data = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0)
%elt = extractelement <4 x half> %data, i32 3
ret half %elt
}
declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32)
declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32)

View File

@ -0,0 +1,114 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}tbuffer_load:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 idxen glc
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen slc
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen
; GCN: s_waitcnt
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0)
%vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 63, i32 1)
%vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 2)
%vdata_f32 = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
%vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
%vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
%r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3
ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3
}
; GCN-LABEL: {{^}}tbuffer_load_immoffs:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42
define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 idxen offset:4095
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} idxen offset:73
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} idxen offset:1
; GCN: s_waitcnt
define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
%vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0)
%vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0)
%vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 1, i32 %soffs, i32 77, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
%vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
%vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
ret {<4 x float>, <4 x float>, <4 x float>} %r2
}
; GCN-LABEL: {{^}}tbuffer_load_idx:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}tbuffer_load_ofs:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen offset:52
define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
main_body:
%ofs = add i32 %voffs, 52
%vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}tbuffer_load_both:
; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
main_body:
%vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0)
%vdata.f = bitcast <4 x i32> %vdata to <4 x float>
ret <4 x float> %vdata.f
}
; GCN-LABEL: {{^}}buffer_load_xy:
; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen
define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
%vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0)
%vdata.f = bitcast <2 x i32> %vdata to <2 x float>
ret <2 x float> %vdata.f
}
; GCN-LABEL: {{^}}buffer_load_x:
; GCN: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen
define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
%vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0)
%vdata.f = bitcast i32 %vdata to float
ret float %vdata.f
}
declare i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32)
declare <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32)

View File

@ -0,0 +1,57 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}tbuffer_store_d16_x:
; GCN: s_load_dwordx4
; GCN: s_load_dword{{[x0-9]*}} s{{\[}}[[S_LO:[0-9]+]]
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
ret void
}
declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32)

View File

@ -0,0 +1,114 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}tbuffer_store:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:12, nfmt:2, 0 idxen
; GCN: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], dfmt:13, nfmt:3, 0 idxen glc
; GCN: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen slc
; GCN: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen
define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
main_body:
%in1 = bitcast <4 x float> %1 to <4 x i32>
%in2 = bitcast <4 x float> %2 to <4 x i32>
%in3 = bitcast <4 x float> %3 to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 44, i32 0)
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 61, i32 1)
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 2)
call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_immoffs:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42
define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
%in1 = bitcast <4 x float> %1 to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 117, i32 0)
ret void
}
; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} idxen offset:42
define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 %soffset, i32 117, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_idx:
; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen
define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 47, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_ofs:
; GCN: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], dfmt:3, nfmt:7, 0 idxen offen
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 115, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_both:
; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen
define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 70, i32 0)
ret void
}
; Ideally, the register allocator would avoid the wait here
;
; GCN-LABEL: {{^}}buffer_store_wait:
; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen
; VERDE: s_waitcnt expcnt(0)
; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
; GCN: s_waitcnt vmcnt(0)
; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen
define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
%data.i = bitcast <4 x float> %data to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_x1:
; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
main_body:
%data.i = bitcast float %data to i32
call void @llvm.amdgcn.struct.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 125, i32 0)
ret void
}
; GCN-LABEL: {{^}}buffer_store_x2:
; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) {
main_body:
%data.i = bitcast <2 x float> %data to <2 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
ret void
}
declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

View File

@ -70,14 +70,14 @@ main_body:
; VERDE: s_waitcnt expcnt(0)
; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
; GCN: s_waitcnt vmcnt(0)
; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen
; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen
define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
%data.i = bitcast <4 x float> %data to <4 x i32>
call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0)
call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0)
ret void
}

View File

@ -38,3 +38,18 @@ tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1
// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71]
// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71]
// nfmt is optional:
tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, ttmp1
// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe8,0x00,0x01,0x1d,0x71]
// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71]
// dfmt is optional:
tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, ttmp1
// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x07,0xe9,0x00,0x01,0x1d,0x71]
// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x01,0x1d,0x71]
// nfmt and dfmt can be in either order:
tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, dfmt:15, ttmp1
// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71]
// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71]