[AMDGPU] gfx1010: use fmac instructions

Differential Revision: https://reviews.llvm.org/D61527

llvm-svn: 359959
This commit is contained in:
Stanislav Mekhanoshin 2019-05-04 04:20:37 +00:00
parent 37be3363b5
commit 28a1936f6d
11 changed files with 1004 additions and 229 deletions

View File

@ -521,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);
if (!Subtarget->hasFP16Denormals())
if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@ -8723,8 +8723,10 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
// Only do this if we are not trying to support denormals. v_mad_f32 does not
// support denormals ever.
if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
(VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
(VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
getSubtarget()->hasMadF16())) &&
isOperationLegal(ISD::FMAD, VT))
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;

View File

@ -2071,7 +2071,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@ -2086,7 +2088,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (isInlineConstant(UseMI, *Src0, *ImmOp))
return false;
bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@ -2099,6 +2104,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
// We need to swap operands 0 and 1 since the madmk/fmamk constant is at
// operand 1.
const int64_t Imm = ImmOp->getImm();
@ -2119,14 +2130,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(Src1->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_MAC_F16_e64)
Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Src1->ChangeToImmediate(Imm);
removeModOperands(UseMI);
UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
UseMI.setDesc(get(NewOpc));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@ -2176,6 +2189,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// VGPR is okay as Src1 - fallthrough
}
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
const int64_t Imm = ImmOp->getImm();
// FIXME: This would be a lot easier if we could return a new instruction
@ -2188,7 +2207,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
if (Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_MAC_F16_e64)
Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@ -2197,7 +2218,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// These come before src2.
removeModOperands(UseMI);
UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
UseMI.setDesc(get(NewOpc));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@ -2310,18 +2331,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
LiveVariables *LV) const {
unsigned Opc = MI.getOpcode();
bool IsF16 = false;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
case AMDGPU::V_FMAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32:
@ -2350,32 +2374,38 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 ||
!Src0->isReg() ||
!RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
return BuildMI(*MBB, MI, MI.getDebugLoc(),
get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
.add(*Dst)
.add(*Src0)
.add(*Src1)
.addImm(Imm);
unsigned NewOpc =
IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
if (pseudoToMCOpcode(NewOpc) != -1)
return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src0)
.add(*Src1)
.addImm(Imm);
}
unsigned NewOpc =
IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (auto Imm = getFoldableImm(Src1)) {
return BuildMI(*MBB, MI, MI.getDebugLoc(),
get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
.add(*Dst)
.add(*Src0)
.addImm(Imm)
.add(*Src2);
if (pseudoToMCOpcode(NewOpc) != -1)
return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src0)
.addImm(Imm)
.add(*Src2);
}
if (auto Imm = getFoldableImm(Src0)) {
if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
if (pseudoToMCOpcode(NewOpc) != -1 &&
isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
AMDGPU::OpName::src0), Src1))
return BuildMI(*MBB, MI, MI.getDebugLoc(),
get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src1)
.addImm(Imm)
@ -2383,9 +2413,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
assert((!IsFMA || !IsF16) && "fmac only expected with f32");
unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
: (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
@ -2678,6 +2710,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
@ -3410,13 +3443,16 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock *MBB = MI.getParent();
MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
unsigned Size = TRI->getRegSizeInBits(*RC);
unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
Opcode = AMDGPU::COPY;
else if (RI.isSGPRClass(RC))
Opcode = AMDGPU::S_MOV_B32;
Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
@ -5332,6 +5368,12 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
}
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
return (16ULL << 44) | // IMG_FORMAT_32_FLOAT
(1ULL << 56) | // RESOURCE_LEVEL = 1
(3ULL << 60); // OOB_SELECT = 3
}
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
// Set ATC = 1. GFX9 doesn't have this bit.
@ -5358,12 +5400,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
// IndexStride = 64.
Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
// IndexStride = 64 (encoded as 3) up to GFX9, 32 (encoded as 2) after GFX9.
uint64_t IndexStride = ST.getGeneration() <= AMDGPUSubtarget::GFX9 ? 3 : 2;
Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
ST.getGeneration() <= AMDGPUSubtarget::GFX9)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;

View File

@ -1462,7 +1462,7 @@ def : GCNPat<
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
(V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
(V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
}
@ -1523,6 +1523,14 @@ def : GCNPat <
>;
} // End OtherPredicates = [HasDLInsts]
let SubtargetPredicate = isGFX10Plus in
def : GCNPat <
(fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
(f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(f16 (VOP3NoMods f32:$src2))),
(V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
SRCMODS.NONE, $src2, $clamp, $omod)
>;
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<

View File

@ -418,7 +418,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
}
assert(Src && Src->isReg());
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
!isSameReg(*Src, *getReplacedOperand())) {
// In case of v_mac_f16/32_sdwa or v_fmac_f16/32_sdwa this pass can try to
// apply src operand to
@ -460,7 +462,9 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
getDstSel() != AMDGPU::SDWA::DWORD) {
// v_mac_f16/32_sdwa and v_fmac_f16/32_sdwa allow dst_sel to be equal only to
// DWORD
@ -964,10 +968,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return false;
}
if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
Opc == AMDGPU::V_FMAC_F32_e32 ||
Opc == AMDGPU::V_MAC_F16_e32 ||
Opc == AMDGPU::V_MAC_F32_e32))
return false;
// Check if target supports this SDWA opcode
if (TII->pseudoToMCOpcode(Opc) == -1)
return false;
// FIXME: has SDWA but require handling of implicit VCC use
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
return false;
@ -1038,7 +1048,9 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Src1);
}
if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
// v_mac_f16/32 and v_fmac_f16/32 have an additional src2 operand tied to vdst
MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);

View File

@ -1,7 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
; Make sure fdiv is promoted to f32.
@ -21,17 +22,17 @@
; SI: v_div_fixup_f32
; SI: v_cvt_f16_f32
; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
; GFX8_9-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
; GFX8_9: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
; GFX8_9: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; GFX8_9: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
; GFX8_9_10: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
@ -50,11 +51,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -68,11 +69,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16_abs:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
; GFX8_9-NOT: [RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -87,11 +88,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16_arcp:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9-NOT: [[RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -105,11 +106,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16_neg:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
; GFX8_9-NOT: [RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -123,11 +124,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rsq_f16:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9-NOT: [RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -142,12 +143,12 @@ entry:
}
; GCN-LABEL: {{^}}v_rsq_f16_neg:
; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9-NOT: [[VAL]]
; GFX8_9: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
; GFX8_9-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
; GFX8_9-NOT: [RESULT]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -162,13 +163,13 @@ entry:
}
; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -184,13 +185,13 @@ entry:
}
; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -208,8 +209,8 @@ entry:
; GCN-LABEL: {{^}}div_arcp_2_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
%rcp = fdiv arcp half %x, 2.0
@ -220,8 +221,8 @@ define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}div_arcp_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
%rcp = fdiv arcp half %x, 10.0
@ -232,8 +233,8 @@ define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
; GCN-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
%rcp = fdiv arcp half %x, -10.0

View File

@ -0,0 +1,76 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
; GCN-LABEL: {{^}}addMul2D:
; GFX1010: v_fmac_f16
; GFX1010: v_fmac_f16
; Doubly nested accumulation loop used to exercise f16 fused multiply-add
; selection. Unnamed arguments:
;   %0 - pointer to <4 x i8> input elements
;   %1 - pointer to float values in addrspace(4)
;   %2 - <2 x i32> loop bounds (element 0: inner loop, element 1: outer loop)
;   %3 - i32 multiplied by the outer index when addressing %0
; Each inner iteration converts one <4 x i8> element to <4 x half>, splats one
; float truncated to half, and folds them into the accumulator with
; llvm.fmuladd.v4f16. Returns the accumulator, or zeroinitializer when the
; outer bound is not positive.
define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
; Entry: %5 is the outer bound; bypass both loops unless it is positive.
%5 = extractelement <2 x i32> %2, i64 1
%6 = icmp sgt i32 %5, 0
br i1 %6, label %7, label %38
; Outer-loop preheader: %8 is the inner bound, %9 guards the inner loop.
7: ; preds = %4
%8 = extractelement <2 x i32> %2, i64 0
%9 = icmp sgt i32 %8, 0
br label %10
; Outer-loop header: %11 is the running accumulator, %12 the outer index.
10: ; preds = %34, %7
%11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
%12 = phi i32 [ 0, %7 ], [ %36, %34 ]
br i1 %9, label %13, label %34
; Inner-loop preheader: per-row base offsets for the %0 and %1 accesses.
13: ; preds = %10
%14 = mul nsw i32 %12, %3
%15 = mul nsw i32 %12, %8
br label %16
; Inner-loop body: %17 is the accumulator, %18 the inner index.
16: ; preds = %16, %13
%17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
%18 = phi i32 [ 0, %13 ], [ %32, %16 ]
; Load the <4 x i8> element at index (inner + outer * %3) and widen it to half.
%19 = add nsw i32 %18, %14
%20 = sext i32 %19 to i64
%21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
%22 = load <4 x i8>, <4 x i8>* %21, align 4
%23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8
; Load the float at index (inner + outer * inner bound), truncate it to half
; and splat it across all four lanes.
%24 = add nsw i32 %18, %15
%25 = sext i32 %24 to i64
%26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
%27 = load float, float addrspace(4)* %26, align 4
%28 = fptrunc float %27 to half
%29 = insertelement <4 x half> undef, half %28, i32 0
%30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
; accumulator = converted element * splat + accumulator
%31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
%32 = add nuw nsw i32 %18, 1
%33 = icmp eq i32 %32, %8
br i1 %33, label %34, label %16
; Outer-loop latch: take the inner-loop result (or the unchanged accumulator
; when the inner loop was skipped) and advance the outer index.
34: ; preds = %16, %10
%35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
%36 = add nuw nsw i32 %12, 1
%37 = icmp eq i32 %36, %5
br i1 %37, label %38, label %10
; Exit: zero when the loops were bypassed, otherwise the final accumulator.
38: ; preds = %34, %4
%39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
ret <4 x half> %39
}
; Converts each of the four i8 lanes of the argument to half with uitofp
; (unsigned interpretation) and returns them as a <4 x half> in the same lane
; order.
define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
; Lane 0
%2 = extractelement <4 x i8> %0, i64 0
%3 = uitofp i8 %2 to half
%4 = insertelement <4 x half> undef, half %3, i32 0
; Lane 1
%5 = extractelement <4 x i8> %0, i64 1
%6 = uitofp i8 %5 to half
%7 = insertelement <4 x half> %4, half %6, i32 1
; Lane 2
%8 = extractelement <4 x i8> %0, i64 2
%9 = uitofp i8 %8 to half
%10 = insertelement <4 x half> %7, half %9, i32 2
; Lane 3
%11 = extractelement <4 x i8> %0, i64 3
%12 = uitofp i8 %11 to half
%13 = insertelement <4 x half> %10, half %12, i32 3
ret <4 x half> %13
}
declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)
attributes #0 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+dl-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx9-insts,+s-memrealtime,-code-object-v3,-sram-ecc,-xnack" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fp64-fp16-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }

View File

@ -1,8 +1,13 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
@ -12,6 +17,11 @@ declare half @llvm.fabs.f16(half) #1
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
%r0 = load half, half addrspace(1)* %in1
@ -23,13 +33,21 @@ define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1
}
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@ -45,13 +63,21 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half add
}
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@ -67,17 +93,25 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half add
}
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
half addrspace(1)* %in1,
half addrspace(1)* %in2) #0 {
@ -96,17 +130,25 @@ define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
}
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
half addrspace(1)* %in1,
half addrspace(1)* %in2) #0 {
@ -125,11 +167,17 @@ define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
}
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@ -145,13 +193,20 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half
}
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@ -169,13 +224,20 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out,
}
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@ -193,11 +255,14 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half
}
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@ -215,18 +280,22 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half
}
; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@ -246,17 +315,23 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
}
; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@ -276,17 +351,21 @@ define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture
}
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@ -307,18 +386,22 @@ define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture
}
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@ -339,18 +422,24 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap
}
; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@ -372,18 +461,22 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture
}
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@ -404,17 +497,24 @@ define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture
}
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@ -432,17 +532,21 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half add
}
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid

View File

@ -2,6 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-DENORM %s
declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@ -23,6 +25,13 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half>
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]]
; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
; GFX10-FLUSH: buffer_store_short [[ADD]]
; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; GFX10-DENORM: buffer_store_short v[[C_F16]],
; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16(
half addrspace(1)* %r,
@ -53,6 +62,13 @@ define amdgpu_kernel void @fmuladd_f16(
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]]
; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
; GFX10-FLUSH: buffer_store_short [[ADD]]
; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
; GFX10-DENORM: buffer_store_short v[[C_F16]],
; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_a(
half addrspace(1)* %r,
@ -81,6 +97,12 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]
; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]]
; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
; GFX10-FLUSH: buffer_store_short [[ADD]]
; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
; GFX10-DENORM: buffer_store_short v[[C_F16]],
; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_b(
@ -107,6 +129,9 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; GFX10: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX10: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@ -116,7 +141,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
@ -127,7 +151,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
@ -144,6 +167,11 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; VI-DENORM-NOT: v_and_b32
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
; GFX10-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; GFX10-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]]
; GFX10-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @fmuladd_v2f16(
<2 x half> addrspace(1)* %r,

View File

@ -1,6 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,MAD,GFX10-MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
@ -12,7 +14,10 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@ -33,17 +38,20 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
; it.
; GCN-LABEL: {{^}}madak_2_use_f32:
; GFX8_9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX8_9: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN: s_endpgm
; GFX8_9_10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@ -70,7 +78,8 @@ define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, flo
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@ -94,7 +103,10 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %o
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@ -112,11 +124,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out
; We can't use an SGPR when forming madak
; GCN-LABEL: {{^}}s_v_madak_f32:
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
; FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@ -131,11 +145,13 @@ define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float
}
; GCN-LABEL: {{^}}v_s_madak_f32:
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GFX6_8_9-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@ -151,7 +167,9 @@ define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
%mul = fmul float %a, %b
%madak = fadd float %mul, 10.0
@ -160,12 +178,14 @@ define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, flo
}
; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GCN: s_endpgm
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@ -184,12 +204,14 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalia
}
; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GCN: s_endpgm
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@ -207,16 +229,18 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
ret void
}
; SIFoldOperands should not fold the SGPR copy into the instruction
; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
; because the implicit immediate already uses the constant bus.
; On GFX10+ we can use two scalar operands.
; GCN-LABEL: {{^}}madak_constant_bus_violation:
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GFX6: buffer_store_dword [[MUL]]
; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
; MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GFX6: buffer_store_dword [[MUL]]
; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
bb:
%tmp = icmp eq i32 %arg1, 0

View File

@ -0,0 +1,293 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1010 -check-prefix=GCN %s
# vop1_instructions: input for the si-peephole-sdwa run on gfx1010 (see the
# RUN line). The body surrounds V_MOV_B32, V_FRACT_F32, V_SIN_F32,
# V_CVT_U32_F32 and V_CVT_F32_I32 with 16-bit V_LSHRREV_B32/V_LSHLREV_B32
# shifts, first using the _e32 forms, then the _e64 forms, and finally the
# _e64 forms of the last four with some of their extra operands set to 1.
# The expectations below require each of them to come out as the matching
# _sdwa opcode.
# NOTE(review): %26 is assigned twice in the body (V_MOV_B32_e64, then
# V_LSHLREV_B32_e64 reading and redefining it) - presumably inherited from
# the test this was derived from; confirm before "fixing" it.
# GCN-LABEL: {{^}}name: vop1_instructions
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 6, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $exec
---
name: vop1_instructions
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vreg_64 }
- { id: 2, class: sreg_64 }
- { id: 3, class: vgpr_32 }
- { id: 4, class: sreg_32_xm0 }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: sreg_32_xm0 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32 }
- { id: 9, class: vgpr_32 }
- { id: 10, class: vgpr_32 }
- { id: 11, class: vgpr_32 }
- { id: 12, class: vgpr_32 }
- { id: 13, class: vgpr_32 }
- { id: 14, class: vgpr_32 }
- { id: 15, class: vgpr_32 }
- { id: 16, class: vgpr_32 }
- { id: 17, class: vgpr_32 }
- { id: 18, class: vgpr_32 }
- { id: 19, class: vgpr_32 }
- { id: 20, class: vgpr_32 }
- { id: 21, class: vgpr_32 }
- { id: 22, class: vgpr_32 }
- { id: 23, class: vgpr_32 }
- { id: 24, class: vgpr_32 }
- { id: 25, class: vgpr_32 }
- { id: 26, class: vgpr_32 }
- { id: 27, class: vgpr_32 }
- { id: 28, class: vgpr_32 }
- { id: 29, class: vgpr_32 }
- { id: 30, class: vgpr_32 }
- { id: 31, class: vgpr_32 }
- { id: 32, class: vgpr_32 }
- { id: 33, class: vgpr_32 }
- { id: 34, class: vgpr_32 }
- { id: 35, class: vgpr_32 }
- { id: 36, class: vgpr_32 }
- { id: 37, class: vgpr_32 }
- { id: 38, class: vgpr_32 }
- { id: 39, class: vgpr_32 }
- { id: 40, class: vgpr_32 }
- { id: 41, class: vgpr_32 }
- { id: 42, class: vgpr_32 }
- { id: 43, class: vgpr_32 }
- { id: 44, class: vgpr_32 }
- { id: 45, class: vgpr_32 }
- { id: 46, class: vgpr_32 }
- { id: 47, class: vgpr_32 }
- { id: 48, class: vgpr_32 }
- { id: 100, class: vgpr_32 }
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
%2 = COPY $sgpr30_sgpr31
%1 = COPY $vgpr2_vgpr3
%0 = COPY $vgpr0_vgpr1
%3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
%5 = S_MOV_B32 65535
%6 = S_MOV_B32 65535
%10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
%11 = V_MOV_B32_e32 %10, implicit $exec
%12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
%14 = V_FRACT_F32_e32 123, implicit $exec
%15 = V_LSHLREV_B32_e64 16, %14, implicit $exec
%16 = V_LSHRREV_B32_e64 16, %15, implicit $exec
%17 = V_SIN_F32_e32 %16, implicit $exec
%18 = V_LSHLREV_B32_e64 16, %17, implicit $exec
%19 = V_LSHRREV_B32_e64 16, %18, implicit $exec
%20 = V_CVT_U32_F32_e32 %19, implicit $exec
%21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
%23 = V_CVT_F32_I32_e32 123, implicit $exec
%24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
%25 = V_LSHRREV_B32_e64 16, %3, implicit $exec
%26 = V_MOV_B32_e64 %25, implicit $exec
%26 = V_LSHLREV_B32_e64 16, %26, implicit $exec
%27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $exec
%28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
%29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
%30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $exec
%31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
%32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
%33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $exec
%34 = V_LSHLREV_B32_e64 16, %33, implicit $exec
%35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $exec
%36 = V_LSHLREV_B32_e64 16, %35, implicit $exec
%37 = V_LSHRREV_B32_e64 16, %36, implicit $exec
%38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $exec
%39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
%40 = V_LSHRREV_B32_e64 16, %39, implicit $exec
%41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $exec
%42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
%43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
%44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $exec
%45 = V_LSHLREV_B32_e64 16, %44, implicit $exec
%46 = V_LSHRREV_B32_e64 16, %45, implicit $exec
%47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $exec
%48 = V_LSHLREV_B32_e64 16, %47, implicit $exec
%100 = V_MOV_B32_e32 %48, implicit $exec
FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
$sgpr30_sgpr31 = COPY %2
S_SETPC_B64_return $sgpr30_sgpr31
...
---
# vop2_instructions: two-source-operand counterpart of the test above, run
# through the same si-peephole-sdwa invocation. V_AND_B32, V_ADD_F32 and
# V_SUB_F16 (in _e32 form, _e64 form, and _e64 form with some extra operands
# set to 1) are expected to be rewritten to their _sdwa opcodes. Every
# V_FMAC_F32 / V_FMAC_F16 in the body is instead expected to remain in the
# _e32 or _e64 form it was written in, i.e. no _sdwa variant is expected for
# the fmac instructions on this target.
# GCN-LABEL: {{^}}name: vop2_instructions
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $exec
name: vop2_instructions
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vreg_64 }
- { id: 2, class: sreg_64 }
- { id: 3, class: vgpr_32 }
- { id: 4, class: sreg_32_xm0 }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: sreg_32_xm0 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32 }
- { id: 9, class: vgpr_32 }
- { id: 10, class: vgpr_32 }
- { id: 11, class: vgpr_32 }
- { id: 12, class: vgpr_32 }
- { id: 13, class: vgpr_32 }
- { id: 14, class: vgpr_32 }
- { id: 15, class: vgpr_32 }
- { id: 16, class: vgpr_32 }
- { id: 17, class: vgpr_32 }
- { id: 18, class: vgpr_32 }
- { id: 19, class: vgpr_32 }
- { id: 20, class: vgpr_32 }
- { id: 21, class: vgpr_32 }
- { id: 22, class: vgpr_32 }
- { id: 23, class: vgpr_32 }
- { id: 24, class: vgpr_32 }
- { id: 25, class: vgpr_32 }
- { id: 26, class: vgpr_32 }
- { id: 27, class: vgpr_32 }
- { id: 28, class: vgpr_32 }
- { id: 29, class: vgpr_32 }
- { id: 30, class: vgpr_32 }
- { id: 31, class: vgpr_32 }
- { id: 32, class: vgpr_32 }
- { id: 33, class: vgpr_32 }
- { id: 34, class: vgpr_32 }
- { id: 35, class: vgpr_32 }
- { id: 36, class: vgpr_32 }
- { id: 37, class: vgpr_32 }
- { id: 38, class: vgpr_32 }
- { id: 39, class: vgpr_32 }
- { id: 40, class: vgpr_32 }
- { id: 41, class: vgpr_32 }
- { id: 42, class: vgpr_32 }
- { id: 43, class: vgpr_32 }
- { id: 44, class: vgpr_32 }
- { id: 45, class: vgpr_32 }
- { id: 46, class: vgpr_32 }
- { id: 47, class: vgpr_32 }
- { id: 48, class: vgpr_32 }
- { id: 49, class: vgpr_32 }
- { id: 50, class: vgpr_32 }
- { id: 51, class: vgpr_32 }
- { id: 52, class: vgpr_32 }
- { id: 53, class: vgpr_32 }
- { id: 54, class: vgpr_32 }
- { id: 55, class: vgpr_32 }
- { id: 56, class: vgpr_32 }
- { id: 57, class: vgpr_32 }
- { id: 58, class: vgpr_32 }
- { id: 59, class: vgpr_32 }
- { id: 60, class: vgpr_32 }
- { id: 100, class: vgpr_32 }
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
%2 = COPY $sgpr30_sgpr31
%1 = COPY $vgpr2_vgpr3
%0 = COPY $vgpr0_vgpr1
%3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
%5 = S_MOV_B32 65535
%6 = S_MOV_B32 65535
%11 = V_LSHRREV_B32_e64 16, %3, implicit $exec
%12 = V_AND_B32_e32 %6, %11, implicit $exec
%13 = V_LSHLREV_B32_e64 16, %12, implicit $exec
%14 = V_LSHRREV_B32_e64 16, %13, implicit $exec
%15 = V_BFE_U32 %13, 8, 8, implicit $exec
%16 = V_ADD_F32_e32 %14, %15, implicit $exec
%17 = V_LSHLREV_B32_e64 16, %16, implicit $exec
%18 = V_LSHRREV_B32_e64 16, %17, implicit $exec
%19 = V_BFE_U32 %17, 8, 8, implicit $exec
%20 = V_SUB_F16_e32 %18, %19, implicit $exec
%21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
%22 = V_BFE_U32 %20, 8, 8, implicit $exec
%23 = V_FMAC_F32_e32 %21, %22, %22, implicit $exec
%24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
%25 = V_LSHRREV_B32_e64 16, %24, implicit $exec
%26 = V_BFE_U32 %24, 8, 8, implicit $exec
%27 = V_FMAC_F16_e32 %25, %26, %26, implicit $exec
%28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
%29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
%30 = V_AND_B32_e64 23, %29, implicit $exec
%31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
%32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
%33 = V_BFE_U32 %31, 8, 8, implicit $exec
%34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $exec
%35 = V_LSHLREV_B32_e64 16, %34, implicit $exec
%37 = V_BFE_U32 %35, 8, 8, implicit $exec
%38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $exec
%39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
%40 = V_BFE_U32 %39, 8, 8, implicit $exec
%41 = V_FMAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $exec
%42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
%43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
%44 = V_BFE_U32 %42, 8, 8, implicit $exec
%45 = V_FMAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $exec
%46 = V_LSHLREV_B32_e64 16, %45, implicit $exec
%47 = V_LSHRREV_B32_e64 16, %46, implicit $exec
%48 = V_BFE_U32 %46, 8, 8, implicit $exec
%49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $exec
%50 = V_LSHLREV_B32_e64 16, %49, implicit $exec
%51 = V_BFE_U32 %50, 8, 8, implicit $exec
%52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $exec
%53 = V_LSHLREV_B32_e64 16, %52, implicit $exec
%54 = V_BFE_U32 %53, 8, 8, implicit $exec
%55 = V_FMAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $exec
%56 = V_LSHLREV_B32_e64 16, %55, implicit $exec
%57 = V_LSHRREV_B32_e64 16, %56, implicit $exec
%58 = V_BFE_U32 %56, 8, 8, implicit $exec
%59 = V_FMAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $exec
%60 = V_LSHLREV_B32_e64 16, %59, implicit $exec
%100 = V_MOV_B32_e32 %60, implicit $exec
FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
$sgpr30_sgpr31 = COPY %2
S_SETPC_B64_return $sgpr30_sgpr31
...

View File

@ -0,0 +1,183 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
# test_fmamk_reg_imm_f32: the second source of V_FMAC_F32_e32 (%2) is a VGPR
# defined by V_MOV_B32_e32 of the literal 1078523331. After the
# twoaddressinstruction pass the instruction is expected to become
# V_FMAMK_F32 with that literal as its middle operand and the other two
# operands unchanged.
# GCN-LABEL: name: test_fmamk_reg_imm_f32
# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
---
name: test_fmamk_reg_imm_f32
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = V_MOV_B32_e32 1078523331, implicit $exec
%3 = V_FMAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $exec
...
# test_fmamk_imm_reg_f32: same as the reg_imm case but with the literal-holding
# VGPR (%2) as the FIRST source of V_FMAC_F32_e32. The expected V_FMAMK_F32
# is identical to the reg_imm case: the register operand ends up first and the
# literal 1078523331 in the middle, so the two multiplicands are swapped
# relative to the input.
# GCN-LABEL: name: test_fmamk_imm_reg_f32
# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
---
name: test_fmamk_imm_reg_f32
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = V_MOV_B32_e32 1078523331, implicit $exec
%3 = V_FMAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $exec
...
# test_fmaak_f32: here the literal-holding VGPR (%1) is the LAST source
# operand of V_FMAC_F32_e32. The instruction is expected to become
# V_FMAAK_F32 with the literal 1078523331 as its trailing operand and both
# register multiplicands kept in place.
# GCN-LABEL: name: test_fmaak_f32
# GCN: V_FMAAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
---
name: test_fmaak_f32
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = V_MOV_B32_e32 1078523331, implicit $exec
%2 = V_FMAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
...
# test_fmamk_reg_imm_f16: f16 variant of test_fmamk_reg_imm_f32. The input
# uses V_FMAC_F16_e32 with the literal-holding VGPR (%2) as second source;
# the expected result is V_FMAMK_F16 with the literal 1078523331 as the
# middle operand.
# GCN-LABEL: name: test_fmamk_reg_imm_f16
# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
---
name: test_fmamk_reg_imm_f16
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = V_MOV_B32_e32 1078523331, implicit $exec
%3 = V_FMAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $exec
...
# test_fmamk_imm_reg_f16: f16 variant of test_fmamk_imm_reg_f32. The
# literal-holding VGPR (%2) is the first source of V_FMAC_F16_e32; the
# expected V_FMAMK_F16 has the register operand first and the literal
# 1078523331 in the middle, i.e. the multiplicands are swapped relative to
# the input.
# GCN-LABEL: name: test_fmamk_imm_reg_f16
# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
---
name: test_fmamk_imm_reg_f16
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = V_MOV_B32_e32 1078523331, implicit $exec
%3 = V_FMAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $exec
...
# test_fmaak_f16: f16 variant of test_fmaak_f32. The literal-holding VGPR
# (%1) is the last source of V_FMAC_F16_e32; the expected result is
# V_FMAAK_F16 with the literal 1078523331 as the trailing operand.
# GCN-LABEL: name: test_fmaak_f16
# GCN: V_FMAAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
---
name: test_fmaak_f16
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = V_MOV_B32_e32 1078523331, implicit $exec
%2 = V_FMAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
...
# test_fmaak_sgpr_src0_f32: the first source of V_FMAC_F32_e32 is an SGPR
# (%0, class sreg_32_xm0) and the second is the literal-holding VGPR (%1).
# Note that, despite "fmaak" in the test name, the expected instruction is
# V_FMAMK_F32: the SGPR stays as the first operand and the literal
# 1078523331 replaces %1 in the middle.
# %3 is never defined in the body; it only serves as the accumulator input.
# GCN-LABEL: name: test_fmaak_sgpr_src0_f32
# GCN: %2:vgpr_32 = V_FMAMK_F32 killed %0, 1078523331, %3:vgpr_32, implicit $exec
---
name: test_fmaak_sgpr_src0_f32
registers:
- { id: 0, class: sreg_32_xm0 }
- { id: 1, class: vgpr_32}
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = V_MOV_B32_e32 1078523331, implicit $exec
%2 = V_FMAC_F32_e32 killed %0, %1, %3, implicit $exec
...
# test_fmaak_inlineimm_src0_f32: the first source of V_FMAC_F32_e32 is the
# immediate 1073741824 (an inline immediate, going by the test name) and the
# second is the literal-holding VGPR (%0). Despite "fmaak" in the name, the
# expected instruction is V_FMAMK_F32 with the immediate kept first and the
# literal 1078523331 in the middle.
# %2 is never defined in the body; it only serves as the accumulator input.
# GCN-LABEL: name: test_fmaak_inlineimm_src0_f32
# GCN: %1:vgpr_32 = V_FMAMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $exec
---
name: test_fmaak_inlineimm_src0_f32
registers:
- { id: 0, class: vgpr_32}
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = V_MOV_B32_e32 1078523331, implicit $exec
%1 = V_FMAC_F32_e32 1073741824, %0, %2, implicit $exec
...
# test_fmaak_otherimm_src0_f32: negative case. The first source is the
# immediate 1120403456 and the second is the literal-holding VGPR (%0); the
# instruction is expected to REMAIN V_FMAC_F32_e32 (no FMAMK/FMAAK form),
# with the destination %1 also appearing as the last source operand.
# NOTE(review): presumably the fold is rejected because src0 is already a
# literal that is not an inline constant - confirm against the pass.
# GCN-LABEL: name: test_fmaak_otherimm_src0_f32
# GCN: %1:vgpr_32 = V_FMAC_F32_e32 1120403456, %0, %1, implicit $exec
---
name: test_fmaak_otherimm_src0_f32
registers:
- { id: 0, class: vgpr_32}
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = V_MOV_B32_e32 1078523331, implicit $exec
%1 = V_FMAC_F32_e32 1120403456, %0, %2, implicit $exec
...
# test_fmaak_other_constantlike_src0_f32: negative case. The first source is
# the frame index %stack.0 (declared in the stack section below) and the
# second is the literal-holding VGPR (%0); the instruction is expected to
# REMAIN V_FMAC_F32_e32, with the destination %1 also appearing as the last
# source operand.
# NOTE(review): presumably a frame index is treated like a non-inline
# constant in src0 and therefore blocks the fold - confirm against the pass.
# GCN-LABEL: name: test_fmaak_other_constantlike_src0_f32
# GCN: %1:vgpr_32 = V_FMAC_F32_e32 %stack.0, %0, %1, implicit $exec
---
name: test_fmaak_other_constantlike_src0_f32
registers:
- { id: 0, class: vgpr_32}
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
stack:
- { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8,
callee-saved-register: '', local-offset: 0, debug-info-variable: '',
debug-info-expression: '', debug-info-location: '' }
body: |
bb.0:
%0 = V_MOV_B32_e32 1078523331, implicit $exec
%1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $exec
...