[AMDGPU] Combine DPP mov with use instructions (VOP1/2/3)

Introduces DPP pseudo instructions and the pass that combines DPP mov with subsequent uses.

Differential revision: https://reviews.llvm.org/D53762

llvm-svn: 347993
Valery Pykhtin 2018-11-30 14:21:56 +00:00
parent 445b0b6260
commit 3d9afa273f
15 changed files with 1039 additions and 51 deletions
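The combiner is off by default: the amdgpu-dpp-combine option added below is cl::init(false), and the pass runs in the machine-SSA pipeline right after SIFoldOperands. The RUN lines of the new tests show how to exercise it (the input file names here are placeholders):

llc -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine -verify-machineinstrs < input.ll
llc -march=amdgcn -mcpu=tonga -run-pass=gcn-dpp-combine -o - input.mir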

View File

@ -37,6 +37,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
// SI Passes
FunctionPass *createGCNDPPCombinePass();
FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
@ -93,6 +94,9 @@ extern char &AMDGPULowerKernelAttributesID;
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
void initializeGCNDPPCombinePass(PassRegistry &);
extern char &GCNDPPCombineID;
void initializeR600ClauseMergePassPass(PassRegistry &);
extern char &R600ClauseMergePassID;

View File

@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td"
include "llvm/Target/Target.td"
include "AMDGPUFeatures.td"
class BoolToList<bit Value> {
list<int> ret = !if(Value, [1]<int>, []<int>);
}
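// An empty list iterates zero times, so BoolToList lets a foreach gate a
// definition on a single bit. The idiom, as used later in this patch:
// foreach _ = BoolToList<P.HasExtDPP>.ret in
// def _dpp : VOP1_DPP_Pseudo <opName, P>;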
//===------------------------------------------------------------===//
// Subtarget Features (device properties)
//===------------------------------------------------------------===//

View File

@ -106,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole(
cl::desc("Enable SDWA peepholer"),
cl::init(true));
static cl::opt<bool> EnableDPPCombine(
"amdgpu-dpp-combine",
cl::desc("Enable DPP combiner"),
cl::init(false));
// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
@ -158,6 +163,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeR600VectorRegMergerPass(*PR);
initializeGlobalISel(*PR);
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
@ -790,6 +796,8 @@ void GCNPassConfig::addMachineSSAOptimization() {
//
// XXX - Can we get away without running DeadMachineInstructionElim again?
addPass(&SIFoldOperandsID);
if (EnableDPPCombine)
addPass(&GCNDPPCombineID);
addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
if (EnableSDWAPeephole) {

View File

@ -5275,12 +5275,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
// All DPP instructions with at least one source operand have a fake "old"
// source at the beginning that's tied to the dst operand. Handle it here.
if (Desc.getNumOperands() >= 2)
Inst.addOperand(Inst.getOperand(0));
for (unsigned E = Operands.size(); I != E; ++I) {
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
MCOI::TIED_TO);
if (TiedTo != -1) {
assert((unsigned)TiedTo < Inst.getNumOperands());
// handle tied old or src2 for MAC instructions
Inst.addOperand(Inst.getOperand(TiedTo));
}
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {

View File

@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
GCNILPSched.cpp
GCNDPPCombine.cpp
)
add_subdirectory(AsmParser)

View File

@ -0,0 +1,446 @@
//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// The pass combines the V_MOV_B32_dpp instruction with its VALU uses as a DPP
// src0 operand. If any of the use instructions cannot be combined with the mov
// the whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
// dpp_controls..., $bound_ctrl
// $res = VALU $dpp_value, ...
//
// to
//
// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
// dpp_controls..., $folded_bound_ctrl
//
// Combining rules:
//
// $bound_ctrl is DPP_BOUND_ZERO, $old is any
// $bound_ctrl is DPP_BOUND_OFF, $old is 0
// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
//
// $bound_ctrl is DPP_BOUND_OFF, $old is undef
// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
//
// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
// -> $folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
//===----------------------------------------------------------------------===//
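// A concrete instance from the MIR test added by this commit: with all VOP3
// modifiers at their defaults,
//
// %5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
// %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec
//
// becomes the single DPP instruction
//
// %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 1, 1, 1, implicit $exec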
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
using namespace llvm;
#define DEBUG_TYPE "gcn-dpp-combine"
STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
namespace {
class GCNDPPCombine : public MachineFunctionPass {
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
RegSubRegPair OldOpndVGPR,
MachineOperand &OldOpndValue) const;
MachineInstr *createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair OldOpndVGPR,
MachineOperand *OldOpnd,
bool BoundCtrlZero) const;
MachineInstr *createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair OldOpndVGPR,
bool BoundCtrlZero) const;
bool hasNoImmOrEqual(MachineInstr &MI,
unsigned OpndName,
int64_t Value,
int64_t Mask = -1) const;
bool combineDPPMov(MachineInstr &MI) const;
public:
static char ID;
GCNDPPCombine() : MachineFunctionPass(ID) {
initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "GCN DPP Combine"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // end anonymous namespace
INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
char GCNDPPCombine::ID = 0;
char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
FunctionPass *llvm::createGCNDPPCombinePass() {
return new GCNDPPCombine();
}
static int getDPPOp(unsigned Op) {
auto DPP32 = AMDGPU::getDPPOp32(Op);
if (DPP32 != -1)
return DPP32;
auto E32 = AMDGPU::getVOPe32(Op);
return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
}
// Tracks the register operand definition and returns:
// 1. the immediate operand used to initialize the register, if found
// 2. nullptr if the register operand is undef
// 3. the operand itself otherwise
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
if (!Def)
return nullptr;
switch(Def->getOpcode()) {
default: break;
case AMDGPU::IMPLICIT_DEF:
return nullptr;
case AMDGPU::COPY:
case AMDGPU::V_MOV_B32_e32: {
auto &Op1 = Def->getOperand(1);
if (Op1.isImm())
return &Op1;
break;
}
}
return &OldOpnd;
}
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair OldOpndVGPR,
bool BoundCtrlZero) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
auto OrigOp = OrigMI.getOpcode();
auto DPPOp = getDPPOp(OrigOp);
if (DPPOp == -1) {
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
return nullptr;
}
auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
OrigMI.getDebugLoc(), TII->get(DPPOp));
bool Fail = false;
do {
auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
assert(Dst);
DPPInst.add(*Dst);
int NumOperands = 1;
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
assert(OldIdx == NumOperands);
assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
++NumOperands;
}
if (auto *Mod0 = TII->getNamedOperand(OrigMI,
AMDGPU::OpName::src0_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src0_modifiers));
assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
DPPInst.addImm(Mod0->getImm());
++NumOperands;
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
Fail = true;
break;
}
DPPInst.add(*Src0);
++NumOperands;
if (auto *Mod1 = TII->getNamedOperand(OrigMI,
AMDGPU::OpName::src1_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src1_modifiers));
assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
DPPInst.addImm(Mod1->getImm());
++NumOperands;
}
if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
break;
}
DPPInst.add(*Src1);
++NumOperands;
}
if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
Fail = true;
break;
}
DPPInst.add(*Src2);
}
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
DPPInst.addImm(BoundCtrlZero ? 1 : 0);
} while (false);
if (Fail) {
DPPInst.getInstr()->eraseFromParent();
return nullptr;
}
LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr());
return DPPInst.getInstr();
}
GCNDPPCombine::RegSubRegPair
GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
RegSubRegPair OldOpndVGPR,
MachineOperand &OldOpndValue) const {
assert(OldOpndValue.isImm());
switch (OrigMI.getOpcode()) {
default: break;
case AMDGPU::V_MAX_U32_e32:
if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
return OldOpndVGPR;
break;
case AMDGPU::V_MAX_I32_e32:
if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
return OldOpndVGPR;
break;
case AMDGPU::V_MIN_I32_e32:
if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
return OldOpndVGPR;
break;
case AMDGPU::V_MUL_I32_I24_e32:
case AMDGPU::V_MUL_U32_U24_e32:
if (OldOpndValue.getImm() == 1) {
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
assert(Src1 && Src1->isReg());
return getRegSubRegPair(*Src1);
}
break;
}
return RegSubRegPair();
}
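// The folds above are sound because, with bound_ctrl off, a disabled lane of
// the combined instruction keeps $folded_old, so $folded_old must equal what
// the original use computed from $old: max(INT32_MAX, x) == INT32_MAX and
// min(INT32_MIN, x) == INT32_MIN, hence the register holding the immediate is
// reusable as-is, while 1 * x == x for the 24-bit multiplies, so src1 itself
// becomes the folded old value.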
// Cases to combine:
// $bound_ctrl is DPP_BOUND_ZERO, $old is any
// $bound_ctrl is DPP_BOUND_OFF, $old is 0
// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
// $bound_ctrl is DPP_BOUND_OFF, $old is undef
// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair OldOpndVGPR,
MachineOperand *OldOpndValue,
bool BoundCtrlZero) const {
assert(OldOpndVGPR.Reg);
if (!BoundCtrlZero && OldOpndValue) {
assert(OldOpndValue->isImm());
OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
if (!OldOpndVGPR.Reg) {
LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n");
return nullptr;
}
}
return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
}
// Returns true if MI has no immediate operand named OpndName, or if that
// operand's value, masked with Mask, equals Value
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
int64_t Value, int64_t Mask) const {
auto *Imm = TII->getNamedOperand(MI, OpndName);
if (!Imm)
return true;
assert(Imm->isImm());
return (Imm->getImm() & Mask) == Value;
}
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
assert(BCZOpnd && BCZOpnd->isImm());
bool BoundCtrlZero = 0 != BCZOpnd->getImm();
LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
assert(OldOpnd && OldOpnd->isReg());
auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
auto *OldOpndValue = getOldOpndValue(*OldOpnd);
assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
if (OldOpndValue) {
if (BoundCtrlZero) {
OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
OldOpndValue = nullptr;
} else {
if (!OldOpndValue->isImm()) {
LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n");
return false;
}
if (OldOpndValue->getImm() == 0) {
OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
OldOpndValue = nullptr;
BoundCtrlZero = true;
}
}
}
LLVM_DEBUG(dbgs() << " old=";
if (!OldOpndValue)
dbgs() << "undef";
else
dbgs() << OldOpndValue->getImm();
dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
std::vector<MachineInstr*> OrigMIs, DPPMIs;
if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
OldOpndVGPR = RegSubRegPair(
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
DPPMIs.push_back(UndefInst.getInstr());
}
OrigMIs.push_back(&MovMI);
bool Rollback = true;
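// All-or-nothing: DPP forms of the uses are built speculatively into DPPMIs;
// if any use of the mov's vdst fails to combine, Rollback stays set and the
// speculative instructions are erased below, otherwise the mov and the
// original uses collected in OrigMIs are erased instead.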
for (auto &Use : MRI->use_nodbg_operands(
TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
Rollback = true;
auto &OrigMI = *Use.getParent();
auto OrigOp = OrigMI.getOpcode();
if (TII->isVOP3(OrigOp)) {
if (!TII->hasVALU32BitEncoding(OrigOp)) {
LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
break;
}
// check if modifiers other than abs|neg are set (opsel for example)
const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
break;
}
} else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
break;
}
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
OldOpndValue, BoundCtrlZero)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
} else if (OrigMI.isCommutable() &&
&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
auto *BB = OrigMI.getParent();
auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
BB->insert(OrigMI, NewMI);
if (TII->commuteInstruction(*NewMI)) {
LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
OldOpndValue, BoundCtrlZero)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
} else
LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
NewMI->eraseFromParent();
} else
LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
if (Rollback)
break;
OrigMIs.push_back(&OrigMI);
}
for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
MI->eraseFromParent();
return !Rollback;
}
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
auto &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasDPP() || skipFunction(MF.getFunction()))
return false;
MRI = &MF.getRegInfo();
TII = ST.getInstrInfo();
assert(MRI->isSSA() && "Must be run on SSA");
bool Changed = false;
for (auto &MBB : MF) {
for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
auto &MI = *I++;
if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
Changed = true;
++NumDPPMovsCombined;
}
}
}
return Changed;
}

View File

@ -5632,3 +5632,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
static
TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
assert(RegOpnd.isReg());
return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
getRegSubRegPair(RegOpnd);
}
TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
assert(MI.isRegSequence());
for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
auto &RegOp = MI.getOperand(1 + 2 * I);
return getRegOrUndef(RegOp);
}
return TargetInstrInfo::RegSubRegPair();
}
// Try to find the definition of reg:subreg in subreg-manipulation pseudos
// Following a subreg of reg:subreg isn't supported
static bool followSubRegDef(MachineInstr &MI,
TargetInstrInfo::RegSubRegPair &RSR) {
if (!RSR.SubReg)
return false;
switch (MI.getOpcode()) {
default: break;
case AMDGPU::REG_SEQUENCE:
RSR = getRegSequenceSubReg(MI, RSR.SubReg);
return true;
// EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
case AMDGPU::INSERT_SUBREG:
if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
// inserted the subreg we're looking for
RSR = getRegOrUndef(MI.getOperand(2));
else { // the subreg in the rest of the reg
auto R1 = getRegOrUndef(MI.getOperand(1));
if (R1.SubReg) // subreg of subreg isn't supported
return false;
RSR.Reg = R1.Reg;
}
return true;
}
return false;
}
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
return nullptr;
auto RSR = P;
auto *DefInst = MRI.getVRegDef(RSR.Reg);
while (auto *MI = DefInst) {
DefInst = nullptr;
switch (MI->getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::V_MOV_B32_e32: {
auto &Op1 = MI->getOperand(1);
if (Op1.isReg() &&
TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
if (Op1.isUndef())
return nullptr;
RSR = getRegSubRegPair(Op1);
DefInst = MRI.getVRegDef(RSR.Reg);
}
break;
}
default:
if (followSubRegDef(*MI, RSR)) {
if (!RSR.Reg)
return nullptr;
DefInst = MRI.getVRegDef(RSR.Reg);
}
}
if (!DefInst)
return MI;
}
return nullptr;
}
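A worked example of this def-chasing, taken from the mul_old_subreg MIR test added by this commit:

%4 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5 = INSERT_SUBREG %4, %1, %subreg.sub1
%6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec

For %5.sub0 the walk steps through the INSERT_SUBREG (sub0 is taken from operand %4) and then through the REG_SEQUENCE to reach %2 = V_MOV_B32_e32 1, which is how the combiner learns that the mov's $old holds the immediate 1.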

View File

@ -917,9 +917,36 @@ public:
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
};
/// \brief Return true if the reg:subreg pair P has the register class TRC
inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P,
const TargetRegisterClass &TRC,
MachineRegisterInfo &MRI) {
auto *RC = MRI.getRegClass(P.Reg);
if (!P.SubReg)
return RC == &TRC;
auto *TRI = MRI.getTargetRegisterInfo();
return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg);
}
/// \brief Create RegSubRegPair from a register MachineOperand
inline
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) {
assert(O.isReg());
return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg());
}
/// \brief Return the source reg:subreg pair that the REG_SEQUENCE MI writes
/// to subregister index SubReg
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
unsigned SubReg);
/// \brief Return the defining instruction for a given reg:subreg pair
/// skipping copy like instructions and subreg-manipulation pseudos.
/// Following another subreg of a reg:subreg isn't supported.
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI);
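A minimal usage sketch of these helpers, modeled on how the new GCNDPPCombine pass uses them (the MovMI, TII and MRI names are assumptions borrowed from that pass):

auto P = getRegSubRegPair(*TII->getNamedOperand(MovMI, AMDGPU::OpName::old));
assert(isOfRegClass(P, AMDGPU::VGPR_32RegClass, *MRI));
MachineInstr *Def = getVRegSubRegDef(P, *MRI); // nullptr for undef or physregs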
namespace AMDGPU {
LLVM_READONLY
@ -931,6 +958,9 @@ namespace AMDGPU {
LLVM_READONLY
int getSDWAOp(uint16_t Opcode);
LLVM_READONLY
int getDPPOp32(uint16_t Opcode);
LLVM_READONLY
int getBasicFromSDWAOp(uint16_t Opcode);

View File

@ -1622,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
0, // 64-bit dst - No DPP or SDWA for 64-bit operands
!if(!eq(Src0VT.Size, 64),
0, // 64-bit src0
!if(!eq(Src0VT.Size, 64),
!if(!eq(Src1VT.Size, 64),
0, // 64-bit src1
1
)
@ -1631,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
);
}
class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
bit ret = !if(!eq(NumSrcArgs, 0), 0,
getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}
class BitOr<bit a, bit b> {
bit ret = !if(a, 1, !if(b, 1, 0));
}
@ -1710,7 +1716,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSDWAOMod = isFloatType<DstVT>.ret;
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtDPP = HasExt;
field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA = HasExt;
field bit HasExtSDWA9 = HasExt;
field int NeedPatGen = PatGenMode.NoPattern;
@ -1741,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
getOpSelMod<Src0VT>.ret,
getOpSelMod<Src1VT>.ret,
getOpSelMod<Src2VT>.ret>.ret;
field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsDPP = !if(HasExtDPP,
getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
(ins));
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
@ -1756,7 +1764,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
HasSrc0FloatMods,
HasSrc1FloatMods,
HasSrc2FloatMods>.ret;
field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
field string AsmDPP = !if(HasExtDPP,
getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
}
@ -1931,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping {
let ValueCols = [["Default"]];
}
// Maps ordinary instructions to their DPP counterparts
def getDPPOp32 : InstrMapping {
let FilterClass = "VOP";
let RowFields = ["OpName"];
let ColFields = ["AsmVariantName"];
let KeyCol = ["Default"];
let ValueCols = [["DPP"]];
}
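This mapping is what generates the AMDGPU::getDPPOp32 lookup consumed by getDPPOp in GCNDPPCombine.cpp; a sketch of the lookup (the opcode pair is taken from the new MIR test):

int DPPOp = AMDGPU::getDPPOp32(AMDGPU::V_ADD_U32_e32); // V_ADD_U32_dpp, or -1 if none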
// Maps a commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";

View File

@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP1";
}
class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOP_DPP_Pseudo <OpName, P, pattern> {
}
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret =
!if(P.HasModifiers,
@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP1_DPP_Pseudo <opName, P>;
}
// Special profile for instructions which have clamp
@ -500,13 +506,8 @@ defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
// VI
//===----------------------------------------------------------------------===//
class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
VOP_DPP <ps.OpName, P> {
let Defs = ps.Defs;
let Uses = ps.Uses;
let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
VOP_DPPe <P> {
bits<8> vdst;
let Inst{8-0} = 0xfa; // dpp
let Inst{16-9} = op;
@ -544,9 +545,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_vi :
VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
}
defm V_NOP : VOP1_Real_vi <0x0>;
@ -717,9 +719,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
}
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;

View File

@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP2";
}
class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOP_DPP_Pseudo <OpName, P, pattern> {
}
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst,
@ -155,7 +160,12 @@ multiclass VOP2Inst<string opName,
bit GFX9Renamed = 0> :
VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed>;
VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
let renamedInGFX9 = GFX9Renamed in {
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
}
multiclass VOP2bInst <string opName,
VOPProfile P,
@ -172,6 +182,8 @@ multiclass VOP2bInst <string opName,
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@ -194,6 +206,9 @@ multiclass VOP2eInst <string opName,
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@ -233,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins DstRCDPP:$old,
Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@ -778,13 +793,8 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
// VI
//===----------------------------------------------------------------------===//
class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
VOP_DPP <OpName, P> {
let Defs = ps.Defs;
let Uses = ps.Uses;
let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
VOP_DPPe <P> {
bits<8> vdst;
bits<8> src1;
let Inst{8-0} = 0xfa; // dpp
@ -865,8 +875,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
let AsmString = AsmName # ps.AsmOperands;
}
def _dpp :
VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_vi :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
let AsmString = AsmName # ps.AsmOperands;
}
}
}
@ -893,10 +908,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
let AsmString = AsmName # ps.AsmOperands;
}
def _dpp_gfx9 :
VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
let DecoderNamespace = "SDWA9";
}
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
let AsmString = AsmName # ps.AsmOperands;
let DecoderNamespace = "SDWA9";
}
}
multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
@ -914,19 +933,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
}
def _dpp_gfx9 :
VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "SDWA9";
}
foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA9";
}
}
} // AssemblerPredicates = [isGFX9]
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_vi :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;

View File

@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
let Inst{63-60} = row_mask;
}
class VOP_DPP <string OpName, VOPProfile P> :
InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
VOP_DPPe<P> {
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
VOP <OpName>,
SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
MnemonicAlias <OpName#"_dpp", OpName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let mayLoad = 0;
let mayStore = 0;
@ -517,6 +522,11 @@ class VOP_DPP <string OpName, VOPProfile P> :
let VALU = 1;
let DPP = 1;
let Size = 8;
let Uses = [EXEC];
let isConvergent = 1;
string Mnemonic = OpName;
string AsmOperands = P.AsmDPP;
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
let SubtargetPredicate = HasDPP;
@ -526,6 +536,36 @@ class VOP_DPP <string OpName, VOPProfile P> :
let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
let DecoderNamespace = "DPP";
VOPProfile Pfl = P;
}
class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let isPseudo = 0;
let isCodeGenOnly = 0;
let Defs = ps.Defs;
let Uses = ps.Uses;
let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
// Copy relevant pseudo op flags
let isConvergent = ps.isConvergent;
let SubtargetPredicate = ps.SubtargetPredicate;
let AssemblerPredicate = ps.AssemblerPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let DecoderNamespace = ps.DecoderNamespace;
let TSFlags = ps.TSFlags;
}
class getNumNodeArgs<SDPatternOperator Op> {

View File

@ -0,0 +1,185 @@
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine -verify-machineinstrs < %s | FileCheck %s
; VOP2 with a literal operand cannot be combined
; CHECK-LABEL: {{^}}dpp_combine_i32_literal:
; CHECK: v_mov_b32_dpp [[OLD:v[0-9]+]], {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x1 bound_ctrl:0
; CHECK: v_add_u32_e32 {{v[0-9]+}}, vcc, 42, [[OLD]]
define amdgpu_kernel void @dpp_combine_i32_literal(i32 addrspace(1)* %out, i32 %in) {
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 2, i32 1, i1 1) #0
%res = add nsw i32 %dpp, 42
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_i32_bz:
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
define amdgpu_kernel void @dpp_combine_i32_bz(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
%res = add nsw i32 %dpp, %x
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_undef:
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
define amdgpu_kernel void @dpp_combine_i32_boff_undef(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
%res = add nsw i32 %dpp, %x
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_0:
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
define amdgpu_kernel void @dpp_combine_i32_boff_0(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
%res = add nsw i32 %dpp, %x
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_max:
; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], -2
; CHECK: v_max_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
define amdgpu_kernel void @dpp_combine_i32_boff_max(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 2147483647, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
%cmp = icmp sge i32 %dpp, %x
%res = select i1 %cmp, i32 %dpp, i32 %x
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_min:
; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], 1
; CHECK: v_min_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
define amdgpu_kernel void @dpp_combine_i32_boff_min(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
%cmp = icmp sle i32 %dpp, %x
%res = select i1 %cmp, i32 %dpp, i32 %x
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_mul:
; CHECK: v_mul_i32_i24_dpp v0, v3, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
define amdgpu_kernel void @dpp_combine_i32_boff_mul(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 1, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
%dpp.shl = shl i32 %dpp, 8
%dpp.24 = ashr i32 %dpp.shl, 8
%x.shl = shl i32 %x, 8
%x.24 = ashr i32 %x.shl, 8
%res = mul i32 %dpp.24, %x.24
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_i32_commute:
; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
define amdgpu_kernel void @dpp_combine_i32_commute(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 2, i32 1, i32 1, i1 1) #0
%res = sub nsw i32 %x, %dpp
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_f32:
; CHECK: v_add_f32_dpp {{v[0-9]+}}, {{v[0-9]+}}, v0 quad_perm:[3,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
define amdgpu_kernel void @dpp_combine_f32(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 3, i32 1, i32 1, i1 1) #0
%dpp.f32 = bitcast i32 %dpp to float
%x.f32 = bitcast i32 %x to float
%res.f32 = fadd float %x.f32, %dpp.f32
%res = bitcast float %res.f32 to i32
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_test_f32_mods:
; CHECK: v_mul_f32_dpp {{v[0-9]+}}, |{{v[0-9]+}}|, -v0 quad_perm:[0,1,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
define amdgpu_kernel void @dpp_combine_test_f32_mods(i32 addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 4, i32 1, i32 1, i1 1) #0
%x.f32 = bitcast i32 %x to float
%x.f32.neg = fsub float -0.000000e+00, %x.f32
%dpp.f32 = bitcast i32 %dpp to float
%dpp.f32.cmp = fcmp fast olt float %dpp.f32, 0.000000e+00
%dpp.f32.sign = select i1 %dpp.f32.cmp, float -1.000000e+00, float 1.000000e+00
%dpp.f32.abs = fmul fast float %dpp.f32, %dpp.f32.sign
%res.f32 = fmul float %x.f32.neg, %dpp.f32.abs
%res = bitcast float %res.f32 to i32
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_mac:
; CHECK: v_mac_f32_dpp v0, {{v[0-9]+}}, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
define amdgpu_kernel void @dpp_combine_mac(float addrspace(1)* %out, i32 %in) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%y = tail call i32 @llvm.amdgcn.workitem.id.y()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
%dpp.f32 = bitcast i32 %dpp to float
%x.f32 = bitcast i32 %x to float
%y.f32 = bitcast i32 %y to float
%mult = fmul float %dpp.f32, %y.f32
%res = fadd float %mult, %x.f32
store float %res, float addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_sequence:
define amdgpu_kernel void @dpp_combine_sequence(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
br i1 %cmp, label %bb1, label %bb2
bb1:
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
%resadd = add nsw i32 %dpp, %x
br label %bb3
bb2:
; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
%ressub = sub nsw i32 %x, %dpp
br label %bb3
bb3:
%res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
store i32 %res, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: {{^}}dpp_combine_sequence_negative:
; CHECK: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
define amdgpu_kernel void @dpp_combine_sequence_negative(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
br i1 %cmp, label %bb1, label %bb2
bb1:
%resadd = add nsw i32 %dpp, %x
br label %bb3
bb2:
%ressub = sub nsw i32 2, %dpp ; break seq
br label %bb3
bb3:
%res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
store i32 %res, i32 addrspace(1)* %out
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
attributes #0 = { nounwind readnone convergent }

View File

@ -0,0 +1,143 @@
# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=gcn-dpp-combine -o - %s | FileCheck %s
# Test that the $old definition is correctly tracked through subreg-manipulation pseudos
---
# CHECK-LABEL: name: mul_old_subreg
# CHECK: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
name: mul_old_subreg
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_32 }
- { id: 4, class: vreg_64 }
- { id: 5, class: vreg_64 }
- { id: 6, class: vgpr_32 }
- { id: 7, class: vgpr_32 }
liveins:
- { reg: '$vgpr0', virtual-reg: '%0' }
- { reg: '$vgpr1', virtual-reg: '%1' }
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vreg_64 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
%3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
%4 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
%5 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4
%6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec
%7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec
...
# CHECK-LABEL: name: add_old_subreg
# CHECK: [[OLD:\%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
# CHECK: %5:vgpr_32 = V_ADD_U32_dpp [[OLD]], %1, %0.sub1, 1, 1, 1, 1, implicit $exec
name: add_old_subreg
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vreg_64 }
- { id: 4, class: vgpr_32 }
- { id: 5, class: vgpr_32 }
liveins:
- { reg: '$vgpr0', virtual-reg: '%0' }
- { reg: '$vgpr1', virtual-reg: '%1' }
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vreg_64 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
...
# CHECK-LABEL: name: add_old_subreg_undef
# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %3.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
name: add_old_subreg_undef
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vreg_64 }
- { id: 4, class: vgpr_32 }
- { id: 5, class: vgpr_32 }
liveins:
- { reg: '$vgpr0', virtual-reg: '%0' }
- { reg: '$vgpr1', virtual-reg: '%1' }
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vreg_64 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
...
# CHECK-LABEL: name: add_f32_e64
# CHECK: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
# CHECK: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
# CHECK: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 1, 1, 1, implicit $exec
# CHECK: %7:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 1, 1, 1, implicit $exec
# CHECK: %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
name: add_f32_e64
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32 }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_32 }
- { id: 4, class: vgpr_32 }
- { id: 5, class: vgpr_32 }
- { id: 6, class: vgpr_32 }
- { id: 7, class: vgpr_32 }
- { id: 8, class: vgpr_32 }
- { id: 9, class: vgpr_32 }
liveins:
- { reg: '$vgpr0', virtual-reg: '%0' }
- { reg: '$vgpr1', virtual-reg: '%1' }
body: |
bb.0:
liveins: $vgpr0, $vgpr1
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
; this shouldn't be combined as omod is set
%4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
%5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
; this should be combined as all modifiers are default
%6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec
; this should be combined as modifiers other than abs|neg are default
%7:vgpr_32 = V_ADD_F32_e64 1, %5, 2, %0, 0, 0, implicit $exec
%8:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
; this shouldn't be combined as modifiers aren't abs|neg
%9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
...

View File

@ -116,7 +116,6 @@ v_add_f32 v0, |v0|, -v0 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
//===----------------------------------------------------------------------===//
// NOSICI: error:
// VI9: v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x00,0x00,0x7e,0x00,0x01,0x09,0xa1]
v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
// NOSICI: error: