[AMDGPU] Combine DPP mov with use instructions (VOP1/2/3)
Introduces DPP pseudo instructions and the pass that combines DPP mov with subsequent uses. Differential revision: https://reviews.llvm.org/D53762 llvm-svn: 347993
This commit is contained in:
parent
445b0b6260
commit
3d9afa273f
|
@ -37,6 +37,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
|
|||
FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
|
||||
|
||||
// SI Passes
|
||||
FunctionPass *createGCNDPPCombinePass();
|
||||
FunctionPass *createSIAnnotateControlFlowPass();
|
||||
FunctionPass *createSIFoldOperandsPass();
|
||||
FunctionPass *createSIPeepholeSDWAPass();
|
||||
|
@ -93,6 +94,9 @@ extern char &AMDGPULowerKernelAttributesID;
|
|||
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
|
||||
extern char &AMDGPURewriteOutArgumentsID;
|
||||
|
||||
void initializeGCNDPPCombinePass(PassRegistry &);
|
||||
extern char &GCNDPPCombineID;
|
||||
|
||||
void initializeR600ClauseMergePassPass(PassRegistry &);
|
||||
extern char &R600ClauseMergePassID;
|
||||
|
||||
|
|
|
@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td"
|
|||
include "llvm/Target/Target.td"
|
||||
include "AMDGPUFeatures.td"
|
||||
|
||||
// Converts a bit to a list: a one-element list when Value is set, an empty
// list otherwise. Used with `foreach _ = BoolToList<...>.ret in` to
// conditionally instantiate defs inside a multiclass.
class BoolToList<bit Value> {
  list<int> ret = !if(Value, [1]<int>, []<int>);
}
|
||||
|
||||
//===------------------------------------------------------------===//
|
||||
// Subtarget Features (device properties)
|
||||
//===------------------------------------------------------------===//
|
||||
|
|
|
@ -106,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole(
|
|||
cl::desc("Enable SDWA peepholer"),
|
||||
cl::init(true));
|
||||
|
||||
static cl::opt<bool> EnableDPPCombine(
|
||||
"amdgpu-dpp-combine",
|
||||
cl::desc("Enable DPP combiner"),
|
||||
cl::init(false));
|
||||
|
||||
// Enable address space based alias analysis
|
||||
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
|
||||
cl::desc("Enable AMDGPU Alias Analysis"),
|
||||
|
@ -158,6 +163,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
|
|||
initializeR600VectorRegMergerPass(*PR);
|
||||
initializeGlobalISel(*PR);
|
||||
initializeAMDGPUDAGToDAGISelPass(*PR);
|
||||
initializeGCNDPPCombinePass(*PR);
|
||||
initializeSILowerI1CopiesPass(*PR);
|
||||
initializeSIFixSGPRCopiesPass(*PR);
|
||||
initializeSIFixVGPRCopiesPass(*PR);
|
||||
|
@ -790,6 +796,8 @@ void GCNPassConfig::addMachineSSAOptimization() {
|
|||
//
|
||||
// XXX - Can we get away without running DeadMachineInstructionElim again?
|
||||
addPass(&SIFoldOperandsID);
|
||||
if (EnableDPPCombine)
|
||||
addPass(&GCNDPPCombineID);
|
||||
addPass(&DeadMachineInstructionElimID);
|
||||
addPass(&SILoadStoreOptimizerID);
|
||||
if (EnableSDWAPeephole) {
|
||||
|
|
|
@ -5275,12 +5275,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
|
|||
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
|
||||
}
|
||||
|
||||
// All DPP instructions with at least one source operand have a fake "old"
|
||||
// source at the beginning that's tied to the dst operand. Handle it here.
|
||||
if (Desc.getNumOperands() >= 2)
|
||||
Inst.addOperand(Inst.getOperand(0));
|
||||
|
||||
for (unsigned E = Operands.size(); I != E; ++I) {
|
||||
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
|
||||
MCOI::TIED_TO);
|
||||
if (TiedTo != -1) {
|
||||
assert((unsigned)TiedTo < Inst.getNumOperands());
|
||||
// handle tied old or src2 for MAC instructions
|
||||
Inst.addOperand(Inst.getOperand(TiedTo));
|
||||
}
|
||||
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
|
||||
// Add the register arguments
|
||||
if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
|
||||
|
|
|
@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen
|
|||
SIShrinkInstructions.cpp
|
||||
SIWholeQuadMode.cpp
|
||||
GCNILPSched.cpp
|
||||
GCNDPPCombine.cpp
|
||||
)
|
||||
|
||||
add_subdirectory(AsmParser)
|
||||
|
|
|
@ -0,0 +1,446 @@
|
|||
//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
|
||||
// operand. If any of the use instructions cannot be combined with the mov the
|
||||
// whole sequence is reverted.
|
||||
//
|
||||
// $old = ...
|
||||
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
|
||||
// dpp_controls..., $bound_ctrl
|
||||
// $res = VALU $dpp_value, ...
|
||||
//
|
||||
// to
|
||||
//
|
||||
// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
|
||||
// dpp_controls..., $folded_bound_ctrl
|
||||
//
|
||||
// Combining rules :
|
||||
//
|
||||
// $bound_ctrl is DPP_BOUND_ZERO, $old is any
|
||||
// $bound_ctrl is DPP_BOUND_OFF, $old is 0
|
||||
//
|
||||
// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
|
||||
// $bound_ctrl is DPP_BOUND_OFF, $old is undef
|
||||
//
|
||||
// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
|
||||
// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
|
||||
//
|
||||
// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/CodeGen/MachineBasicBlock.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineOperand.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include <cassert>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "gcn-dpp-combine"
|
||||
|
||||
STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
|
||||
|
||||
namespace {

/// Machine pass that folds a V_MOV_B32_dpp into its VALU (VOP1/2/3) uses,
/// producing the DPP-encoded variant of each use. Runs on SSA machine IR.
class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  // Resolves what feeds the dpp mov's "old" operand: the initializing
  // immediate if one is found, nullptr for undef, the operand itself
  // otherwise.
  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  // Attempts to fold the immediate "old" value for OrigMI's opcode;
  // returns an empty RegSubRegPair when the immediate cannot be folded.
  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
                            RegSubRegPair OldOpndVGPR,
                            MachineOperand &OldOpndValue) const;

  // Folds old/bound_ctrl per the rules in the file header, then builds the
  // DPP form of OrigMI. Returns nullptr if the combine is not possible.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair OldOpndVGPR,
                              MachineOperand *OldOpnd,
                              bool BoundCtrlZero) const;

  // Builds the DPP form of OrigMI with an already-folded old operand.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair OldOpndVGPR,
                              bool BoundCtrlZero) const;

  // True if MI has no OpndName immediate operand, or that operand masked
  // with Mask equals Value.
  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  // Tries to combine the given V_MOV_B32_dpp with all of its uses; on any
  // failure the whole sequence is rolled back. Returns true on success.
  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
|
||||
|
||||
INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

// Opaque pass identifier referenced by the target's pass pipeline setup.
char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

/// Factory for the GCN DPP combiner pass.
FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}
|
||||
|
||||
static int getDPPOp(unsigned Op) {
|
||||
auto DPP32 = AMDGPU::getDPPOp32(Op);
|
||||
if (DPP32 != -1)
|
||||
return DPP32;
|
||||
|
||||
auto E32 = AMDGPU::getVOPe32(Op);
|
||||
return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
|
||||
}
|
||||
|
||||
// tracks the register operand definition and returns:
// 1. immediate operand used to initialize the register if found
// 2. nullptr if the register operand is undef
// 3. the operand itself otherwise
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  // Walk to the defining instruction, looking through copies and
  // subreg-manipulation pseudos (requires SSA).
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch(Def->getOpcode()) {
  default: break;
  case AMDGPU::IMPLICIT_DEF:
    // Defined by IMPLICIT_DEF: treat as undef.
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32: {
    // If the register was initialized from an immediate, report that
    // immediate so the caller can try to fold it.
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}
|
||||
|
||||
/// Build the DPP variant of \p OrigMI, taking the DPP controls (dpp_ctrl,
/// row_mask, bank_mask) from \p MovMI, \p OldOpndVGPR as the "old" operand
/// and \p BoundCtrlZero as bound_ctrl. Operands are appended strictly in
/// encoding order, with legality checked for each source. Returns the new
/// instruction, or nullptr (nothing inserted) on failure.
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair OldOpndVGPR,
                                           bool BoundCtrlZero) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  // The combine only applies when OrigMI reads the dpp mov's result as src0.
  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());

  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
    return nullptr;
  }

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp));
  bool Fail = false;
  // do/while(false) gives a single break-out point; on failure the
  // half-built instruction is erased below.
  do {
    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
    assert(Dst);
    DPPInst.add(*Dst);
    // NumOperands tracks the index of the next operand to append.
    int NumOperands = 1;

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
      DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
      ++NumOperands;
    }

    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      // Only abs/neg modifiers are representable in the DPP encoding;
      // combineDPPMov has already rejected anything else.
      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    }
    // src0 of the new instruction is the dpp mov's source register (the
    // value read from the other lane), not OrigMI's src0.
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    ++NumOperands;

    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    }
    // Remaining sources come from OrigMI unchanged.
    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
    }

    // DPP controls are copied from the mov; bound_ctrl is the folded value.
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(BoundCtrlZero ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}
|
||||
|
||||
/// Try to fold an immediate "old" value for a known opcode of \p OrigMI.
/// Returns the register pair to use as the folded old operand, or an empty
/// pair when the immediate cannot be folded for this opcode.
GCNDPPCombine::RegSubRegPair
GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
                           RegSubRegPair OldOpndVGPR,
                           MachineOperand &OldOpndValue) const {
  assert(OldOpndValue.isImm());
  switch (OrigMI.getOpcode()) {
  default: break;
  // min/max: the immediate saturates the operation's result range, so the
  // original old register can be kept as-is.
  case AMDGPU::V_MAX_U32_e32:
    if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
      return OldOpndVGPR;
    break;
  case AMDGPU::V_MAX_I32_e32:
    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
      return OldOpndVGPR;
    break;
  case AMDGPU::V_MIN_I32_e32:
    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
      return OldOpndVGPR;
    break;

  // 24-bit multiply by 1: fold the old operand to src1.
  // NOTE(review): relies on mul-by-one semantics for disabled lanes —
  // verify against the combining rules in the file header.
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_U32_U24_e32:
    if (OldOpndValue.getImm() == 1) {
      auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
      assert(Src1 && Src1->isReg());
      return getRegSubRegPair(*Src1);
    }
    break;
  }
  // Empty pair signals "not foldable" to the caller.
  return RegSubRegPair();
}
|
||||
|
||||
// Cases to combine:
// $bound_ctrl is DPP_BOUND_ZERO, $old is any
// $bound_ctrl is DPP_BOUND_OFF, $old is 0
// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO

// $bound_ctrl is DPP_BOUND_OFF, $old is undef
// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF

// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF

/// Front-end for the combine: fold the immediate old value (if any) per the
/// cases above, then delegate to the operand-building overload. \p
/// OldOpndValue is null for the undef case. Returns nullptr on failure.
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair OldOpndVGPR,
                                           MachineOperand *OldOpndValue,
                                           bool BoundCtrlZero) const {
  assert(OldOpndVGPR.Reg);
  if (!BoundCtrlZero && OldOpndValue) {
    // bound_ctrl is off and old is a live immediate: only specific
    // opcode/immediate pairs can be folded (see foldOldOpnd).
    assert(OldOpndValue->isImm());
    OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
    if (!OldOpndVGPR.Reg) {
      LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
}
|
||||
|
||||
// returns true if MI doesn't have OpndName immediate operand or the
|
||||
// operand has Value
|
||||
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
|
||||
int64_t Value, int64_t Mask) const {
|
||||
auto *Imm = TII->getNamedOperand(MI, OpndName);
|
||||
if (!Imm)
|
||||
return true;
|
||||
|
||||
assert(Imm->isImm());
|
||||
return (Imm->getImm() & Mask) == Value;
|
||||
}
|
||||
|
||||
/// Attempt to fold \p MovMI (a V_MOV_B32_dpp) into every use of its result.
/// All-or-nothing: if any use cannot be combined, every DPP instruction
/// created so far is erased and the function returns false; on success the
/// mov and the original uses are erased instead.
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = 0 != BCZOpnd->getImm();

  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  assert(OldOpnd && OldOpnd->isReg());
  auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
  // Immediate initializer, nullptr (undef) or the operand itself.
  auto *OldOpndValue = getOldOpndValue(*OldOpnd);
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
  // Normalize old/bound_ctrl per the combining rules in the file header.
  if (OldOpndValue) {
    if (BoundCtrlZero) {
      // bound_ctrl:0 writes zero for invalid lanes, so old is never read.
      OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
      OldOpndValue = nullptr;
    } else {
      if (!OldOpndValue->isImm()) {
        LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n");
        return false;
      }
      if (OldOpndValue->getImm() == 0) {
        // old == 0 with bound_ctrl off is equivalent to bound_ctrl:0.
        OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
        OldOpndValue = nullptr;
        BoundCtrlZero = true;
      }
    }
  }

  LLVM_DEBUG(dbgs() << " old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << OldOpndValue->getImm();
    dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');

  // OrigMIs: instructions to erase on success; DPPMIs: on failure.
  std::vector<MachineInstr*> OrigMIs, DPPMIs;
  if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
    // Materialize an undef VGPR to stand in for the ignored old operand.
    OldOpndVGPR = RegSubRegPair(
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  for (auto &Use : MRI->use_nodbg_operands(
       TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
    Rollback = true;

    auto &OrigMI = *Use.getParent();
    auto OrigOp = OrigMI.getOpcode();
    if (TII->isVOP3(OrigOp)) {
      // A VOP3 use must shrink to e32 to gain a DPP form.
      if (!TII->hasVALU32BitEncoding(OrigOp)) {
        LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
        break;
      }
      // check if other than abs|neg modifiers are set (opsel for example)
      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
        LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
        break;
      }
    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
      LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
      break;
    }

    LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
      // Direct case: the mov's result is used as src0.
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
                                        OldOpndValue, BoundCtrlZero)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else if (OrigMI.isCommutable() &&
               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      // src1 use: try on a commuted clone so the dpp value becomes src0.
      // The clone is temporary and erased either way; only the DPP
      // instruction built from it (if any) survives.
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
                                          OldOpndValue, BoundCtrlZero)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    } else
      LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  // On rollback erase the new DPP instructions; otherwise erase the mov
  // and the replaced originals.
  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  return !Rollback;
}
|
||||
|
||||
/// Pass entry point: scan every block for V_MOV_B32_dpp instructions and
/// try to combine each with its uses. Requires SSA and DPP support.
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  auto &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TII = ST.getInstrInfo();

  assert(MRI->isSSA() && "Must be run on SSA");

  bool Changed = false;
  for (auto &MBB : MF) {
    // Advance the iterator before combineDPPMov runs, since a successful
    // combine erases MI (and its uses) from the block. Iteration is in
    // reverse over the block.
    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
      auto &MI = *I++;
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      }
    }
  }
  return Changed;
}
|
|
@ -5632,3 +5632,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
|
|||
|
||||
return MCOp;
|
||||
}
|
||||
|
||||
static
|
||||
TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
|
||||
assert(RegOpnd.isReg());
|
||||
return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
|
||||
getRegSubRegPair(RegOpnd);
|
||||
}
|
||||
|
||||
// Find the source of a REG_SEQUENCE that supplies subregister index
// SubReg; empty pair if the index isn't present (or that source is undef).
TargetInstrInfo::RegSubRegPair
llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
  assert(MI.isRegSequence());
  // After the def at operand 0, operands come in (register, subreg-index)
  // pairs.
  const unsigned NumOps = MI.getNumOperands();
  for (unsigned RegIdx = 1; RegIdx + 1 < NumOps; RegIdx += 2) {
    if (MI.getOperand(RegIdx + 1).getImm() == SubReg)
      return getRegOrUndef(MI.getOperand(RegIdx));
  }
  return TargetInstrInfo::RegSubRegPair();
}
|
||||
|
||||
// Try to find the definition of reg:subreg in subreg-manipulation pseudos
// Following a subreg of reg:subreg isn't supported
// Returns true when RSR was retargeted to a new reg:subreg to continue the
// walk from (RSR.Reg may be 0 for undef); false when MI is not a pseudo
// this function can look through.
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // inserted the subreg we're looking for
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // the subreg in the rest of the reg
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // subreg of subreg isn't supported
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}
|
||||
|
||||
/// Return the defining instruction of the reg:subreg pair \p P, looking
/// through COPY/V_MOV_B32_e32 of virtual registers and through
/// subreg-manipulation pseudos (REG_SEQUENCE/INSERT_SUBREG, see
/// followSubRegDef). Returns nullptr for physical registers or when the
/// chain reaches an undef source. Requires SSA.
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  // Each iteration either retargets DefInst one step up the def chain or
  // leaves it null, in which case the current MI is the answer.
  while (auto *MI = DefInst) {
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      // Only look through copies of virtual registers; a copy from a
      // physical register terminates the walk at MI itself.
      if (Op1.isReg() &&
          TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr; // reached an undef source
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}
|
||||
|
|
|
@ -917,9 +917,36 @@ public:
|
|||
/// Return -1 if the target-specific opcode for the pseudo instruction does
|
||||
/// not exist. If Opcode is not a pseudo instruction, this is identity.
|
||||
int pseudoToMCOpcode(int Opcode) const;
|
||||
|
||||
};
|
||||
|
||||
/// \brief Returns true if a reg:subreg pair P has a TRC class
|
||||
inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P,
|
||||
const TargetRegisterClass &TRC,
|
||||
MachineRegisterInfo &MRI) {
|
||||
auto *RC = MRI.getRegClass(P.Reg);
|
||||
if (!P.SubReg)
|
||||
return RC == &TRC;
|
||||
auto *TRI = MRI.getTargetRegisterInfo();
|
||||
return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg);
|
||||
}
|
||||
|
||||
/// \brief Create RegSubRegPair from a register MachineOperand
|
||||
inline
|
||||
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) {
|
||||
assert(O.isReg());
|
||||
return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg());
|
||||
}
|
||||
|
||||
/// \brief Return the SubReg component from REG_SEQUENCE
|
||||
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
|
||||
unsigned SubReg);
|
||||
|
||||
/// \brief Return the defining instruction for a given reg:subreg pair
|
||||
/// skipping copy like instructions and subreg-manipulation pseudos.
|
||||
/// Following another subreg of a reg:subreg isn't supported.
|
||||
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
|
||||
MachineRegisterInfo &MRI);
|
||||
|
||||
namespace AMDGPU {
|
||||
|
||||
LLVM_READONLY
|
||||
|
@ -931,6 +958,9 @@ namespace AMDGPU {
|
|||
LLVM_READONLY
|
||||
int getSDWAOp(uint16_t Opcode);
|
||||
|
||||
LLVM_READONLY
|
||||
int getDPPOp32(uint16_t Opcode);
|
||||
|
||||
LLVM_READONLY
|
||||
int getBasicFromSDWAOp(uint16_t Opcode);
|
||||
|
||||
|
|
|
@ -1622,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
|
|||
0, // 64-bit dst - No DPP or SDWA for 64-bit operands
|
||||
!if(!eq(Src0VT.Size, 64),
|
||||
0, // 64-bit src0
|
||||
!if(!eq(Src0VT.Size, 64),
|
||||
!if(!eq(Src1VT.Size, 64),
|
||||
0, // 64-bit src2
|
||||
1
|
||||
)
|
||||
|
@ -1631,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
|
|||
);
|
||||
}
|
||||
|
||||
// Whether a DPP variant can exist for this operand signature: requires at
// least one source operand, then defers to getHasExt for the remaining
// encoding constraints (e.g. no 64-bit operands).
class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
                 ValueType Src1VT = i32> {
  bit ret = !if(!eq(NumSrcArgs, 0), 0,
                getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}
|
||||
|
||||
// Logical OR of two bit values.
class BitOr<bit a, bit b> {
  bit ret = !if(a, 1, !if(b, 1, 0));
}
|
||||
|
@ -1710,7 +1716,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
|
|||
field bit HasSDWAOMod = isFloatType<DstVT>.ret;
|
||||
|
||||
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
|
||||
field bit HasExtDPP = HasExt;
|
||||
field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
|
||||
field bit HasExtSDWA = HasExt;
|
||||
field bit HasExtSDWA9 = HasExt;
|
||||
field int NeedPatGen = PatGenMode.NoPattern;
|
||||
|
@ -1741,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
|
|||
getOpSelMod<Src0VT>.ret,
|
||||
getOpSelMod<Src1VT>.ret,
|
||||
getOpSelMod<Src2VT>.ret>.ret;
|
||||
field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
|
||||
HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
|
||||
field dag InsDPP = !if(HasExtDPP,
|
||||
getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
|
||||
HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
|
||||
(ins));
|
||||
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
|
||||
HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
|
||||
DstVT>.ret;
|
||||
|
@ -1756,7 +1764,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
|
|||
HasSrc0FloatMods,
|
||||
HasSrc1FloatMods,
|
||||
HasSrc2FloatMods>.ret;
|
||||
field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
|
||||
field string AsmDPP = !if(HasExtDPP,
|
||||
getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
|
||||
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
|
||||
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
|
||||
}
|
||||
|
@ -1931,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping {
|
|||
let ValueCols = [["Default"]];
|
||||
}
|
||||
|
||||
// Maps ordinary instructions to their DPP counterparts
// Rows are keyed by OpName; the "Default" asm-variant column is the
// ordinary opcode and the "DPP" column its _dpp form. Generates the
// AMDGPU::getDPPOp32 lookup used by the DPP combiner.
def getDPPOp32 : InstrMapping {
  let FilterClass = "VOP";
  let RowFields = ["OpName"];
  let ColFields = ["AsmVariantName"];
  let KeyCol = ["Default"];
  let ValueCols = [["DPP"]];
}
|
||||
|
||||
// Maps a commuted opcode to its original version
|
||||
def getCommuteOrig : InstrMapping {
|
||||
let FilterClass = "Commutable_REV";
|
||||
|
|
|
@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
|
|||
let AsmMatchConverter = "cvtSdwaVOP1";
|
||||
}
|
||||
|
||||
// Pseudo for the DPP form of a VOP1 instruction; matched to real
// per-subtarget encodings (e.g. _dpp_vi) via VOP_DPP_Real.
class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
  VOP_DPP_Pseudo <OpName, P, pattern> {
}
|
||||
|
||||
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
|
||||
list<dag> ret =
|
||||
!if(P.HasModifiers,
|
||||
|
@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P,
|
|||
def _e32 : VOP1_Pseudo <opName, P>;
|
||||
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
|
||||
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
|
||||
foreach _ = BoolToList<P.HasExtDPP>.ret in
|
||||
def _dpp : VOP1_DPP_Pseudo <opName, P>;
|
||||
}
|
||||
|
||||
// Special profile for instructions which have clamp
|
||||
|
@ -500,13 +506,8 @@ defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
|
|||
// VI
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
|
||||
VOP_DPP <ps.OpName, P> {
|
||||
let Defs = ps.Defs;
|
||||
let Uses = ps.Uses;
|
||||
let SchedRW = ps.SchedRW;
|
||||
let hasSideEffects = ps.hasSideEffects;
|
||||
|
||||
class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
|
||||
VOP_DPPe <P> {
|
||||
bits<8> vdst;
|
||||
let Inst{8-0} = 0xfa; // dpp
|
||||
let Inst{16-9} = op;
|
||||
|
@ -544,9 +545,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
|
|||
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
|
||||
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
|
||||
|
||||
// For now left dpp only for asm/dasm
|
||||
// TODO: add corresponding pseudo
|
||||
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
|
||||
foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
|
||||
def _dpp_vi :
|
||||
VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
|
||||
VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
|
||||
}
|
||||
|
||||
defm V_NOP : VOP1_Real_vi <0x0>;
|
||||
|
@ -717,9 +719,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
|
|||
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
|
||||
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
|
||||
|
||||
// For now left dpp only for asm/dasm
|
||||
// TODO: add corresponding pseudo
|
||||
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
|
||||
foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
|
||||
def _dpp_gfx9 :
|
||||
VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
|
||||
VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
|
||||
|
||||
}
|
||||
|
||||
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
|
||||
|
|
|
@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
|
|||
let AsmMatchConverter = "cvtSdwaVOP2";
|
||||
}
|
||||
|
||||
// Pseudo for the DPP form of a VOP2 instruction; matched to real
// per-subtarget encodings (e.g. _dpp_vi) via VOP_DPP_Real.
class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
  VOP_DPP_Pseudo <OpName, P, pattern> {
}
|
||||
|
||||
|
||||
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
|
||||
list<dag> ret = !if(P.HasModifiers,
|
||||
[(set P.DstVT:$vdst,
|
||||
|
@ -155,7 +160,12 @@ multiclass VOP2Inst<string opName,
|
|||
bit GFX9Renamed = 0> :
|
||||
VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
|
||||
VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
|
||||
VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed>;
|
||||
VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
|
||||
let renamedInGFX9 = GFX9Renamed in {
|
||||
foreach _ = BoolToList<P.HasExtDPP>.ret in
|
||||
def _dpp : VOP2_DPP_Pseudo <opName, P>;
|
||||
}
|
||||
}
|
||||
|
||||
multiclass VOP2bInst <string opName,
|
||||
VOPProfile P,
|
||||
|
@ -172,6 +182,8 @@ multiclass VOP2bInst <string opName,
|
|||
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
|
||||
let AsmMatchConverter = "cvtSdwaVOP2b";
|
||||
}
|
||||
foreach _ = BoolToList<P.HasExtDPP>.ret in
|
||||
def _dpp : VOP2_DPP_Pseudo <opName, P>;
|
||||
}
|
||||
|
||||
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
|
||||
|
@ -194,6 +206,9 @@ multiclass VOP2eInst <string opName,
|
|||
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
|
||||
let AsmMatchConverter = "cvtSdwaVOP2b";
|
||||
}
|
||||
|
||||
foreach _ = BoolToList<P.HasExtDPP>.ret in
|
||||
def _dpp : VOP2_DPP_Pseudo <opName, P>;
|
||||
}
|
||||
|
||||
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
|
||||
|
@ -233,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
|
|||
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
|
||||
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
|
||||
0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
|
||||
let InsDPP = (ins DstRCDPP:$old,
|
||||
Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
|
||||
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
|
||||
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
|
||||
VGPR_32:$src2, // stub argument
|
||||
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
|
||||
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
|
||||
|
||||
|
@ -778,13 +793,8 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
|
|||
// VI
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
|
||||
VOP_DPP <OpName, P> {
|
||||
let Defs = ps.Defs;
|
||||
let Uses = ps.Uses;
|
||||
let SchedRW = ps.SchedRW;
|
||||
let hasSideEffects = ps.hasSideEffects;
|
||||
|
||||
class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
|
||||
VOP_DPPe <P> {
|
||||
bits<8> vdst;
|
||||
bits<8> src1;
|
||||
let Inst{8-0} = 0xfa; //dpp
|
||||
|
@ -865,8 +875,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
|
|||
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
|
||||
let AsmString = AsmName # ps.AsmOperands;
|
||||
}
|
||||
def _dpp :
|
||||
VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
|
||||
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
|
||||
def _dpp_vi :
|
||||
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
|
||||
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
|
||||
VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
|
||||
let AsmString = AsmName # ps.AsmOperands;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -893,10 +908,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
|
|||
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
|
||||
let AsmString = AsmName # ps.AsmOperands;
|
||||
}
|
||||
def _dpp_gfx9 :
|
||||
VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
|
||||
let DecoderNamespace = "SDWA9";
|
||||
}
|
||||
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
|
||||
def _dpp_gfx9 :
|
||||
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
|
||||
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
|
||||
VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
|
||||
let AsmString = AsmName # ps.AsmOperands;
|
||||
let DecoderNamespace = "SDWA9";
|
||||
}
|
||||
}
|
||||
|
||||
multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
|
||||
|
@ -914,19 +933,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
|
|||
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
|
||||
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
|
||||
}
|
||||
def _dpp_gfx9 :
|
||||
VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
|
||||
let DecoderNamespace = "SDWA9";
|
||||
}
|
||||
foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
|
||||
def _dpp_gfx9 :
|
||||
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
|
||||
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
|
||||
let DecoderNamespace = "SDWA9";
|
||||
}
|
||||
}
|
||||
|
||||
} // AssemblerPredicates = [isGFX9]
|
||||
|
||||
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
|
||||
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
|
||||
// For now left dpp only for asm/dasm
|
||||
// TODO: add corresponding pseudo
|
||||
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
|
||||
|
||||
foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
|
||||
def _dpp_vi :
|
||||
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
|
||||
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
|
||||
}
|
||||
|
||||
defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
|
||||
|
|
|
@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
|
|||
let Inst{63-60} = row_mask;
|
||||
}
|
||||
|
||||
class VOP_DPP <string OpName, VOPProfile P> :
|
||||
InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
|
||||
VOP_DPPe<P> {
|
||||
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
|
||||
InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
|
||||
VOP <OpName>,
|
||||
SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
|
||||
MnemonicAlias <OpName#"_dpp", OpName> {
|
||||
|
||||
let isPseudo = 1;
|
||||
let isCodeGenOnly = 1;
|
||||
|
||||
let mayLoad = 0;
|
||||
let mayStore = 0;
|
||||
|
@ -517,6 +522,11 @@ class VOP_DPP <string OpName, VOPProfile P> :
|
|||
let VALU = 1;
|
||||
let DPP = 1;
|
||||
let Size = 8;
|
||||
let Uses = [EXEC];
|
||||
let isConvergent = 1;
|
||||
|
||||
string Mnemonic = OpName;
|
||||
string AsmOperands = P.AsmDPP;
|
||||
|
||||
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
|
||||
let SubtargetPredicate = HasDPP;
|
||||
|
@ -526,6 +536,36 @@ class VOP_DPP <string OpName, VOPProfile P> :
|
|||
let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
|
||||
let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
|
||||
let DecoderNamespace = "DPP";
|
||||
|
||||
VOPProfile Pfl = P;
|
||||
}
|
||||
|
||||
class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
|
||||
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
|
||||
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
|
||||
|
||||
let isPseudo = 0;
|
||||
let isCodeGenOnly = 0;
|
||||
|
||||
let Defs = ps.Defs;
|
||||
let Uses = ps.Uses;
|
||||
let SchedRW = ps.SchedRW;
|
||||
let hasSideEffects = ps.hasSideEffects;
|
||||
|
||||
let Constraints = ps.Constraints;
|
||||
let DisableEncoding = ps.DisableEncoding;
|
||||
|
||||
// Copy relevant pseudo op flags
|
||||
let isConvergent = ps.isConvergent;
|
||||
let SubtargetPredicate = ps.SubtargetPredicate;
|
||||
let AssemblerPredicate = ps.AssemblerPredicate;
|
||||
let AsmMatchConverter = ps.AsmMatchConverter;
|
||||
let AsmVariantName = ps.AsmVariantName;
|
||||
let UseNamedOperandTable = ps.UseNamedOperandTable;
|
||||
let DecoderNamespace = ps.DecoderNamespace;
|
||||
let Constraints = ps.Constraints;
|
||||
let DisableEncoding = ps.DisableEncoding;
|
||||
let TSFlags = ps.TSFlags;
|
||||
}
|
||||
|
||||
class getNumNodeArgs<SDPatternOperator Op> {
|
||||
|
|
|
@ -0,0 +1,185 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; VOP2 with literal cannot be combined
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_literal:
|
||||
; CHECK: v_mov_b32_dpp [[OLD:v[0-9]+]], {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x1 bound_ctrl:0
|
||||
; CHECK: v_add_u32_e32 {{v[0-9]+}}, vcc, 42, [[OLD]]
|
||||
define amdgpu_kernel void @dpp_combine_i32_literal(i32 addrspace(1)* %out, i32 %in) {
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 2, i32 1, i1 1) #0
|
||||
%res = add nsw i32 %dpp, 42
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_bz:
|
||||
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
define amdgpu_kernel void @dpp_combine_i32_bz(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
|
||||
%res = add nsw i32 %dpp, %x
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_undef:
|
||||
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
define amdgpu_kernel void @dpp_combine_i32_boff_undef(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
|
||||
%res = add nsw i32 %dpp, %x
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_0:
|
||||
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
define amdgpu_kernel void @dpp_combine_i32_boff_0(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
|
||||
%res = add nsw i32 %dpp, %x
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_max:
|
||||
; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], -2
|
||||
; CHECK: v_max_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
define amdgpu_kernel void @dpp_combine_i32_boff_max(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 2147483647, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
|
||||
%cmp = icmp sge i32 %dpp, %x
|
||||
%res = select i1 %cmp, i32 %dpp, i32 %x
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_min:
|
||||
; CHECK: v_bfrev_b32_e32 [[OLD:v[0-9]+]], 1
|
||||
; CHECK: v_min_i32_dpp [[OLD]], {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
define amdgpu_kernel void @dpp_combine_i32_boff_min(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
|
||||
%cmp = icmp sle i32 %dpp, %x
|
||||
%res = select i1 %cmp, i32 %dpp, i32 %x
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_boff_mul:
|
||||
; CHECK: v_mul_i32_i24_dpp v0, v3, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
|
||||
define amdgpu_kernel void @dpp_combine_i32_boff_mul(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 1, i32 %in, i32 1, i32 1, i32 1, i1 0) #0
|
||||
|
||||
%dpp.shl = shl i32 %dpp, 8
|
||||
%dpp.24 = ashr i32 %dpp.shl, 8
|
||||
%x.shl = shl i32 %x, 8
|
||||
%x.24 = ashr i32 %x.shl, 8
|
||||
%res = mul i32 %dpp.24, %x.24
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_i32_commute:
|
||||
; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
define amdgpu_kernel void @dpp_combine_i32_commute(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 2, i32 1, i32 1, i1 1) #0
|
||||
%res = sub nsw i32 %x, %dpp
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_f32:
|
||||
; CHECK: v_add_f32_dpp {{v[0-9]+}}, {{v[0-9]+}}, v0 quad_perm:[3,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
define amdgpu_kernel void @dpp_combine_f32(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 3, i32 1, i32 1, i1 1) #0
|
||||
%dpp.f32 = bitcast i32 %dpp to float
|
||||
%x.f32 = bitcast i32 %x to float
|
||||
%res.f32 = fadd float %x.f32, %dpp.f32
|
||||
%res = bitcast float %res.f32 to i32
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_test_f32_mods:
|
||||
; CHECK: v_mul_f32_dpp {{v[0-9]+}}, |{{v[0-9]+}}|, -v0 quad_perm:[0,1,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
define amdgpu_kernel void @dpp_combine_test_f32_mods(i32 addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 4, i32 1, i32 1, i1 1) #0
|
||||
|
||||
%x.f32 = bitcast i32 %x to float
|
||||
%x.f32.neg = fsub float -0.000000e+00, %x.f32
|
||||
|
||||
%dpp.f32 = bitcast i32 %dpp to float
|
||||
%dpp.f32.cmp = fcmp fast olt float %dpp.f32, 0.000000e+00
|
||||
%dpp.f32.sign = select i1 %dpp.f32.cmp, float -1.000000e+00, float 1.000000e+00
|
||||
%dpp.f32.abs = fmul fast float %dpp.f32, %dpp.f32.sign
|
||||
|
||||
%res.f32 = fmul float %x.f32.neg, %dpp.f32.abs
|
||||
%res = bitcast float %res.f32 to i32
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_mac:
|
||||
; CHECK: v_mac_f32_dpp v0, {{v[0-9]+}}, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
define amdgpu_kernel void @dpp_combine_mac(float addrspace(1)* %out, i32 %in) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%y = tail call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
|
||||
%dpp.f32 = bitcast i32 %dpp to float
|
||||
%x.f32 = bitcast i32 %x to float
|
||||
%y.f32 = bitcast i32 %y to float
|
||||
|
||||
%mult = fmul float %dpp.f32, %y.f32
|
||||
%res = fadd float %mult, %x.f32
|
||||
store float %res, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_sequence:
|
||||
define amdgpu_kernel void @dpp_combine_sequence(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
|
||||
br i1 %cmp, label %bb1, label %bb2
|
||||
bb1:
|
||||
; CHECK: v_add_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
%resadd = add nsw i32 %dpp, %x
|
||||
br label %bb3
|
||||
bb2:
|
||||
; CHECK: v_subrev_u32_dpp {{v[0-9]+}}, vcc, {{v[0-9]+}}, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
%ressub = sub nsw i32 %x, %dpp
|
||||
br label %bb3
|
||||
bb3:
|
||||
%res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}dpp_combine_sequence_negative:
|
||||
; CHECK: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
|
||||
define amdgpu_kernel void @dpp_combine_sequence_negative(i32 addrspace(1)* %out, i32 %in, i1 %cmp) {
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in, i32 1, i32 1, i32 1, i1 1) #0
|
||||
br i1 %cmp, label %bb1, label %bb2
|
||||
bb1:
|
||||
%resadd = add nsw i32 %dpp, %x
|
||||
br label %bb3
|
||||
bb2:
|
||||
%ressub = sub nsw i32 2, %dpp ; break seq
|
||||
br label %bb3
|
||||
bb3:
|
||||
%res = phi i32 [%resadd, %bb1], [%ressub, %bb2]
|
||||
store i32 %res, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
declare i32 @llvm.amdgcn.workitem.id.y()
|
||||
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
|
||||
|
||||
attributes #0 = { nounwind readnone convergent }
|
|
@ -0,0 +1,143 @@
|
|||
# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=gcn-dpp-combine -o - %s | FileCheck %s
|
||||
|
||||
# test if $old definition is correctly tracked through subreg manipulation pseudos
|
||||
|
||||
---
|
||||
# CHECK-LABEL: name: mul_old_subreg
|
||||
# CHECK: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
|
||||
|
||||
name: mul_old_subreg
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vreg_64 }
|
||||
- { id: 1, class: vgpr_32 }
|
||||
- { id: 2, class: vgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: vreg_64 }
|
||||
- { id: 5, class: vreg_64 }
|
||||
- { id: 6, class: vgpr_32 }
|
||||
- { id: 7, class: vgpr_32 }
|
||||
|
||||
liveins:
|
||||
- { reg: '$vgpr0', virtual-reg: '%0' }
|
||||
- { reg: '$vgpr1', virtual-reg: '%1' }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
|
||||
%3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
|
||||
%4 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
|
||||
%5 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4
|
||||
%6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec
|
||||
%7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec
|
||||
...
|
||||
|
||||
# CHECK-LABEL: name: add_old_subreg
|
||||
# CHECK: [[OLD:\%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
# CHECK: %5:vgpr_32 = V_ADD_U32_dpp [[OLD]], %1, %0.sub1, 1, 1, 1, 1, implicit $exec
|
||||
|
||||
name: add_old_subreg
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vreg_64 }
|
||||
- { id: 1, class: vgpr_32 }
|
||||
- { id: 2, class: vgpr_32 }
|
||||
- { id: 3, class: vreg_64 }
|
||||
- { id: 4, class: vgpr_32 }
|
||||
- { id: 5, class: vgpr_32 }
|
||||
|
||||
liveins:
|
||||
- { reg: '$vgpr0', virtual-reg: '%0' }
|
||||
- { reg: '$vgpr1', virtual-reg: '%1' }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted
|
||||
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
|
||||
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
|
||||
...
|
||||
|
||||
# CHECK-LABEL: name: add_old_subreg_undef
|
||||
# CHECK: %5:vgpr_32 = V_ADD_U32_dpp %3.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec
|
||||
|
||||
name: add_old_subreg_undef
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vreg_64 }
|
||||
- { id: 1, class: vgpr_32 }
|
||||
- { id: 2, class: vgpr_32 }
|
||||
- { id: 3, class: vreg_64 }
|
||||
- { id: 4, class: vgpr_32 }
|
||||
- { id: 5, class: vgpr_32 }
|
||||
|
||||
liveins:
|
||||
- { reg: '$vgpr0', virtual-reg: '%0' }
|
||||
- { reg: '$vgpr1', virtual-reg: '%1' }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vreg_64 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef
|
||||
%4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec
|
||||
%5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec
|
||||
...
|
||||
|
||||
# CHECK-LABEL: name: add_f32_e64
|
||||
# CHECK: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
|
||||
# CHECK: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
|
||||
# CHECK: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 1, 1, 1, implicit $exec
|
||||
# CHECK: %7:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 1, 1, 1, implicit $exec
|
||||
# CHECK: %9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
|
||||
|
||||
name: add_f32_e64
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vgpr_32 }
|
||||
- { id: 1, class: vgpr_32 }
|
||||
- { id: 2, class: vgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: vgpr_32 }
|
||||
- { id: 5, class: vgpr_32 }
|
||||
- { id: 6, class: vgpr_32 }
|
||||
- { id: 7, class: vgpr_32 }
|
||||
- { id: 8, class: vgpr_32 }
|
||||
- { id: 9, class: vgpr_32 }
|
||||
|
||||
liveins:
|
||||
- { reg: '$vgpr0', virtual-reg: '%0' }
|
||||
- { reg: '$vgpr1', virtual-reg: '%1' }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1
|
||||
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:vgpr_32 = COPY $vgpr1
|
||||
%2:vgpr_32 = IMPLICIT_DEF
|
||||
%3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
|
||||
|
||||
; this shouldn't be combined as omod is set
|
||||
%4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
|
||||
|
||||
%5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
|
||||
|
||||
; this should be combined as all modifiers are default
|
||||
%6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec
|
||||
|
||||
; this should be combined as modifiers other than abs|neg are default
|
||||
%7:vgpr_32 = V_ADD_F32_e64 1, %5, 2, %0, 0, 0, implicit $exec
|
||||
|
||||
%8:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 1, 1, 1, implicit $exec
|
||||
|
||||
; this shouldn't be combined as modifiers aren't abs|neg
|
||||
%9:vgpr_32 = V_ADD_F32_e64 4, %8, 8, %0, 0, 0, implicit $exec
|
||||
...
|
|
@ -116,7 +116,6 @@ v_add_f32 v0, |v0|, -v0 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// NOSICI: error:
|
||||
// VI9: v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x00,0x00,0x7e,0x00,0x01,0x09,0xa1]
|
||||
v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
|
||||
|
||||
// NOSICI: error:
|
||||
|
|
Loading…
Reference in New Issue