[AMDGPU] SDWA Peephole: improve search for immediates in SDWA patterns

Previously, the compiler often extracted common immediates into a specific register, e.g.:
```
%vreg0 = S_MOV_B32 0xff;
%vreg2 = V_AND_B32_e32 %vreg0, %vreg1
%vreg4 = V_AND_B32_e32 %vreg0, %vreg3
```
Because of this, the SDWA peephole failed to find SDWA-convertible patterns. E.g., the example above could be converted into 2 SDWA src operands:
```
SDWA src: %vreg2 src_sel:BYTE_0
SDWA src: %vreg4 src_sel:BYTE_0
```
With this change, the peephole checks whether an operand is either an immediate or a register that is a copy of an immediate.
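
For illustration, here is a minimal, self-contained C++17 sketch of the idea behind the new `foldToImm` helper. All types below are invented stand-ins; the real pass works on `MachineOperand`/`MachineInstr` via `MachineRegisterInfo::def_operands`, uses `llvm::Optional`, and its `isFoldableCopy` check also accepts `V_MOV_B32`/`COPY` moves and requires the def to be in the same basic block:
```
#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <vector>

// Invented stand-in for MachineOperand: either an immediate or a vreg.
struct Operand {
  bool IsImm;
  int64_t Imm; // valid when IsImm
  int Reg;     // virtual register number when !IsImm
  static Operand MakeImm(int64_t V) { return {true, V, -1}; }
  static Operand MakeReg(int R) { return {false, 0, R}; }
};

enum class Opcode { S_MOV_B32, V_AND_B32 };

struct Instr {
  Opcode Opc;
  std::vector<Operand> Ops; // Ops[0] = def, Ops[1..] = uses
};

// Stand-in for MRI->def_operands(): vreg -> its single defining
// instruction (machine SSA has one def per virtual register).
using DefMap = std::map<int, const Instr *>;

// Resolve an operand to an immediate, either directly or through a
// foldable copy such as %vreg0 = S_MOV_B32 0xff.
std::optional<int64_t> foldToImm(const Operand &Op, const DefMap &Defs) {
  if (Op.IsImm)
    return Op.Imm;
  auto It = Defs.find(Op.Reg);
  if (It == Defs.end() || It->second->Opc != Opcode::S_MOV_B32)
    return std::nullopt; // not defined by a foldable copy
  const Operand &Copied = It->second->Ops[1];
  if (!Copied.IsImm)
    return std::nullopt; // copy of another register, not an immediate
  return Copied.Imm;
}

int main() {
  // %vreg0 = S_MOV_B32 0xff
  Instr Mov{Opcode::S_MOV_B32, {Operand::MakeReg(0), Operand::MakeImm(0xff)}};
  DefMap Defs{{0, &Mov}};
  // src0 of "%vreg2 = V_AND_B32_e32 %vreg0, %vreg1" now folds to 0xff,
  // so the AND is still recognizable as an SDWA src_sel:BYTE_0 pattern.
  if (auto Imm = foldToImm(Operand::MakeReg(0), Defs))
    std::cout << "src0 folds to immediate 0x" << std::hex << *Imm << '\n';
}
```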

llvm-svn: 299202
Sam Kolton 2017-03-31 11:42:43 +00:00
parent 37b536e4b3
commit 27e0f8bc72
5 changed files with 112 additions and 68 deletions

lib/Target/AMDGPU/SIFoldOperands.cpp:

```
@@ -139,27 +139,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
 
-static bool isFoldableCopy(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  case AMDGPU::V_MOV_B32_e32:
-  case AMDGPU::V_MOV_B32_e64:
-  case AMDGPU::V_MOV_B64_PSEUDO: {
-    // If there are additional implicit register operands, this may be used for
-    // register indexing so the source register operand isn't simply copied.
-    unsigned NumOps = MI.getDesc().getNumOperands() +
-      MI.getDesc().getNumImplicitUses();
-
-    return MI.getNumOperands() == NumOps;
-  }
-  case AMDGPU::S_MOV_B32:
-  case AMDGPU::S_MOV_B64:
-  case AMDGPU::COPY:
-    return true;
-  default:
-    return false;
-  }
-}
-
 static bool updateOperand(FoldCandidate &Fold,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
@@ -936,7 +915,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       tryFoldInst(TII, &MI);
 
-      if (!isFoldableCopy(MI)) {
+      if (!TII->isFoldableCopy(MI)) {
        if (IsIEEEMode || !tryFoldOMod(MI))
          tryFoldClamp(MI);
        continue;
```

lib/Target/AMDGPU/SIInstrInfo.cpp:

```
@@ -1488,6 +1488,27 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
   }
 }
 
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_B64_PSEUDO: {
+    // If there are additional implicit register operands, this may be used for
+    // register indexing so the source register operand isn't simply copied.
+    unsigned NumOps = MI.getDesc().getNumOperands() +
+      MI.getDesc().getNumImplicitUses();
+
+    return MI.getNumOperands() == NumOps;
+  }
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::COPY:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static void removeModOperands(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
```

lib/Target/AMDGPU/SIInstrInfo.h:

```
@@ -222,6 +222,8 @@ public:
   areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
                                   AliasAnalysis *AA = nullptr) const override;
 
+  bool isFoldableCopy(const MachineInstr &MI) const;
+
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
                      MachineRegisterInfo *MRI) const final;
```

lib/Target/AMDGPU/SIPeepholeSDWA.cpp:

```
@@ -51,6 +51,8 @@ private:
   std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
 
+  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
+
 public:
   static char ID;
@@ -375,6 +377,33 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   return true;
 }
 
+Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
+  if (Op.isImm()) {
+    return Op.getImm();
+  }
+
+  // If this is not immediate then it can be copy of immediate value, e.g.:
+  // %vreg1<def> = S_MOV_B32 255;
+  if (Op.isReg()) {
+    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
+      if (!isSameReg(Op, Def))
+        continue;
+
+      const MachineInstr *DefInst = Def.getParent();
+      if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst))
+        return None;
+
+      const MachineOperand &Copied = DefInst->getOperand(1);
+      if (!Copied.isImm())
+        return None;
+
+      return Copied.getImm();
+    }
+  }
+
+  return None;
+}
+
 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
   for (MachineInstr &MI : MBB) {
     unsigned Opcode = MI.getOpcode();
@@ -391,11 +420,11 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
       // from: v_lshlrev_b32_e32 v1, 16/24, v0
       // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
 
       MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-      if (!Src0->isImm())
+      auto Imm = foldToImm(*Src0);
+      if (!Imm)
         break;
 
-      int64_t Imm = Src0->getImm();
-      if (Imm != 16 && Imm != 24)
+      if (*Imm != 16 && *Imm != 24)
         break;
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -406,13 +435,13 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
 
       if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
         auto SDWADst = make_unique<SDWADstOperand>(
-          Dst, Src1, Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
         DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
         SDWAOperands[&MI] = std::move(SDWADst);
         ++NumSDWAPatternsFound;
       } else {
         auto SDWASrc = make_unique<SDWASrcOperand>(
-          Src1, Dst, Imm == 16 ? WORD_1 : BYTE_3, false, false,
+          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
           Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
         DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
         SDWAOperands[&MI] = std::move(SDWASrc);
@@ -433,7 +462,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
       // from: v_lshlrev_b16_e32 v1, 8, v0
       // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
 
       MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-      if (!Src0->isImm() || Src0->getImm() != 8)
+      auto Imm = foldToImm(*Src0);
+      if (!Imm || *Imm != 8)
         break;
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -477,30 +507,30 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
       //    24   |   8   |  BYTE_3
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-      if (!Src1->isImm())
+      auto Offset = foldToImm(*Src1);
+      if (!Offset)
         break;
-      int64_t Offset = Src1->getImm();
 
       MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-      if (!Src2->isImm())
+      auto Width = foldToImm(*Src2);
+      if (!Width)
         break;
-      int64_t Width = Src2->getImm();
 
       SdwaSel SrcSel = DWORD;
 
-      if (Offset == 0 && Width == 8)
+      if (*Offset == 0 && *Width == 8)
         SrcSel = BYTE_0;
-      else if (Offset == 0 && Width == 16)
+      else if (*Offset == 0 && *Width == 16)
         SrcSel = WORD_0;
-      else if (Offset == 0 && Width == 32)
+      else if (*Offset == 0 && *Width == 32)
         SrcSel = DWORD;
-      else if (Offset == 8 && Width == 8)
+      else if (*Offset == 8 && *Width == 8)
         SrcSel = BYTE_1;
-      else if (Offset == 16 && Width == 8)
+      else if (*Offset == 16 && *Width == 8)
         SrcSel = BYTE_2;
-      else if (Offset == 16 && Width == 16)
+      else if (*Offset == 16 && *Width == 16)
         SrcSel = WORD_1;
-      else if (Offset == 24 && Width == 8)
+      else if (*Offset == 24 && *Width == 8)
         SrcSel = BYTE_3;
       else
         break;
@@ -526,11 +556,11 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
       // to SDWA src:v0 src_sel:WORD_0/BYTE_0
 
       MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-      if (!Src0->isImm())
+      auto Imm = foldToImm(*Src0);
+      if (!Imm)
         break;
 
-      int64_t Imm = Src0->getImm();
-      if (Imm != 0x0000ffff && Imm != 0x000000ff)
+      if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
         break;
 
       MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -541,7 +571,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
         break;
 
       auto SDWASrc = make_unique<SDWASrcOperand>(
-        Src1, Dst, Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+        Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
       DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
       SDWAOperands[&MI] = std::move(SDWASrc);
       ++NumSDWAPatternsFound;
```
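
The `v_bfe_u32` hunk above encodes a fixed (offset, width) → `src_sel` table. As a hedged restatement of that mapping (the helper name is invented, not part of the pass): a bit-field extract is only expressible as an SDWA source select when it reads a whole, aligned byte, word, or dword lane:

```
#include <cstdint>
#include <optional>

enum SdwaSel { BYTE_0, BYTE_1, BYTE_2, BYTE_3, WORD_0, WORD_1, DWORD };

// Maps v_bfe_u32's bit offset/width to an SDWA source select, mirroring
// the if/else chain in the hunk above; nullopt means "not an SDWA lane".
std::optional<SdwaSel> bfeToSrcSel(int64_t Offset, int64_t Width) {
  if (Offset == 0 && Width == 8)   return BYTE_0;
  if (Offset == 0 && Width == 16)  return WORD_0;
  if (Offset == 0 && Width == 32)  return DWORD;
  if (Offset == 8 && Width == 8)   return BYTE_1;
  if (Offset == 16 && Width == 8)  return BYTE_2;
  if (Offset == 16 && Width == 16) return WORD_1;
  if (Offset == 24 && Width == 8)  return BYTE_3;
  return std::nullopt; // e.g. offset 4 or width 24 has no SDWA encoding
}
```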

test/CodeGen/AMDGPU/sdwa-peephole.ll:

```
@@ -72,8 +72,9 @@ entry:
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
 entry:
@@ -92,10 +93,12 @@ entry:
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
 entry:
@@ -114,14 +117,18 @@ entry:
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}}
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
 entry:
@@ -155,7 +162,9 @@ entry:
 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
 ; NOSDWA-NOT: v_mul_f16_sdwa
 
-; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
 entry:
@@ -176,7 +185,8 @@ entry:
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
 entry:
@@ -199,10 +209,10 @@ entry:
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
 entry:
@@ -347,26 +357,28 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mul_add_v2i16:
+; GCN-LABEL: {{^}}mul_add_shr_i32:
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
 ; NOSDWA-NOT: v_add_i32_sdwa
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 ; SDWA-NOT: v_add_i32_sdwa
 
-define amdgpu_kernel void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) {
+define void @mul_add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ina, i32 addrspace(1)* %inb, i1 addrspace(1)* %incond) {
 entry:
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %a = load i32, i32 addrspace(1)* %ina, align 4
+  %b = load i32, i32 addrspace(1)* %inb, align 4
   %cond = load i1, i1 addrspace(1)* %incond, align 4
+  %shra = lshr i32 %a, 16
+  %shrb = lshr i32 %b, 16
   br i1 %cond, label %mul_label, label %add_label
 mul_label:
-  %mul = mul <2 x i16> %a, %b
+  %mul = mul i32 %shra, %shrb
   br label %store_label
 add_label:
-  %add = add <2 x i16> %a, %b
+  %add = add i32 %shra, %shrb
   br label %store_label
 store_label:
-  %store = phi <2 x i16> [%mul, %mul_label], [%add, %add_label]
-  store <2 x i16> %store, <2 x i16> addrspace(1)* %out, align 4
+  %store = phi i32 [%mul, %mul_label], [%add, %add_label]
+  store i32 %store, i32 addrspace(1)* %out, align 4
   ret void
 }
```