From 5fddf091879e195d1f3ac83e8384ec1ff2882c23 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 29 Mar 2019 03:54:56 +0000
Subject: [PATCH] AMDGPU/GlobalISel: Insert waterfall loop for vector indexing

The register index can only really be an SGPR. Lie that a VGPR index
is legal, and then rewrite the instruction in a waterfall loop to
handle the index.

llvm-svn: 357235
---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  | 170 ++++++++++++++++++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.h    |   4 +
 .../regbankselect-extract-vector-elt.mir      | 103 +++++++++--
 3 files changed, 265 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 42742a649b36..24e0a03667d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -13,9 +13,11 @@
 #include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -328,6 +330,170 @@ static LLT getHalfSizedType(LLT Ty) {
   return LLT::scalar(Ty.getSizeInBits() / 2);
 }
 
+/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
+/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
+/// execute the instruction for each unique combination of values in all lanes
+/// in the wave. The block will be split such that the rest of the instructions
+/// are moved to a new block.
+void AMDGPURegisterBankInfo::executeInWaterfallLoop(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  ArrayRef<unsigned> OpIndices) const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineBasicBlock::iterator I(MI);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  assert(OpIndices.size() == 1 &&
+         "need to implement support for multiple operands");
+
+  // Use a set to avoid extra readfirstlanes in the case where multiple
+  // operands are the same register.
+  SmallSet<unsigned, 4> SGPROperandRegs;
+  for (unsigned Op : OpIndices) {
+    assert(MI.getOperand(Op).isUse());
+    unsigned Reg = MI.getOperand(Op).getReg();
+    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+      SGPROperandRegs.insert(Reg);
+  }
+
+  // No operands need to be replaced, so no need to loop.
+  if (SGPROperandRegs.empty())
+    return;
+
+  MachineIRBuilder B(MI);
+  SmallVector<unsigned, 4> ResultRegs;
+  SmallVector<unsigned, 4> InitResultRegs;
+  SmallVector<unsigned, 4> PhiRegs;
+  for (MachineOperand &Def : MI.defs()) {
+    LLT ResTy = MRI.getType(Def.getReg());
+    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+    ResultRegs.push_back(Def.getReg());
+    unsigned InitReg = B.buildUndef(ResTy).getReg(0);
+    unsigned PhiReg = MRI.createGenericVirtualRegister(ResTy);
+    InitResultRegs.push_back(InitReg);
+    PhiRegs.push_back(PhiReg);
+    MRI.setRegBank(PhiReg, *DefBank);
+    MRI.setRegBank(InitReg, *DefBank);
+  }
+
+  unsigned SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  // Don't bother using generic instructions/registers for the exec mask.
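+  // (EXEC is a fixed physical register, so the save/restore and mask updates
+  // can be emitted directly in their final selected form.)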
+  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
+    .addDef(InitSaveExecReg);
+
+  // Save the EXEC mask
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
+    .addReg(AMDGPU::EXEC);
+
+  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  // To insert the loop we need to split the block. Move everything before this
+  // point to a new block, and insert a new empty block before this instruction.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RestoreExecBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(RestoreExecBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+  MBB.addSuccessor(LoopBB);
+  RestoreExecBB->addSuccessor(RemainderBB);
+
+  B.setInsertPt(*LoopBB, LoopBB->end());
+
+  B.buildInstr(TargetOpcode::PHI)
+    .addDef(PhiExec)
+    .addReg(InitSaveExecReg)
+    .addMBB(&MBB)
+    .addReg(NewExec)
+    .addMBB(LoopBB);
+
+  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
+    B.buildInstr(TargetOpcode::G_PHI)
+      .addDef(std::get<2>(Result))
+      .addReg(std::get<0>(Result)) // Initial value / implicit_def
+      .addMBB(&MBB)
+      .addReg(std::get<1>(Result)) // Mid-loop value.
+      .addMBB(LoopBB);
+  }
+
+  // Move the instruction into the loop.
+  LoopBB->splice(LoopBB->end(), &MBB, I);
+  I = std::prev(LoopBB->end());
+
+  for (MachineOperand &Op : MI.uses()) {
+    if (!Op.isReg())
+      continue;
+
+    assert(!Op.isDef());
+    if (SGPROperandRegs.count(Op.getReg())) {
+      unsigned CurrentLaneOpReg
+        = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      MRI.setType(CurrentLaneOpReg, LLT::scalar(32)); // FIXME
+
+      assert(MRI.getType(Op.getReg()) == LLT::scalar(32) &&
+             "need to implement support for other types");
+
+      constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+
+      // Read the next variant <- also loop target.
+      BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+              CurrentLaneOpReg)
+        .addReg(Op.getReg());
+
+      // FIXME: Need to AND each condition together once multiple operands are
+      // supported.
+
+      // Compare the just-read SGPR value to all possible operand values.
+      B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+        .addDef(CondReg)
+        .addReg(CurrentLaneOpReg)
+        .addReg(Op.getReg());
+      Op.setReg(CurrentLaneOpReg);
+    }
+  }
+
+  // Update EXEC, saving the original EXEC value to NewExec.
+  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
+    .addDef(NewExec)
+    .addReg(CondReg, RegState::Kill);
+
+  MRI.setSimpleHint(NewExec, CondReg);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  B.buildInstr(AMDGPU::S_XOR_B64_term)
+    .addDef(AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(NewExec);
+
+  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+  // s_cbranch_scc0?
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
+    .addMBB(LoopBB);
+
+  // Restore the EXEC mask
+  B.buildInstr(AMDGPU::S_MOV_B64_term)
+    .addDef(AMDGPU::EXEC)
+    .addReg(SaveExecReg);
+}
+
 void AMDGPURegisterBankInfo::applyMappingImpl(
     const OperandsMapper &OpdMapper) const {
   MachineInstr &MI = OpdMapper.getMI();
@@ -436,6 +602,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     MI.eraseFromParent();
     return;
   }
+  case AMDGPU::G_EXTRACT_VECTOR_ELT:
+    applyDefaultMapping(OpdMapper);
+    executeInWaterfallLoop(MI, MRI, { 2 });
+    return;
   default:
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 18c15cd5efe7..6000943b3aaf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -37,6 +37,10 @@ protected:
 class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
   const SIRegisterInfo *TRI;
 
+  void executeInWaterfallLoop(MachineInstr &MI,
+                              MachineRegisterInfo &MRI,
+                              ArrayRef<unsigned> OpIndices) const;
+
   /// See RegisterBankInfo::applyMapping.
   void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
index b7dadd8367f3..4a2f9f6780df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
@@ -3,13 +3,15 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
 
 ---
-name: extract_vector_elt_v16i32_ss
+name: extract_vector_elt_v16s32_ss
 legalized: true
+tracksRegLiveness: true
 
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16
-    ; CHECK-LABEL: name: extract_vector_elt_v16i32_ss
+    ; CHECK-LABEL: name: extract_vector_elt_v16s32_ss
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16
     ; CHECK: [[EVEC:%[0-9]+]]:sgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32)
@@ -21,17 +23,36 @@ body: |
 ...
--- -name: extract_vector_elt_v16i32_sv +name: extract_vector_elt_v16s32_sv legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 - ; CHECK-LABEL: name: extract_vector_elt_v16i32_sv + ; CHECK-LABEL: name: extract_vector_elt_v16s32_sv + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<16 x s32>) = COPY [[COPY]](<16 x s32>) - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<16 x s32>), [[COPY1]](s32) + ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: .3: ; CHECK: $vgpr0 = COPY [[EVEC]](s32) %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %1:_(s32) = COPY $vgpr0 @@ -40,13 +61,15 @@ body: | ... --- -name: extract_vector_elt_v16i32_vs +name: extract_vector_elt_v16s32_vs legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 - ; CHECK-LABEL: name: extract_vector_elt_v16i32_vs + ; CHECK-LABEL: name: extract_vector_elt_v16s32_vs + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) @@ -58,19 +81,75 @@ body: | ... 
--- -name: extract_vector_elt_v16i32_vv +name: extract_vector_elt_v16s32_vv legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 - ; CHECK-LABEL: name: extract_vector_elt_v16i32_vv + ; CHECK-LABEL: name: extract_vector_elt_v16s32_vv + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: .3: ; CHECK: $vgpr0 = COPY [[EVEC]](s32) %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1 $vgpr0 = COPY %2 ... 
+
+---
+name: extract_vector_elt_v8s64_vv
+legalized: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
+    ; CHECK-LABEL: name: extract_vector_elt_v8s64_vv
+    ; CHECK: successors: %bb.1(0x80000000)
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr(s64) = G_IMPLICIT_DEF
+    ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+    ; CHECK: .1:
+    ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1
+    ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s64) = G_PHI [[DEF]](s64), %bb.0, %2(s64), %bb.1
+    ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
+    ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[V_READFIRSTLANE_B32_]](s32)
+    ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
+    ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+    ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+    ; CHECK: .2:
+    ; CHECK: successors: %bb.3(0x80000000)
+    ; CHECK: .3:
+    ; CHECK: $vgpr0_vgpr1 = COPY [[EVEC]](s64)
+    %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    %1:_(s32) = COPY $vgpr16
+    %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
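
--
A note for readers, not part of the patch itself: the emitted waterfall loop is
easier to follow as straight-line code. The sketch below is a conceptual
per-wave model of what the loop computes for G_EXTRACT_VECTOR_ELT with a
divergent index. The function and variable names, the fixed 64-lane wave, and
the __builtin_ctzll compiler builtin are illustrative assumptions, not anything
the patch defines.

// waterfall_sketch.cpp - conceptual model of the waterfall loop, not real
// codegen. Builds with clang++/g++ (uses __builtin_ctzll).
#include <array>
#include <cstdint>

constexpr int WaveSize = 64; // wave64, matching the 64-bit EXEC mask above

// Each active lane wants Vec[Idx[lane]], but the underlying indexed move needs
// a wave-uniform (SGPR) index. Per iteration: take the index from the first
// active lane (v_readfirstlane_b32), find every lane sharing that index
// (v_cmp_eq_u32), produce the result for those lanes, then clear them from
// EXEC (s_and_saveexec_b64 + s_xor_b64) and loop while any lanes remain
// (s_cbranch_execnz). EXEC is restored on exit (s_mov_b64_term).
void waterfallExtract(const std::array<int, 16> &Vec,
                      const std::array<int, WaveSize> &Idx,
                      std::array<int, WaveSize> &Out, uint64_t &Exec) {
  const uint64_t SavedExec = Exec; // save EXEC on entry
  while (Exec != 0) {
    const int ScalarIdx = Idx[__builtin_ctzll(Exec)]; // readfirstlane
    uint64_t Cond = 0;
    for (int L = 0; L < WaveSize; ++L) // compare against every active lane
      if (((Exec >> L) & 1) && Idx[L] == ScalarIdx)
        Cond |= 1ull << L;
    for (int L = 0; L < WaveSize; ++L) // the extract, for matching lanes
      if ((Cond >> L) & 1)
        Out[L] = Vec[ScalarIdx];
    Exec &= ~Cond; // mask off the serviced lanes; loop if any remain
  }
  Exec = SavedExec; // restore EXEC
}

One iteration runs per distinct index value present in the wave, so a uniform
index costs a single pass and a fully divergent index degrades to one pass per
lane.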