[MIR] Add simple PRE pass to MachineCSE

This is the second part of the commit fixing PR38917 (hoisting
partially redundant machine instructions). Most of the PRE (partial
redundancy elimination) and CSE work is done on LLVM IR, but some
redundancy arises during DAG legalization, and Machine CSE alone is
not enough to deal with it. This simple PRE implementation works
somewhat indirectly: it runs before CSE, looking for partial redundancy
and transforming it into full redundancy, anticipating that the next
CSE step will eliminate this created redundancy. If CSE doesn't
eliminate it, the created instruction remains dead and is eliminated
later by the Remove Dead Machine Instructions pass.
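
For illustration, here is a hypothetical MIR-like sketch (not part of this
commit; the opcode and register numbers are made up). The same computation
appears in bb.1 and bb.2, which don't dominate each other but share the
dominator bb.0:

    bb.0:                            ; nearest common dominator
      ; PRE inserts a hoisted copy:  %5:gr32 = ADD32rr %0, %1
      ...
    bb.1:
      %2:gr32 = ADD32rr %0, %1       ; now fully redundant with %5
      ...
    bb.2:
      %3:gr32 = ADD32rr %0, %1       ; now fully redundant with %5
      ...

Once the copy sits in bb.0, the following CSE sweep can rewrite the uses of
%2 and %3 to %5; if it doesn't, the hoisted copy is simply dead and is
cleaned up by Remove Dead Machine Instructions.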

The third part of this work is intended to refactor MachineCSE,
to make it clearer and to merge MachinePRE with MachineCSE,
so that one need not rely on the later Remove Dead pass to clean up
instructions not eliminated by CSE.

First step: https://reviews.llvm.org/D54839

Fixes llvm.org/PR38917

llvm-svn: 361356
Anton Afanasyev 2019-05-22 07:41:34 +00:00
parent 1c61471ab1
commit df00c6a54f
9 changed files with 1520 additions and 2093 deletions


@ -19,6 +19,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
@ -49,6 +50,8 @@ using namespace llvm;
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumCSEs, "Number of common subexpression eliminated");
STATISTIC(NumPREs, "Number of partially redundant expressions"
" transformed to fully redundant");
STATISTIC(NumPhysCSEs,
"Number of physreg referencing common subexpr eliminated");
STATISTIC(NumCrossBBCSEs,
@ -84,6 +87,7 @@ namespace {
void releaseMemory() override {
ScopeMap.clear();
PREMap.clear();
Exps.clear();
}
@ -98,6 +102,7 @@ namespace {
unsigned LookAheadLimit = 0;
DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap;
DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait> PREMap;
ScopedHTType VNT;
SmallVector<MachineInstr *, 64> Exps;
unsigned CurrVN = 0;
@ -116,13 +121,17 @@ namespace {
PhysDefVector &PhysDefs, bool &NonLocal) const;
bool isCSECandidate(MachineInstr *MI);
bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
MachineInstr *CSMI, MachineInstr *MI);
MachineBasicBlock *CSBB, MachineInstr *MI);
void EnterScope(MachineBasicBlock *MBB);
void ExitScope(MachineBasicBlock *MBB);
bool ProcessBlock(MachineBasicBlock *MBB);
bool ProcessBlockCSE(MachineBasicBlock *MBB);
void ExitScopeIfDone(MachineDomTreeNode *Node,
DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
bool PerformCSE(MachineDomTreeNode *Node);
bool isPRECandidate(MachineInstr *MI);
bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
bool PerformSimplePRE(MachineDominatorTree *DT);
};
} // end anonymous namespace
@ -405,9 +414,10 @@ bool MachineCSE::isCSECandidate(MachineInstr *MI) {
}
/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
/// common expression that defines Reg.
/// common expression that defines Reg. CSBB is the basic block where CSReg is
/// defined.
bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
MachineInstr *CSMI, MachineInstr *MI) {
MachineBasicBlock *CSBB, MachineInstr *MI) {
// FIXME: Heuristics that works around the lack the live range splitting.
// If CSReg is used at all uses of Reg, CSE should not increase register
@ -433,7 +443,6 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
// an immediate predecessor. We don't want to increase register pressure and
// end up causing other computation to be spilled.
if (TII->isAsCheapAsAMove(*MI)) {
MachineBasicBlock *CSBB = CSMI->getParent();
MachineBasicBlock *BB = MI->getParent();
if (CSBB != BB && !CSBB->isSuccessor(BB))
return false;
@ -488,7 +497,7 @@ void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
ScopeMap.erase(SI);
}
bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
bool Changed = false;
SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
@ -598,7 +607,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
TargetRegisterInfo::isVirtualRegister(NewReg) &&
"Do not CSE physical register defs!");
if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) {
LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
DoCSE = false;
break;
@ -738,7 +747,7 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
for (MachineDomTreeNode *Node : Scopes) {
MachineBasicBlock *MBB = Node->getBlock();
EnterScope(MBB);
Changed |= ProcessBlock(MBB);
Changed |= ProcessBlockCSE(MBB);
// If it's a leaf node, it's done. Traverse upwards to pop ancestors.
ExitScopeIfDone(Node, OpenChildren);
}
@ -746,6 +755,98 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
return Changed;
}
// We use stronger checks for PRE candidates than for CSE ones to cover the
// checks inside ProcessBlockCSE(), not only those inside isCSECandidate(). This
// helps to exclude instructions created by PRE that won't be CSEed later.
bool MachineCSE::isPRECandidate(MachineInstr *MI) {
if (!isCSECandidate(MI) ||
MI->isNotDuplicable() ||
MI->isAsCheapAsAMove() ||
MI->getNumDefs() != 1 ||
MI->getNumExplicitDefs() != 1)
return false;
for (auto def: MI->defs())
if (!TRI->isVirtualRegister(def.getReg()))
return false;
for (auto use: MI->uses())
if (use.isReg() && !TRI->isVirtualRegister(use.getReg()))
return false;
return true;
}
bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT, MachineBasicBlock *MBB) {
bool Changed = false;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
MachineInstr *MI = &*I;
++I;
if (!isPRECandidate(MI))
continue;
if (!PREMap.count(MI)) {
PREMap[MI] = MBB;
continue;
}
auto MBB1 = PREMap[MI];
assert(!DT->properlyDominates(MBB, MBB1) &&
"MBB cannot properly dominate MBB1 during DFS through the dominator tree!");
auto CMBB = DT->findNearestCommonDominator(MBB, MBB1);
// Two instructions are partially redundant if their basic blocks are reachable
// from one to the other, but one doesn't dominate the other.
if (CMBB != MBB1) {
auto BB = MBB->getBasicBlock(), BB1 = MBB1->getBasicBlock();
if (BB != nullptr && BB1 != nullptr &&
(isPotentiallyReachable(BB1, BB) ||
isPotentiallyReachable(BB, BB1))) {
assert(MI->getOperand(0).isDef() &&
"First operand of instr with one explicit def must be this def");
unsigned VReg = MI->getOperand(0).getReg();
unsigned NewReg = MRI->cloneVirtualRegister(VReg);
if (!isProfitableToCSE(NewReg, VReg, CMBB, MI))
continue;
MachineInstr &NewMI = TII->duplicate(*CMBB, CMBB->getFirstTerminator(), *MI);
NewMI.getOperand(0).setReg(NewReg);
PREMap[MI] = CMBB;
++NumPREs;
Changed = true;
}
}
}
return Changed;
}
// This simple PRE (partial redundancy elimination) pass doesn't actually
// eliminate partial redundancy but transforms it into full redundancy,
// anticipating that the next CSE step will eliminate this created redundancy.
// If CSE doesn't eliminate it, the created instruction will remain dead and be
// eliminated later by the Remove Dead Machine Instructions pass.
bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
SmallVector<MachineDomTreeNode*, 32> BBs;
PREMap.clear();
bool Changed = false;
BBs.push_back(DT->getRootNode());
do {
auto Node = BBs.pop_back_val();
const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
for (MachineDomTreeNode *Child : Children)
BBs.push_back(Child);
MachineBasicBlock *MBB = Node->getBlock();
Changed |= ProcessBlockPRE(DT, MBB);
} while (!BBs.empty());
return Changed;
}
bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@ -756,5 +857,8 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DT = &getAnalysis<MachineDominatorTree>();
LookAheadLimit = TII->getMachineCSELookAheadLimit();
return PerformCSE(DT->getRootNode());
bool ChangedPRE, ChangedCSE;
ChangedPRE = PerformSimplePRE(DT);
ChangedCSE = PerformCSE(DT->getRootNode());
return ChangedPRE || ChangedCSE;
}


@ -27,8 +27,7 @@ if.then: ; preds = %entry
if.end: ; preds = %entry, %if.then
; CHECK: lw $[[R2:[0-9]+]], %got(sf2)
; CHECK: addiu ${{[0-9]+}}, $[[R2]], %lo(sf2)
; CHECK: lw $[[R3:[0-9]+]], %got(caller.sf1)
; CHECK: sw ${{[0-9]+}}, %lo(caller.sf1)($[[R3]])
; CHECK: sw ${{[0-9]+}}, %lo(caller.sf1)($[[R1]])
%tobool3 = icmp ne i32 %a0, 0
%tmp4 = load void (...)*, void (...)** @gf1, align 4
%cond = select i1 %tobool3, void (...)* %tmp4, void (...)* bitcast (void ()* @sf2 to void (...)*)


@ -236,18 +236,17 @@ define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i3
; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; NOGATHER-NEXT: .LBB4_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB4_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm3
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vmovq %xmm0, %rax
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
; NOGATHER-NEXT: .LBB4_6: # %else5
; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB4_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm2
; NOGATHER-NEXT: .LBB4_8: # %else8
@ -295,18 +294,17 @@ define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; NOGATHER-NEXT: .LBB5_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB5_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm3
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vmovq %xmm0, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; NOGATHER-NEXT: .LBB5_6: # %else5
; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB5_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; NOGATHER-NEXT: .LBB5_8: # %else8
@ -366,11 +364,11 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_4: # %else2
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
; NOGATHER-NEXT: vmovq %xmm4, %rax
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm1, %xmm4
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_6: # %else5
@ -378,7 +376,6 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm3
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
@ -402,11 +399,11 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_12: # %else14
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_14
; NOGATHER-NEXT: # %bb.13: # %cond.load16
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@ -415,8 +412,7 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_16
; NOGATHER-NEXT: # %bb.15: # %cond.load19
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@ -477,11 +473,11 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_4: # %else2
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
; NOGATHER-NEXT: vmovq %xmm4, %rax
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_6: # %else5
@ -489,7 +485,6 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],mem[0]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
@ -514,11 +509,11 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_12: # %else14
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_14
; NOGATHER-NEXT: # %bb.13: # %cond.load16
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@ -527,8 +522,7 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_16
; NOGATHER-NEXT: # %bb.15: # %cond.load19
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@ -583,11 +577,11 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i6
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB8_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB8_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm3, %xmm3
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@ -596,8 +590,7 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i6
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB8_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@ -652,11 +645,11 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks
; NOGATHER-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
; NOGATHER-NEXT: .LBB9_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB9_6
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
; NOGATHER-NEXT: vmovq %xmm3, %rax
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@ -665,8 +658,7 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB9_8
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1

File diff suppressed because it is too large.


@ -70,17 +70,16 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpextrb $8, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB0_6
; AVX1-NEXT: # %bb.5: # %cond.load4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovq %xmm3, %rax
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: .LBB0_6: # %else5
; AVX1-NEXT: vpextrb $12, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB0_8
; AVX1-NEXT: # %bb.7: # %cond.load7
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: .LBB0_8: # %else8
@ -111,17 +110,16 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpextrb $8, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB0_6
; AVX2-NEXT: # %bb.5: # %cond.load4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX2-NEXT: .LBB0_6: # %else5
; AVX2-NEXT: vpextrb $12, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB0_8
; AVX2-NEXT: # %bb.7: # %cond.load7
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX2-NEXT: .LBB0_8: # %else8
@ -227,17 +225,16 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpextrb $8, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB1_6
; AVX1-NEXT: # %bb.5: # %cond.load4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovq %xmm3, %rax
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: .LBB1_6: # %else5
; AVX1-NEXT: vpextrb $12, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB1_8
; AVX1-NEXT: # %bb.7: # %cond.load7
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: .LBB1_8: # %else8
@ -273,17 +270,16 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpextrb $8, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB1_6
; AVX2-NEXT: # %bb.5: # %cond.load4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX2-NEXT: .LBB1_6: # %else5
; AVX2-NEXT: vpextrb $12, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB1_8
; AVX2-NEXT: # %bb.7: # %cond.load7
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX2-NEXT: .LBB1_8: # %else8
@ -388,17 +384,16 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpextrb $8, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB2_6
; AVX1-NEXT: # %bb.5: # %cond.load4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovq %xmm3, %rax
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: .LBB2_6: # %else5
; AVX1-NEXT: vpextrb $12, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB2_8
; AVX1-NEXT: # %bb.7: # %cond.load7
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: .LBB2_8: # %else8
@ -433,17 +428,16 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpextrb $8, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB2_6
; AVX2-NEXT: # %bb.5: # %cond.load4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX2-NEXT: .LBB2_6: # %else5
; AVX2-NEXT: vpextrb $12, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB2_8
; AVX2-NEXT: # %bb.7: # %cond.load7
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX2-NEXT: .LBB2_8: # %else8
@ -662,15 +656,15 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vpinsrb $1, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_4: # %else2
; AVX1-NEXT: vpmovsxdq %xmm7, %xmm6
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm8
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
; AVX1-NEXT: vpextrb $2, %xmm7, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB3_6
; AVX1-NEXT: # %bb.5: # %cond.load4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vmovq %xmm5, %rax
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_6: # %else5
; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
@ -678,11 +672,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB3_8
; AVX1-NEXT: # %bb.7: # %cond.load7
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_8: # %else8
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpextrb $4, %xmm5, %eax
@ -702,15 +695,15 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vpinsrb $5, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_12: # %else14
; AVX1-NEXT: vpmovsxdq %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm8
; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm5
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
; AVX1-NEXT: vpextrb $6, %xmm7, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB3_14
; AVX1-NEXT: # %bb.13: # %cond.load16
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vmovq %xmm5, %rax
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_14: # %else17
; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
@ -718,12 +711,11 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB3_16
; AVX1-NEXT: # %bb.15: # %cond.load19
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_16: # %else20
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
; AVX1-NEXT: vpextrb $8, %xmm5, %eax
@ -748,10 +740,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
; AVX1-NEXT: vpextrb $10, %xmm6, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB3_22
; AVX1-NEXT: # %bb.21: # %cond.load28
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vmovq %xmm7, %rax
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_22: # %else29
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
@ -759,7 +751,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB3_24
; AVX1-NEXT: # %bb.23: # %cond.load31
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_24: # %else32
@ -784,17 +775,16 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpextrb $14, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: je .LBB3_30
; AVX1-NEXT: # %bb.29: # %cond.load40
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_30: # %else41
; AVX1-NEXT: vpextrb $15, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je .LBB3_32
; AVX1-NEXT: # %bb.31: # %cond.load43
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
; AVX1-NEXT: .LBB3_32: # %else44
@ -829,10 +819,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX2-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
; AVX2-NEXT: vpextrb $2, %xmm6, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX2-NEXT: je .LBB3_6
; AVX2-NEXT: # %bb.5: # %cond.load4
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-NEXT: vmovq %xmm7, %rax
; AVX2-NEXT: vmovq %xmm5, %rax
; AVX2-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_6: # %else5
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
@ -840,7 +830,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB3_8
; AVX2-NEXT: # %bb.7: # %cond.load7
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX2-NEXT: vpextrq $1, %xmm5, %rax
; AVX2-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_8: # %else8
@ -865,10 +854,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpextrb $6, %xmm5, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB3_14
; AVX2-NEXT: # %bb.13: # %cond.load16
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX2-NEXT: vmovq %xmm6, %rax
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_14: # %else17
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm6
@ -876,7 +865,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB3_16
; AVX2-NEXT: # %bb.15: # %cond.load19
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_16: # %else20
@ -902,10 +890,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
; AVX2-NEXT: vpextrb $10, %xmm5, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB3_22
; AVX2-NEXT: # %bb.21: # %cond.load28
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX2-NEXT: vmovq %xmm6, %rax
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_22: # %else29
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
@ -913,7 +901,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB3_24
; AVX2-NEXT: # %bb.23: # %cond.load31
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_24: # %else32
@ -938,17 +925,16 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpextrb $14, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: je .LBB3_30
; AVX2-NEXT: # %bb.29: # %cond.load40
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_30: # %else41
; AVX2-NEXT: vpextrb $15, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je .LBB3_32
; AVX2-NEXT: # %bb.31: # %cond.load43
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
; AVX2-NEXT: .LBB3_32: # %else44
@ -1009,9 +995,9 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: kshiftrw $4, %k0, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
; AVX512-NEXT: je .LBB3_10
; AVX512-NEXT: # %bb.9: # %cond.load10
; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
; AVX512-NEXT: vmovq %xmm5, %rax
; AVX512-NEXT: vpinsrb $4, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_10: # %else11
@ -1020,7 +1006,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: je .LBB3_12
; AVX512-NEXT: # %bb.11: # %cond.load13
; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
; AVX512-NEXT: vpextrq $1, %xmm5, %rax
; AVX512-NEXT: vpinsrb $5, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_12: # %else14
@ -1032,10 +1017,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: kshiftrw $6, %k0, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm4
; AVX512-NEXT: je .LBB3_14
; AVX512-NEXT: # %bb.13: # %cond.load16
; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm5
; AVX512-NEXT: vmovq %xmm5, %rax
; AVX512-NEXT: vmovq %xmm4, %rax
; AVX512-NEXT: vpinsrb $6, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_14: # %else17
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
@ -1044,7 +1029,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: je .LBB3_16
; AVX512-NEXT: # %bb.15: # %cond.load19
; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm4
; AVX512-NEXT: vpextrq $1, %xmm4, %rax
; AVX512-NEXT: vpinsrb $7, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_16: # %else20
@ -1098,9 +1082,9 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: kshiftrw $12, %k0, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: je .LBB3_26
; AVX512-NEXT: # %bb.25: # %cond.load34
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: vpinsrb $12, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_26: # %else35
@ -1109,7 +1093,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: je .LBB3_28
; AVX512-NEXT: # %bb.27: # %cond.load37
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
; AVX512-NEXT: vpinsrb $13, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_28: # %else38
@ -1120,10 +1103,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: kshiftrw $14, %k0, %k1
; AVX512-NEXT: kmovw %k1, %eax
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: je .LBB3_30
; AVX512-NEXT: # %bb.29: # %cond.load40
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vpinsrb $14, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_30: # %else41
; AVX512-NEXT: kshiftrw $15, %k0, %k0
@ -1131,7 +1114,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: je .LBB3_32
; AVX512-NEXT: # %bb.31: # %cond.load43
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vpinsrb $15, (%rax), %xmm2, %xmm2
; AVX512-NEXT: .LBB3_32: # %else44

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.