Allow some reloads to be folded in multi-use cases. Specifically testl r, r -> cmpl [mem], 0.

llvm-svn: 44479
This commit is contained in:
Evan Cheng 2007-12-01 02:07:52 +00:00
parent e62b441b51
commit 69fda0a716
13 changed files with 259 additions and 32 deletions

View File

@ -275,7 +275,8 @@ namespace llvm {
/// returns true.
bool tryFoldMemoryOperand(MachineInstr* &MI, VirtRegMap &vrm,
MachineInstr *DefMI, unsigned InstrIdx,
unsigned OpIdx, unsigned NumUses,
unsigned OpIdx,
SmallVector<unsigned, 2> &UseOps,
bool isSS, int Slot, unsigned Reg);
/// anyKillInMBBAfterIdx - Returns true if there is a kill of the specified

View File

@ -543,6 +543,14 @@ public:
return 0;
}
/// foldMemoryOperand - Same as previous except it tries to fold instruction
/// with multiple uses of the same register. UseOps holds the indices of the
/// operands of MI that use the register, and FrameIndex identifies the stack
/// slot to fold. This default implementation performs no folding and returns
/// 0; it is virtual so that a target can supply its own version.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
int FrameIndex) const {
// Default: nothing is folded.
return 0;
}
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
/// stack slot.
@ -552,6 +560,14 @@ public:
return 0;
}
/// foldMemoryOperand - Same as previous except it tries to fold instruction
/// with multiple uses of the same register. UseOps holds the indices of the
/// operands of MI that use the register, and LoadMI is the load instruction
/// to fold. This default implementation performs no folding and returns 0;
/// it is virtual so that a target can supply its own version.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
MachineInstr* LoadMI) const {
// Default: nothing is folded.
return 0;
}
/// getOpcodeAfterMemoryFold - Returns the opcode of the would be new
/// instruction after load / store is folded into an instruction of the
/// specified opcode. It returns zero if the specified unfolding is not

View File

@ -644,20 +644,27 @@ bool LiveIntervals::isReMaterializable(const LiveInterval &li,
bool LiveIntervals::tryFoldMemoryOperand(MachineInstr* &MI,
VirtRegMap &vrm, MachineInstr *DefMI,
unsigned InstrIdx, unsigned OpIdx,
unsigned NumUses,
SmallVector<unsigned, 2> &UseOps,
bool isSS, int Slot, unsigned Reg) {
// FIXME: fold subreg use
if (MI->getOperand(OpIdx).getSubReg())
return false;
// FIXME: It may be possible to fold load when there are multiple uses.
// e.g. On x86, TEST32rr r, r -> CMP32rm [mem], 0
if (NumUses > 1)
return false;
MachineInstr *fmi = NULL;
if (UseOps.size() < 2)
fmi = isSS ? mri_->foldMemoryOperand(MI, OpIdx, Slot)
: mri_->foldMemoryOperand(MI, OpIdx, DefMI);
else {
if (OpIdx != UseOps[0])
// Must be two-address instruction + one more use. Not going to fold.
return false;
// It may be possible to fold load when there are multiple uses.
// e.g. On x86, TEST32rr r, r -> CMP32rm [mem], 0
fmi = isSS ? mri_->foldMemoryOperand(MI, UseOps, Slot)
: mri_->foldMemoryOperand(MI, UseOps, DefMI);
}
MachineInstr *fmi = isSS
? mri_->foldMemoryOperand(MI, OpIdx, Slot)
: mri_->foldMemoryOperand(MI, OpIdx, DefMI);
if (fmi) {
// Attempt to fold the memory reference into the instruction. If
// we can do this, we don't need to insert spill code.
@ -768,7 +775,9 @@ rewriteInstructionForSpills(const LiveInterval &li, bool TrySplit,
HasUse = mop.isUse();
HasDef = mop.isDef();
unsigned NumUses = HasUse;
SmallVector<unsigned, 2> UseOps;
if (HasUse)
UseOps.push_back(i);
std::vector<unsigned> UpdateOps;
for (unsigned j = i+1, e = MI->getNumOperands(); j != e; ++j) {
if (!MI->getOperand(j).isRegister())
@ -779,7 +788,7 @@ rewriteInstructionForSpills(const LiveInterval &li, bool TrySplit,
if (RegJ == RegI) {
UpdateOps.push_back(j);
if (MI->getOperand(j).isUse())
++NumUses;
UseOps.push_back(j);
HasUse |= MI->getOperand(j).isUse();
HasDef |= MI->getOperand(j).isDef();
}
@ -787,7 +796,7 @@ rewriteInstructionForSpills(const LiveInterval &li, bool TrySplit,
if (TryFold &&
tryFoldMemoryOperand(MI, vrm, ReMatDefMI, index, i,
NumUses, FoldSS, FoldSlot, Reg)) {
UseOps, FoldSS, FoldSlot, Reg)) {
// Folding the load/store can completely change the instruction in
// unpredictable ways, rescan it from the beginning.
HasUse = false;
@ -1207,6 +1216,7 @@ addIntervalsForSpills(const LiveInterval &li,
if (!TrySplit)
return NewLIs;
SmallVector<unsigned, 2> UseOps;
if (NeedStackSlot) {
int Id = SpillMBBs.find_first();
while (Id != -1) {
@ -1217,7 +1227,7 @@ addIntervalsForSpills(const LiveInterval &li,
bool isReMat = vrm.isReMaterialized(VReg);
MachineInstr *MI = getInstructionFromIndex(index);
int OpIdx = -1;
unsigned NumUses = 0;
UseOps.clear();
if (spills[i].canFold) {
for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
MachineOperand &MO = MI->getOperand(j);
@ -1230,20 +1240,20 @@ addIntervalsForSpills(const LiveInterval &li,
// Can't fold if it's two-address code and the use isn't the
// first and only use.
if (isReMat ||
(NumUses == 0 && !alsoFoldARestore(Id, index, VReg, RestoreMBBs,
RestoreIdxes))) {
(UseOps.empty() && !alsoFoldARestore(Id, index, VReg,
RestoreMBBs, RestoreIdxes))) {
OpIdx = -1;
break;
}
++NumUses;
UseOps.push_back(j);
}
}
// Fold the store into the def if possible.
bool Folded = false;
if (OpIdx != -1) {
if (tryFoldMemoryOperand(MI, vrm, NULL, index, OpIdx, NumUses,
if (tryFoldMemoryOperand(MI, vrm, NULL, index, OpIdx, UseOps,
true, Slot, VReg)) {
if (NumUses)
if (!UseOps.empty())
// Folded a two-address instruction, do not issue a load.
eraseRestoreInfo(Id, index, VReg, RestoreMBBs, RestoreIdxes);
Folded = true;
@ -1267,8 +1277,8 @@ addIntervalsForSpills(const LiveInterval &li,
continue;
unsigned VReg = restores[i].vreg;
MachineInstr *MI = getInstructionFromIndex(index);
unsigned NumUses = 0;
int OpIdx = -1;
UseOps.clear();
if (restores[i].canFold) {
for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
MachineOperand &MO = MI->getOperand(j);
@ -1280,10 +1290,10 @@ addIntervalsForSpills(const LiveInterval &li,
OpIdx = -1;
break;
}
if (NumUses == 0)
if (UseOps.empty())
// Use the first use index.
OpIdx = (int)j;
++NumUses;
UseOps.push_back(j);
}
}
@ -1298,9 +1308,9 @@ addIntervalsForSpills(const LiveInterval &li,
if (isLoadSS ||
(ReMatDefMI->getInstrDescriptor()->Flags & M_LOAD_FLAG))
Folded = tryFoldMemoryOperand(MI, vrm, ReMatDefMI, index, OpIdx,
NumUses, isLoadSS, LdSlot, VReg);
UseOps, isLoadSS, LdSlot, VReg);
} else
Folded = tryFoldMemoryOperand(MI, vrm, NULL, index, OpIdx, NumUses,
Folded = tryFoldMemoryOperand(MI, vrm, NULL, index, OpIdx, UseOps,
true, Slot, VReg);
}
// If folding is not possible / failed, then tell the spiller to issue a

View File

@ -77,11 +77,23 @@ public:
MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
int FrameIndex) const;
/// foldMemoryOperand - Multi-use variant of the stack-slot fold: UseOps lists
/// the operand indices of MI that use the register being folded. This
/// implementation never folds and always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
int FrameIndex) const {
return 0;
}
/// foldMemoryOperand - Variant that takes a load instruction (LoadMI) instead
/// of a frame index. This implementation never folds and always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
MachineInstr* LoadMI) const {
return 0;
}
/// foldMemoryOperand - Multi-use variant that takes a load instruction
/// (LoadMI) instead of a frame index: UseOps lists the operand indices of MI
/// that use the register being folded. This implementation never folds and
/// always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
MachineInstr* LoadMI) const {
return 0;
}
const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
const TargetRegisterClass* const*

View File

@ -51,11 +51,23 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
MachineInstr* foldMemoryOperand(MachineInstr *MI, unsigned OpNum,
int FrameIndex) const;
/// foldMemoryOperand - Multi-use variant of the stack-slot fold: UseOps lists
/// the operand indices of MI that use the register being folded. This
/// implementation never folds and always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
int FrameIndex) const {
return 0;
}
/// foldMemoryOperand - Variant that takes a load instruction (LoadMI) instead
/// of a frame index. This implementation never folds and always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
MachineInstr* LoadMI) const {
return 0;
}
/// foldMemoryOperand - Multi-use variant that takes a load instruction
/// (LoadMI) instead of a frame index: UseOps lists the operand indices of MI
/// that use the register being folded. This implementation never folds and
/// always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
MachineInstr* LoadMI) const {
return 0;
}
void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned DestReg, unsigned SrcReg,
const TargetRegisterClass *DestRC,

View File

@ -203,12 +203,6 @@ foldMemoryOperand(MachineInstr* MI, unsigned OpNum, int FI) const
return NewMI;
}
MachineInstr *MipsRegisterInfo::
foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
MachineInstr* LoadMI) const {
return NULL;
}
//===----------------------------------------------------------------------===//
//
// Callee Saved Registers methods

View File

@ -58,8 +58,22 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
int FrameIndex) const;
/// foldMemoryOperand - Multi-use variant of the stack-slot fold: UseOps lists
/// the operand indices of MI that use the register being folded. This
/// implementation never folds and always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
int FrameIndex) const {
return 0;
}
MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
MachineInstr* LoadMI) const;
MachineInstr* LoadMI) const {
return 0;
}
/// foldMemoryOperand - Multi-use variant that takes a load instruction
/// (LoadMI) instead of a frame index: UseOps lists the operand indices of MI
/// that use the register being folded. This implementation never folds and
/// always returns 0.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
MachineInstr* LoadMI) const {
return 0;
}
void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned DestReg, unsigned SrcReg,

View File

@ -68,11 +68,23 @@ public:
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
int FrameIndex) const;
/// foldMemoryOperand - Multi-use variant of the stack-slot fold: UseOps lists
/// the operand indices of MI that use the register being folded. This
/// implementation never folds and always returns 0.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
int FrameIndex) const {
return 0;
}
/// foldMemoryOperand - Variant that takes a load instruction (LoadMI) instead
/// of a frame index. This implementation never folds and always returns 0.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
MachineInstr* LoadMI) const {
return 0;
}
/// foldMemoryOperand - Multi-use variant that takes a load instruction
/// (LoadMI) instead of a frame index: UseOps lists the operand indices of MI
/// that use the register being folded. This implementation never folds and
/// always returns 0.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
MachineInstr* LoadMI) const {
return 0;
}
const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
const TargetRegisterClass* const*

View File

@ -62,12 +62,24 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
unsigned OpNum,
int FrameIndex) const;
/// foldMemoryOperand - Multi-use variant of the stack-slot fold: UseOps lists
/// the operand indices of MI that use the register being folded. This
/// implementation never folds and always returns 0.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
int FrameIndex) const {
return 0;
}
/// foldMemoryOperand - Variant that takes a load instruction (LoadMI) instead
/// of a frame index. This implementation never folds and always returns 0.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
unsigned OpNum,
MachineInstr* LoadMI) const {
return 0;
}
/// foldMemoryOperand - Multi-use variant that takes a load instruction
/// (LoadMI) instead of a frame index: UseOps lists the operand indices of MI
/// that use the register being folded. This implementation never folds and
/// always returns 0.
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
MachineInstr* LoadMI) const {
return 0;
}
const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
const TargetRegisterClass* const* getCalleeSavedRegClasses(

View File

@ -1149,6 +1149,31 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI, unsigned OpNu
return foldMemoryOperand(MI, OpNum, MOs);
}
/// foldMemoryOperand - Try to fold a stack-slot reference into MI when the
/// register being folded is used by the operands listed in UseOps.
///
/// A single use is forwarded to the single-operand overload. The only
/// multi-use shape handled is a TESTxxrr whose operands 0 and 1 are both the
/// register: it is first rewritten to CMPxxri r, 0 and then operand 0 is
/// folded. Returns NULL when fusing is disabled, when UseOps has any other
/// shape, or when the opcode is not one of the TESTxxrr forms; otherwise
/// returns whatever the single-operand fold returns.
MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
                                           SmallVectorImpl<unsigned> &UseOps,
                                           int FrameIndex) const {
  // Check switch flag
  if (NoFusing) return NULL;

  if (UseOps.size() == 1)
    return foldMemoryOperand(MI, UseOps[0], FrameIndex);

  // The rewrite below hard-codes operands 0 and 1, so the use list must be
  // exactly {0, 1}. Each index is checked independently: the previous
  // `UseOps[0] != 0 && UseOps[1] != 1` only rejected lists in which *both*
  // entries were wrong.
  if (UseOps.size() != 2 || UseOps[0] != 0 || UseOps[1] != 1)
    return NULL;

  unsigned NewOpc = 0;
  switch (MI->getOpcode()) {
  default: return NULL;
  case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
  case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
  case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
  case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
  }

  // Change to CMPXXri r, 0 first.
  // NOTE(review): MI is rewritten in place before the fold is attempted; if
  // the nested fold returns NULL, MI stays in its CMPXXri form -- confirm
  // that callers tolerate this.
  MI->setInstrDescriptor(TII.get(NewOpc));
  MI->getOperand(1).ChangeToImmediate(0);
  return foldMemoryOperand(MI, 0, FrameIndex);
}
MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI, unsigned OpNum,
MachineInstr *LoadMI) const {
// Check switch flag
@ -1160,6 +1185,31 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI, unsigned OpNu
return foldMemoryOperand(MI, OpNum, MOs);
}
/// foldMemoryOperand - Try to fold the load LoadMI into MI when the register
/// being folded is used by the operands listed in UseOps.
///
/// A single use is forwarded to the single-operand overload. The only
/// multi-use shape handled is a TESTxxrr whose operands 0 and 1 are both the
/// register: it is first rewritten to CMPxxri r, 0 and then operand 0 is
/// folded. Returns NULL when fusing is disabled, when UseOps has any other
/// shape, or when the opcode is not one of the TESTxxrr forms; otherwise
/// returns whatever the single-operand fold returns.
MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
                                           SmallVectorImpl<unsigned> &UseOps,
                                           MachineInstr *LoadMI) const {
  // Check switch flag
  if (NoFusing) return NULL;

  if (UseOps.size() == 1)
    return foldMemoryOperand(MI, UseOps[0], LoadMI);

  // The rewrite below hard-codes operands 0 and 1, so the use list must be
  // exactly {0, 1}. Each index is checked independently: the previous
  // `UseOps[0] != 0 && UseOps[1] != 1` only rejected lists in which *both*
  // entries were wrong.
  if (UseOps.size() != 2 || UseOps[0] != 0 || UseOps[1] != 1)
    return NULL;

  unsigned NewOpc = 0;
  switch (MI->getOpcode()) {
  default: return NULL;
  case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
  case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
  case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
  case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
  }

  // Change to CMPXXri r, 0 first.
  // NOTE(review): MI is rewritten in place before the fold is attempted; if
  // the nested fold returns NULL, MI stays in its CMPXXri form -- confirm
  // that callers tolerate this.
  MI->setInstrDescriptor(TII.get(NewOpc));
  MI->getOperand(1).ChangeToImmediate(0);
  return foldMemoryOperand(MI, 0, LoadMI);
}
unsigned X86RegisterInfo::getOpcodeAfterMemoryFold(unsigned Opc,
unsigned OpNum) const {
// Check switch flag
@ -1270,7 +1320,30 @@ bool X86RegisterInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
MachineOperand &MO = ImpOps[i];
MIB.addReg(MO.getReg(), MO.isDef(), true, MO.isKill(), MO.isDead());
}
NewMIs.push_back(MIB);
// Change CMP32ri r, 0 back to TEST32rr r, r, etc.
unsigned NewOpc = 0;
switch (DataMI->getOpcode()) {
default: break;
case X86::CMP64ri32:
case X86::CMP32ri:
case X86::CMP16ri:
case X86::CMP8ri: {
MachineOperand &MO0 = DataMI->getOperand(0);
MachineOperand &MO1 = DataMI->getOperand(1);
if (MO1.getImm() == 0) {
switch (DataMI->getOpcode()) {
default: break;
case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
}
DataMI->setInstrDescriptor(TII.get(NewOpc));
MO1.ChangeToRegister(MO0.getReg(), false);
}
}
}
NewMIs.push_back(DataMI);
// Emit the store instruction.
if (UnfoldStore) {

View File

@ -141,6 +141,12 @@ public:
unsigned OpNum,
int FrameIndex) const;
/// foldMemoryOperand - Same as previous except it tries to fold instruction
/// with multiple uses of the same register. UseOps lists the indices of the
/// operands of MI that use that register; FrameIndex is the stack slot to
/// fold.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
int FrameIndex) const;
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
/// stack slot.
@ -148,6 +154,13 @@ public:
unsigned OpNum,
MachineInstr* LoadMI) const;
/// foldMemoryOperand - Same as the previous version except it tries to fold
/// an instruction with multiple uses of the same register. UseOps lists the
/// indices of the operands of MI that use that register; the load to fold is
/// given by LoadMI rather than by a stack slot.
MachineInstr* foldMemoryOperand(MachineInstr* MI,
SmallVectorImpl<unsigned> &UseOps,
MachineInstr* LoadMI) const;
/// getOpcodeAfterMemoryFold - Returns the opcode of the would be new
/// instruction after load / store is folded into an instruction of the
/// specified opcode. It returns zero if the specified unfolding is not

View File

@ -1,4 +1,4 @@
; RUN: llvm-as < %s | llc -mtriple=i686-apple-darwin | grep "48(%esp)" | count 5
; RUN: llvm-as < %s | llc -mtriple=i686-apple-darwin | grep "48(%esp)" | count 6
%struct..0anon = type { i32 }
%struct.rtvec_def = type { i32, [1 x %struct..0anon] }

View File

@ -0,0 +1,58 @@
; RUN: llvm-as < %s | llc -march=x86 -stats |& \
; RUN: grep {2 .*folded into instructions}
; RUN: llvm-as < %s | llc -march=x86 | grep cmp | count 3
%struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }
define fastcc i32 @perimeter(%struct.quad_struct* %tree, i32 %size) {
entry:
%tree.idx7.val = load %struct.quad_struct** null ; <%struct.quad_struct*> [#uses=1]
%tmp8.i51 = icmp eq %struct.quad_struct* %tree.idx7.val, null ; <i1> [#uses=2]
br i1 %tmp8.i51, label %cond_next, label %cond_next.i52
cond_next.i52: ; preds = %entry
ret i32 0
cond_next: ; preds = %entry
%tmp59 = load i32* null, align 4 ; <i32> [#uses=1]
%tmp70 = icmp eq i32 %tmp59, 2 ; <i1> [#uses=1]
br i1 %tmp70, label %cond_true.i35, label %bb80
cond_true.i35: ; preds = %cond_next
%tmp14.i.i37 = load %struct.quad_struct** null, align 4 ; <%struct.quad_struct*> [#uses=1]
%tmp3.i160 = load i32* null, align 4 ; <i32> [#uses=1]
%tmp4.i161 = icmp eq i32 %tmp3.i160, 2 ; <i1> [#uses=1]
br i1 %tmp4.i161, label %cond_true.i163, label %cond_false.i178
cond_true.i163: ; preds = %cond_true.i35
%tmp7.i162 = sdiv i32 %size, 4 ; <i32> [#uses=2]
%tmp13.i168 = tail call fastcc i32 @sum_adjacent( %struct.quad_struct* null, i32 3, i32 2, i32 %tmp7.i162 ) ; <i32> [#uses=1]
%tmp18.i11.i170 = getelementptr %struct.quad_struct* %tmp14.i.i37, i32 0, i32 4 ; <%struct.quad_struct**> [#uses=1]
%tmp19.i12.i171 = load %struct.quad_struct** %tmp18.i11.i170, align 4 ; <%struct.quad_struct*> [#uses=1]
%tmp21.i173 = tail call fastcc i32 @sum_adjacent( %struct.quad_struct* %tmp19.i12.i171, i32 3, i32 2, i32 %tmp7.i162 ) ; <i32> [#uses=1]
%tmp22.i174 = add i32 %tmp21.i173, %tmp13.i168 ; <i32> [#uses=1]
br i1 false, label %cond_true.i141, label %cond_false.i156
cond_false.i178: ; preds = %cond_true.i35
ret i32 0
cond_true.i141: ; preds = %cond_true.i163
%tmp7.i140 = sdiv i32 %size, 4 ; <i32> [#uses=1]
%tmp21.i151 = tail call fastcc i32 @sum_adjacent( %struct.quad_struct* null, i32 3, i32 2, i32 %tmp7.i140 ) ; <i32> [#uses=0]
ret i32 0
cond_false.i156: ; preds = %cond_true.i163
%tmp22.i44 = add i32 0, %tmp22.i174 ; <i32> [#uses=0]
br i1 %tmp8.i51, label %bb22.i, label %cond_next.i
bb80: ; preds = %cond_next
ret i32 0
cond_next.i: ; preds = %cond_false.i156
ret i32 0
bb22.i: ; preds = %cond_false.i156
ret i32 0
}
declare fastcc i32 @sum_adjacent(%struct.quad_struct*, i32, i32, i32)