Remove a fixed item, update a couple partially-fixed items.

llvm-svn: 105375
This commit is contained in:
Eli Friedman 2010-06-03 01:01:48 +00:00
parent a2f74f3c98
commit 1f41303260
1 changed files with 22 additions and 81 deletions

View File

@ -1103,57 +1103,6 @@ be folded into: shl [mem], 1
//===---------------------------------------------------------------------===//
This testcase misses a read/modify/write opportunity (from PR1425):
void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
int i;
for(i=0; i<width; i++)
b1[i] += (1*(b0[i] + b2[i])+0)>>0;
}
We compile it down to:
LBB1_2: # bb
movl (%esi,%edi,4), %ebx
addl (%ecx,%edi,4), %ebx
addl (%edx,%edi,4), %ebx
movl %ebx, (%ecx,%edi,4)
incl %edi
cmpl %eax, %edi
jne LBB1_2 # bb
the inner loop should add to the memory location (%ecx,%edi,4), saving
a mov. Something like:
movl (%esi,%edi,4), %ebx
addl (%edx,%edi,4), %ebx
addl %ebx, (%ecx,%edi,4)
Here is another interesting example:
void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
int i;
for(i=0; i<width; i++)
b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
}
We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
LBB9_2: # bb
movl (%ecx,%edi,4), %ebx
subl (%esi,%edi,4), %ebx
subl (%edx,%edi,4), %ebx
movl %ebx, (%ecx,%edi,4)
incl %edi
cmpl %eax, %edi
jne LBB9_2 # bb
Additionally, LSR should rewrite the exit condition of these loops to use
a stride-4 IV, would would allow all the scales in the loop to go away.
This would result in smaller code and more efficient microops.
//===---------------------------------------------------------------------===//
In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:
@ -1301,15 +1250,8 @@ FirstOnet:
xorl %eax, %eax
ret
There are a few possible improvements here:
1. We should be able to eliminate the dead load into %ecx
2. We could change the "movl 8(%esp), %eax" into
"movzwl 10(%esp), %eax"; this lets us change the cmpl
into a testl, which is shorter, and eliminate the shift.
We could also in theory eliminate the branch by using a conditional
for the address of the load, but that seems unlikely to be worthwhile
in general.
We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
//===---------------------------------------------------------------------===//
@ -1331,22 +1273,23 @@ bb7: ; preds = %entry
to:
_foo:
foo: # @foo
# BB#0: # %entry
movl 4(%esp), %ecx
cmpb $0, 16(%esp)
movl 12(%esp), %ecx
je .LBB0_2
# BB#1: # %bb
movl 8(%esp), %eax
movl 4(%esp), %edx
je LBB1_2 # bb7
LBB1_1: # bb
addl %edx, %eax
addl %ecx, %eax
ret
LBB1_2: # bb7
movl %edx, %eax
subl %ecx, %eax
.LBB0_2: # %bb7
movl 12(%esp), %edx
movl %ecx, %eax
subl %edx, %eax
ret
The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
if it commuted the addl in LBB1_1.
There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
couple more movls by putting 4(%esp) into %eax instead of %ecx.
//===---------------------------------------------------------------------===//
@ -1396,8 +1339,7 @@ Also check why xmm7 is not used at all in the function.
//===---------------------------------------------------------------------===//
Legalize loses track of the fact that bools are always zero extended when in
memory. This causes us to compile abort_gzip (from 164.gzip) from:
Take the following:
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@ -1416,16 +1358,15 @@ bb4.i: ; preds = %entry
}
declare void @exit(i32) noreturn nounwind
into:
_abort_gzip:
This compiles into:
_abort_gzip: ## @abort_gzip
## BB#0: ## %entry
subl $12, %esp
movb _in_exit.4870.b, %al
notb %al
testb $1, %al
jne LBB1_2 ## bb4.i
LBB1_1: ## bb.i
...
cmpb $1, %al
jne LBB0_2
We somehow miss folding the movb into the cmpb.
//===---------------------------------------------------------------------===//