add a note.

llvm-svn: 66360
This commit is contained in:
Chris Lattner 2009-03-08 03:04:26 +00:00
parent cfd1f7aa9f
commit 393ac62837
1 changed files with 40 additions and 0 deletions

View File

@ -1840,3 +1840,43 @@ _f:
ret
//===---------------------------------------------------------------------===//
memcpy/memmove are not lowered to SSE copies even when that is possible. A silly example is:
define <16 x float> @foo(<16 x float> %A) nounwind {
%tmp = alloca <16 x float>, align 16
%tmp2 = alloca <16 x float>, align 16
store <16 x float> %A, <16 x float>* %tmp
%s = bitcast <16 x float>* %tmp to i8*
%s2 = bitcast <16 x float>* %tmp2 to i8*
call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
%R = load <16 x float>* %tmp2
ret <16 x float> %R
}
declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
which compiles to:
_foo:
subl $140, %esp
movaps %xmm3, 112(%esp)
movaps %xmm2, 96(%esp)
movaps %xmm1, 80(%esp)
movaps %xmm0, 64(%esp)
movl 60(%esp), %eax
movl %eax, 124(%esp)
movl 56(%esp), %eax
movl %eax, 120(%esp)
movl 52(%esp), %eax
<many many more 32-bit copies>
movaps (%esp), %xmm0
movaps 16(%esp), %xmm1
movaps 32(%esp), %xmm2
movaps 48(%esp), %xmm3
addl $140, %esp
ret
On Nehalem, it may even be cheaper to just use movups for unaligned copies than
to fall back to lower-granularity chunks.
//===---------------------------------------------------------------------===//