diff --git a/acconfig.h b/acconfig.h index 2284711b77..1023f56beb 100644 --- a/acconfig.h +++ b/acconfig.h @@ -16,6 +16,7 @@ #undef ENABLE_MP #undef ENABLE_NLS +#undef HAVE_ASM_MMX #undef HAVE_CATGETS #undef HAVE_DIRENT_H #undef HAVE_DOPRNT @@ -49,7 +50,6 @@ #undef SRAND_FUNC #undef USE_PTHREADS - /* Leave that blank line there!! Autoheader needs it. If you're adding to this file, keep in mind: diff --git a/app/Makefile.am b/app/Makefile.am index d025e54418..f7ddade842 100644 --- a/app/Makefile.am +++ b/app/Makefile.am @@ -314,7 +314,8 @@ gimp_SOURCES = \ marching_ants.h \ pixmaps.h \ pixmaps2.h \ - wilber.h + wilber.h \ + paint_funcs_simd.S EXTRA_DIST = \ makefile.mingw \ @@ -322,7 +323,8 @@ EXTRA_DIST = \ makefile.msc \ gimp.rc \ gimp.sym \ - wilber.ico + wilber.ico \ + arch/i386/mmx/paint_funcs_mmx.S gimp_LDFLAGS = -export-dynamic -export-symbols $(srcdir)/gimp.sym diff --git a/app/arch/i386/mmx/paint_funcs_mmx.S b/app/arch/i386/mmx/paint_funcs_mmx.S new file mode 100644 index 0000000000..9e92d3a608 --- /dev/null +++ b/app/arch/i386/mmx/paint_funcs_mmx.S @@ -0,0 +1,1422 @@ +/* +MMX code to supplement some functions in paint_funcs.c +for the Gimp. 
+ +Copyright (C) 1999, 2001 David Monniaux +*/ + +.text +.align 4 + +.globl intel_cpu_features + +intel_cpu_features: + pushl %ebx + pushfl + popl %eax + xor $ 0x200000, %eax + pushl %eax + popfl + pushfl + popl %edx + xor %eax, %edx + xor %eax, %eax + test $ 0x200000, %edx + jnz .intel_cpu_features_end + movl $ 1, %eax + cpuid + movl %edx, %eax +.intel_cpu_features_end: + popl %ebx + ret + +.alpha_mask_1a: .int 0xFF00FF00, 0xFF00FF00 +.mult_shift: .int 0x00800080, 0x00800080 +.alpha_mask_3a: .int 0xFF000000, 0xFF000000 + + + +/* min(a,b) = a - max(a-b, 0) */ + +.globl add_pixels_3a_3a + +.align 16 +add_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + subl $ 2, %ecx + jl .add_pixels_3a_3a_last + movl $ 8, %ebx +.add_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + paddusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .add_pixels_3a_3a_loop +.add_pixels_3a_3a_last: + test $ 1, %ecx + jz .add_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + paddusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, (%edi) +.add_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl add_pixels_1a_1a +.align 16 +add_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .add_pixels_1a_1a_last3 + movl $ 8, %ebx +.add_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + paddusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 
4, %ecx + jge .add_pixels_1a_1a_loop + +.add_pixels_1a_1a_last3: + test $ 2, %ecx + jz .add_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + paddusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.add_pixels_1a_1a_last1: + test $ 1, %ecx + jz .add_pixels_1a_1a_end + + movw (%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + + movq %mm2, %mm4 + paddusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, %ebx + movw %bx, (%edi) + +.add_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + + +.globl substract_pixels_3a_3a + +.align 16 +substract_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + subl $ 2, %ecx + jl .substract_pixels_3a_3a_last + movl $ 8, %ebx +.substract_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .substract_pixels_3a_3a_loop +.substract_pixels_3a_3a_last: + test $ 1, %ecx + jz .substract_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, (%edi) +.substract_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl substract_pixels_1a_1a +.align 16 +substract_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .substract_pixels_1a_1a_last3 + movl $ 8, %ebx +.substract_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + psubusb 
%mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 4, %ecx + jge .substract_pixels_1a_1a_loop + +.substract_pixels_1a_1a_last3: + test $ 2, %ecx + jz .substract_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.substract_pixels_1a_1a_last1: + test $ 1, %ecx + jz .substract_pixels_1a_1a_end + + movw (%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + movq %mm0, %mm1 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, %ebx + movw %bx, (%edi) + +.substract_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + + +.globl difference_pixels_3a_3a + +.align 16 +difference_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + subl $ 2, %ecx + jl .difference_pixels_3a_3a_last + movl $ 8, %ebx +.difference_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + movq %mm3, %mm5 + psubusb %mm3, %mm4 + psubusb %mm2, %mm5 + movq %mm0, %mm1 + paddb %mm5, %mm4 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .difference_pixels_3a_3a_loop +.difference_pixels_3a_3a_last: + test $ 1, %ecx + jz .difference_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + movq %mm3, %mm5 + psubusb %mm3, %mm4 + psubusb %mm2, %mm5 + movq %mm0, %mm1 + paddb %mm5, %mm4 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, (%edi) +.difference_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl difference_pixels_1a_1a +.align 16 
+difference_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .difference_pixels_1a_1a_last3 + movl $ 8, %ebx +.difference_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + movq %mm3, %mm5 + psubusb %mm3, %mm4 + psubusb %mm2, %mm5 + movq %mm0, %mm1 + paddb %mm5, %mm4 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 4, %ecx + jge .difference_pixels_1a_1a_loop + +.difference_pixels_1a_1a_last3: + test $ 2, %ecx + jz .difference_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + movq %mm3, %mm5 + psubusb %mm3, %mm4 + psubusb %mm2, %mm5 + movq %mm0, %mm1 + paddb %mm5, %mm4 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.difference_pixels_1a_1a_last1: + test $ 1, %ecx + jz .difference_pixels_1a_1a_end + + movw (%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + + movq %mm2, %mm4 + movq %mm3, %mm5 + psubusb %mm3, %mm4 + psubusb %mm2, %mm5 + movq %mm0, %mm1 + paddb %mm5, %mm4 + pandn %mm4, %mm1 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, %ebx + movw %bx, (%edi) + +.difference_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + + +.globl multiply_pixels_3a_3a + +.align 16 +multiply_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + movq .mult_shift, %mm7 + pxor %mm6, %mm6 + subl $ 2, %ecx + jl .multiply_pixels_3a_3a_last + movl $ 8, %ebx +.multiply_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + + movq %mm2, %mm1 + punpcklbw %mm6, %mm1 + movq %mm3, %mm5 + punpcklbw %mm6, %mm5 + pmullw %mm5, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm1 + psrlw $ 8, %mm1 + + movq %mm2, %mm4 + punpckhbw %mm6, %mm4 + movq %mm3, %mm5 + punpckhbw %mm6, 
%mm5 + pmullw %mm5, %mm4 + paddw %mm7, %mm4 + movq %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm4 + psrlw $ 8, %mm4 + + packuswb %mm4, %mm1 + + movq %mm0, %mm4 + pandn %mm1, %mm4 + movq %mm4, %mm1 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .multiply_pixels_3a_3a_loop +.multiply_pixels_3a_3a_last: + test $ 1, %ecx + jz .multiply_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + + + movq %mm2, %mm1 + punpcklbw %mm6, %mm1 + movq %mm3, %mm5 + punpcklbw %mm6, %mm5 + pmullw %mm5, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm1 + psrlw $ 8, %mm1 + + movq %mm2, %mm4 + punpckhbw %mm6, %mm4 + movq %mm3, %mm5 + punpckhbw %mm6, %mm5 + pmullw %mm5, %mm4 + paddw %mm7, %mm4 + movq %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm4 + psrlw $ 8, %mm4 + + packuswb %mm4, %mm1 + + movq %mm0, %mm4 + pandn %mm1, %mm4 + movq %mm4, %mm1 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, (%edi) +.multiply_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl multiply_pixels_1a_1a +.align 16 +multiply_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .multiply_pixels_1a_1a_last3 + movl $ 8, %ebx +.multiply_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + + movq %mm2, %mm1 + punpcklbw %mm6, %mm1 + movq %mm3, %mm5 + punpcklbw %mm6, %mm5 + pmullw %mm5, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm1 + psrlw $ 8, %mm1 + + movq %mm2, %mm4 + punpckhbw %mm6, %mm4 + movq %mm3, %mm5 + punpckhbw %mm6, %mm5 + pmullw %mm5, %mm4 + paddw %mm7, %mm4 + movq %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm4 + psrlw $ 8, %mm4 + + packuswb %mm4, %mm1 + + movq %mm0, %mm4 + pandn %mm1, %mm4 + movq %mm4, %mm1 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb 
%mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 4, %ecx + jge .multiply_pixels_1a_1a_loop + +.multiply_pixels_1a_1a_last3: + test $ 2, %ecx + jz .multiply_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + + + movq %mm2, %mm1 + punpcklbw %mm6, %mm1 + movq %mm3, %mm5 + punpcklbw %mm6, %mm5 + pmullw %mm5, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm1 + psrlw $ 8, %mm1 + + movq %mm2, %mm4 + punpckhbw %mm6, %mm4 + movq %mm3, %mm5 + punpckhbw %mm6, %mm5 + pmullw %mm5, %mm4 + paddw %mm7, %mm4 + movq %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm4 + psrlw $ 8, %mm4 + + packuswb %mm4, %mm1 + + movq %mm0, %mm4 + pandn %mm1, %mm4 + movq %mm4, %mm1 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.multiply_pixels_1a_1a_last1: + test $ 1, %ecx + jz .multiply_pixels_1a_1a_end + + movw (%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + + + movq %mm2, %mm1 + punpcklbw %mm6, %mm1 + movq %mm3, %mm5 + punpcklbw %mm6, %mm5 + pmullw %mm5, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm1 + psrlw $ 8, %mm1 + + movq %mm2, %mm4 + punpckhbw %mm6, %mm4 + movq %mm3, %mm5 + punpckhbw %mm6, %mm5 + pmullw %mm5, %mm4 + paddw %mm7, %mm4 + movq %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm4 + psrlw $ 8, %mm4 + + packuswb %mm4, %mm1 + + movq %mm0, %mm4 + pandn %mm1, %mm4 + movq %mm4, %mm1 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, %ebx + movw %bx, (%edi) + +.multiply_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + +/* Could be perhaps more optimized */ + +.globl darken_pixels_3a_3a + +.align 16 +darken_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + subl $ 2, %ecx + jl .darken_pixels_3a_3a_last + movl $ 8, %ebx 
+.darken_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + movq %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .darken_pixels_3a_3a_loop +.darken_pixels_3a_3a_last: + test $ 1, %ecx + jz .darken_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + movq %mm2, %mm1 + movd %mm1, (%edi) +.darken_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl darken_pixels_1a_1a +.align 16 +darken_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .darken_pixels_1a_1a_last3 + movl $ 8, %ebx +.darken_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + movq %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 4, %ecx + jge .darken_pixels_1a_1a_loop + +.darken_pixels_1a_1a_last3: + test $ 2, %ecx + jz .darken_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + movq %mm2, %mm1 + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.darken_pixels_1a_1a_last1: + test $ 1, %ecx + jz .darken_pixels_1a_1a_end + + movw (%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + psubb %mm4, %mm2 + movq %mm2, %mm1 + movd %mm1, %ebx + movw %bx, (%edi) + +.darken_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + + +.globl lighten_pixels_3a_3a + +.align 16 +lighten_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + subl $ 2, %ecx + jl .lighten_pixels_3a_3a_last + movl $ 8, %ebx +.lighten_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + paddb %mm4, %mm3 + movq %mm0, %mm1 + pandn %mm3, %mm1 + + psubb 
%mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .lighten_pixels_3a_3a_loop +.lighten_pixels_3a_3a_last: + test $ 1, %ecx + jz .lighten_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + paddb %mm4, %mm3 + movq %mm0, %mm1 + pandn %mm3, %mm1 + + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, (%edi) +.lighten_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl lighten_pixels_1a_1a +.align 16 +lighten_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .lighten_pixels_1a_1a_last3 + movl $ 8, %ebx +.lighten_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + paddb %mm4, %mm3 + movq %mm0, %mm1 + pandn %mm3, %mm1 + + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 4, %ecx + jge .lighten_pixels_1a_1a_loop + +.lighten_pixels_1a_1a_last3: + test $ 2, %ecx + jz .lighten_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + paddb %mm4, %mm3 + movq %mm0, %mm1 + pandn %mm3, %mm1 + + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.lighten_pixels_1a_1a_last1: + test $ 1, %ecx + jz .lighten_pixels_1a_1a_end + + movw (%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + + movq %mm2, %mm4 + psubusb %mm3, %mm4 + paddb %mm4, %mm3 + movq %mm0, %mm1 + pandn %mm3, %mm1 + + psubb %mm4, %mm2 + pand %mm0, %mm2 + por %mm2, %mm1 + movd %mm1, %ebx + movw %bx, (%edi) + +.lighten_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + + +.globl screen_pixels_3a_3a + +.align 16 +screen_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + movq .mult_shift, %mm7 + pxor 
%mm6, %mm6 + subl $ 2, %ecx + jl .screen_pixels_3a_3a_last + movl $ 8, %ebx +.screen_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + + pcmpeqb %mm4, %mm4 + psubb %mm2, %mm4 + pcmpeqb %mm5, %mm5 + psubb %mm3, %mm5 + + movq %mm4, %mm1 + punpcklbw %mm6, %mm1 + movq %mm5, %mm3 + punpcklbw %mm6, %mm3 + pmullw %mm3, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm1 + psrlw $ 8, %mm1 + + movq %mm4, %mm2 + punpckhbw %mm6, %mm2 + movq %mm5, %mm3 + punpckhbw %mm6, %mm3 + pmullw %mm3, %mm2 + paddw %mm7, %mm2 + movq %mm2, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm2 + psrlw $ 8, %mm2 + + packuswb %mm2, %mm1 + + pcmpeqb %mm3, %mm3 + psubb %mm1, %mm3 + + movq %mm0, %mm1 + pandn %mm3, %mm1 + + movq %mm2, %mm4 + psubusb %mm5, %mm2 + paddb %mm2, %mm5 + pcmpeqb %mm3, %mm3 + psubb %mm5, %mm3 + + pand %mm0, %mm3 + por %mm3, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .screen_pixels_3a_3a_loop +.screen_pixels_3a_3a_last: + test $ 1, %ecx + jz .screen_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + + + pcmpeqb %mm4, %mm4 + psubb %mm2, %mm4 + pcmpeqb %mm5, %mm5 + psubb %mm3, %mm5 + + movq %mm4, %mm1 + punpcklbw %mm6, %mm1 + movq %mm5, %mm3 + punpcklbw %mm6, %mm3 + pmullw %mm3, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm1 + psrlw $ 8, %mm1 + + movq %mm4, %mm2 + punpckhbw %mm6, %mm2 + movq %mm5, %mm3 + punpckhbw %mm6, %mm3 + pmullw %mm3, %mm2 + paddw %mm7, %mm2 + movq %mm2, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm2 + psrlw $ 8, %mm2 + + packuswb %mm2, %mm1 + + pcmpeqb %mm3, %mm3 + psubb %mm1, %mm3 + + movq %mm0, %mm1 + pandn %mm3, %mm1 + + movq %mm2, %mm4 + psubusb %mm5, %mm2 + paddb %mm2, %mm5 + pcmpeqb %mm3, %mm3 + psubb %mm5, %mm3 + + pand %mm0, %mm3 + por %mm3, %mm1 + movd %mm1, (%edi) +.screen_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl screen_pixels_1a_1a +.align 16 +screen_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 
12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .screen_pixels_1a_1a_last3 + movl $ 8, %ebx +.screen_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + + + pcmpeqb %mm4, %mm4 + psubb %mm2, %mm4 + pcmpeqb %mm5, %mm5 + psubb %mm3, %mm5 + + movq %mm4, %mm1 + punpcklbw %mm6, %mm1 + movq %mm5, %mm3 + punpcklbw %mm6, %mm3 + pmullw %mm3, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm1 + psrlw $ 8, %mm1 + + movq %mm4, %mm2 + punpckhbw %mm6, %mm2 + movq %mm5, %mm3 + punpckhbw %mm6, %mm3 + pmullw %mm3, %mm2 + paddw %mm7, %mm2 + movq %mm2, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm2 + psrlw $ 8, %mm2 + + packuswb %mm2, %mm1 + + pcmpeqb %mm3, %mm3 + psubb %mm1, %mm3 + + movq %mm0, %mm1 + pandn %mm3, %mm1 + + movq %mm2, %mm4 + psubusb %mm5, %mm2 + paddb %mm2, %mm5 + pcmpeqb %mm3, %mm3 + psubb %mm5, %mm3 + + pand %mm0, %mm3 + por %mm3, %mm1 + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 4, %ecx + jge .screen_pixels_1a_1a_loop + +.screen_pixels_1a_1a_last3: + test $ 2, %ecx + jz .screen_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + + + pcmpeqb %mm4, %mm4 + psubb %mm2, %mm4 + pcmpeqb %mm5, %mm5 + psubb %mm3, %mm5 + + movq %mm4, %mm1 + punpcklbw %mm6, %mm1 + movq %mm5, %mm3 + punpcklbw %mm6, %mm3 + pmullw %mm3, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm1 + psrlw $ 8, %mm1 + + movq %mm4, %mm2 + punpckhbw %mm6, %mm2 + movq %mm5, %mm3 + punpckhbw %mm6, %mm3 + pmullw %mm3, %mm2 + paddw %mm7, %mm2 + movq %mm2, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm2 + psrlw $ 8, %mm2 + + packuswb %mm2, %mm1 + + pcmpeqb %mm3, %mm3 + psubb %mm1, %mm3 + + movq %mm0, %mm1 + pandn %mm3, %mm1 + + movq %mm2, %mm4 + psubusb %mm5, %mm2 + paddb %mm2, %mm5 + pcmpeqb %mm3, %mm3 + psubb %mm5, %mm3 + + pand %mm0, %mm3 + por %mm3, %mm1 + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.screen_pixels_1a_1a_last1: + test $ 1, %ecx + jz .screen_pixels_1a_1a_end + + movw 
(%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + + + pcmpeqb %mm4, %mm4 + psubb %mm2, %mm4 + pcmpeqb %mm5, %mm5 + psubb %mm3, %mm5 + + movq %mm4, %mm1 + punpcklbw %mm6, %mm1 + movq %mm5, %mm3 + punpcklbw %mm6, %mm3 + pmullw %mm3, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm1 + psrlw $ 8, %mm1 + + movq %mm4, %mm2 + punpckhbw %mm6, %mm2 + movq %mm5, %mm3 + punpckhbw %mm6, %mm3 + pmullw %mm3, %mm2 + paddw %mm7, %mm2 + movq %mm2, %mm3 + psrlw $ 8, %mm3 + paddw %mm3, %mm2 + psrlw $ 8, %mm2 + + packuswb %mm2, %mm1 + + pcmpeqb %mm3, %mm3 + psubb %mm1, %mm3 + + movq %mm0, %mm1 + pandn %mm3, %mm1 + + movq %mm2, %mm4 + psubusb %mm5, %mm2 + paddb %mm2, %mm5 + pcmpeqb %mm3, %mm3 + psubb %mm5, %mm3 + + pand %mm0, %mm3 + por %mm3, %mm1 + movd %mm1, %ebx + movw %bx, (%edi) + +.screen_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + +.lower_ff: .int 0x00FF00FF, 0x00FF00FF + + +.globl overlay_pixels_3a_3a + +.align 16 +overlay_pixels_3a_3a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_3a, %mm0 + + movq .mult_shift, %mm7 + pxor %mm6, %mm6 + subl $ 2, %ecx + jl .overlay_pixels_3a_3a_last + movl $ 8, %ebx +.overlay_pixels_3a_3a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + call op_overlay + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 2, %ecx + jge .overlay_pixels_3a_3a_loop +.overlay_pixels_3a_3a_last: + test $ 1, %ecx + jz .overlay_pixels_3a_3a_end + movd (%eax), %mm2 + movd (%edx), %mm3 + call op_overlay + movd %mm1, (%edi) +.overlay_pixels_3a_3a_end: + + emms + popl %ebx + popl %edi + ret + +.globl overlay_pixels_1a_1a +.align 16 +overlay_pixels_1a_1a: + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movq .alpha_mask_1a, %mm0 + subl $ 4, %ecx + jl .overlay_pixels_1a_1a_last3 + movl $ 8, %ebx +.overlay_pixels_1a_1a_loop: + movq (%eax), %mm2 + movq (%edx), %mm3 + call op_overlay + movq %mm1, (%edi) + addl %ebx, %eax + addl %ebx, %edx + addl %ebx, %edi + subl $ 
4, %ecx + jge .overlay_pixels_1a_1a_loop + +.overlay_pixels_1a_1a_last3: + test $ 2, %ecx + jz .overlay_pixels_1a_1a_last1 + movd (%eax), %mm2 + movd (%edx), %mm3 + call op_overlay + addl $ 4, %eax + addl $ 4, %edx + addl $ 4, %edi + +.overlay_pixels_1a_1a_last1: + test $ 1, %ecx + jz .overlay_pixels_1a_1a_end + + movw (%eax), %bx + movd %ebx, %mm2 + movw (%edx), %bx + movd %ebx, %mm3 + call op_overlay + movd %mm1, %ebx + movw %bx, (%edi) + +.overlay_pixels_1a_1a_end: + + emms + popl %ebx + popl %edi + ret + +op_overlay: + movq %mm2, %mm1 + punpcklbw %mm6, %mm1 + movq %mm3, %mm5 + punpcklbw %mm6, %mm5 + pmullw %mm5, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm1 + psrlw $ 8, %mm1 + + pcmpeqb %mm4, %mm4 + psubb %mm2, %mm4 + punpcklbw %mm6, %mm4 + pcmpeqb %mm5, %mm5 + psubb %mm3, %mm5 + punpcklbw %mm6, %mm5 + pmullw %mm5, %mm4 + paddw %mm7, %mm4 + movq %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm4 + psrlw $ 8, %mm4 + + movq .lower_ff, %mm5 + psubw %mm4, %mm5 + + psubw %mm1, %mm5 + movq %mm2, %mm4 + punpcklbw %mm6, %mm4 + pmullw %mm4, %mm5 + paddw %mm7, %mm5 + movq %mm5, %mm4 + psrlw $ 8, %mm4 + paddw %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm1, %mm5 + + subl $ 8, %esp + movq %mm5, (%esp) + + movq %mm2, %mm1 + punpckhbw %mm6, %mm1 + movq %mm3, %mm5 + punpckhbw %mm6, %mm5 + pmullw %mm5, %mm1 + paddw %mm7, %mm1 + movq %mm1, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm1 + psrlw $ 8, %mm1 + + pcmpeqb %mm4, %mm4 + psubb %mm2, %mm4 + punpckhbw %mm6, %mm4 + pcmpeqb %mm5, %mm5 + psubb %mm3, %mm5 + punpckhbw %mm6, %mm5 + pmullw %mm5, %mm4 + paddw %mm7, %mm4 + movq %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm5, %mm4 + psrlw $ 8, %mm4 + + movq .lower_ff, %mm5 + psubw %mm4, %mm5 + + psubw %mm1, %mm5 + movq %mm2, %mm4 + punpckhbw %mm6, %mm4 + pmullw %mm4, %mm5 + paddw %mm7, %mm5 + movq %mm5, %mm4 + psrlw $ 8, %mm4 + paddw %mm4, %mm5 + psrlw $ 8, %mm5 + paddw %mm1, %mm5 + + movq (%esp), %mm4 + addl $ 8, %esp + + packuswb %mm5, %mm4 + movq %mm0, %mm1 + pandn 
%mm4, %mm1
+
+        movq %mm2, %mm4
+        psubusb %mm3, %mm4
+        psubb %mm4, %mm2
+        pand %mm0, %mm2
+        por %mm2, %mm1
+        ret
\ No newline at end of file
diff --git a/app/arch/i386/mmx/paint_funcs_mmx.S.m4 b/app/arch/i386/mmx/paint_funcs_mmx.S.m4
new file mode 100644
index 0000000000..662774c7ae
--- /dev/null
+++ b/app/arch/i386/mmx/paint_funcs_mmx.S.m4
@@ -0,0 +1,380 @@
+/*
+MMX code to supplement some functions in paint_funcs.c
+for the Gimp.
+
+Copyright (C) 1999, 2001 David Monniaux
+*/
+
+.text
+.align 4
+
+/*
+unsigned long intel_cpu_features(void)
+Returns 0 if the ID bit (0x200000) of EFLAGS cannot be toggled, which
+means there is no CPUID instruction; otherwise %edx of CPUID leaf 1.
+*/
+.globl intel_cpu_features
+
+intel_cpu_features:
+        pushl %ebx                      /* cpuid overwrites %ebx */
+        pushfl
+        popl %eax
+        xor $ 0x200000, %eax            /* flip the ID bit */
+        pushl %eax
+        popfl
+        pushfl
+        popl %edx                       /* flags as the CPU kept them */
+        xor %eax, %edx                  /* bits that did not stick */
+        xor %eax, %eax
+        test $ 0x200000, %edx
+        jnz .intel_cpu_features_end     /* ID bit is stuck: return 0 */
+        movl $ 1, %eax
+        cpuid
+        movl %edx, %eax
+.intel_cpu_features_end:
+        popl %ebx
+        ret
+
+.alpha_mask_1a: .int 0xFF00FF00, 0xFF00FF00
+.mult_shift: .int 0x00800080, 0x00800080
+.alpha_mask_3a: .int 0xFF000000, 0xFF000000
+
+/*
+MMX_PIXEL_OP_3A_1A(name, setup, op, cleanup) emits name_pixels_3a_3a
+(4 bytes per pixel) and name_pixels_1a_1a (2 bytes per pixel).
+The C prototypes are regparm(3): src1 is in %eax, src2 in %edx,
+the pixel count in %ecx, and dest is the only stack argument.
+
+setup runs once per call, in both functions, after %mm0 holds the
+alpha mask.  op reads %mm2 (src1) and %mm3 (src2) and must leave
+its result in %mm1.  cleanup runs just before emms.
+
+paint_funcs_mmx.S is the expansion of this file and has to be
+regenerated after every change made here.
+*/
+define(`MMX_PIXEL_OP_3A_1A', `
+.globl $1_pixels_3a_3a
+
+.align 16
+$1_pixels_3a_3a:
+        pushl %edi
+        pushl %ebx
+        movl 12(%esp), %edi             /* dest: above two pushes and the return address */
+        movq .alpha_mask_3a, %mm0
+        $2
+        subl $ 2, %ecx
+        jl .$1_pixels_3a_3a_last        /* fewer than two pixels in total */
+        movl $ 8, %ebx                  /* byte stride of one quadword */
+.$1_pixels_3a_3a_loop:
+        movq (%eax), %mm2
+        movq (%edx), %mm3
+        $3
+        movq %mm1, (%edi)
+        addl %ebx, %eax
+        addl %ebx, %edx
+        addl %ebx, %edi
+        subl $ 2, %ecx
+        jge .$1_pixels_3a_3a_loop
+.$1_pixels_3a_3a_last:
+        test $ 1, %ecx                  /* bit 0 still matches the remaining count */
+        jz .$1_pixels_3a_3a_end
+        movd (%eax), %mm2
+        movd (%edx), %mm3
+        $3
+        movd %mm1, (%edi)
+.$1_pixels_3a_3a_end:
+        $4
+        emms
+        popl %ebx
+        popl %edi
+        ret
+
+.globl $1_pixels_1a_1a
+.align 16
+$1_pixels_1a_1a:
+        pushl %edi
+        pushl %ebx
+        movl 12(%esp), %edi             /* dest: above two pushes and the return address */
+        movq .alpha_mask_1a, %mm0
+        $2
+        subl $ 4, %ecx
+        jl .$1_pixels_1a_1a_last3       /* fewer than four pixels in total */
+        movl $ 8, %ebx                  /* byte stride of one quadword */
+.$1_pixels_1a_1a_loop:
+        movq (%eax), %mm2
+        movq (%edx), %mm3
+        $3
+        movq %mm1, (%edi)
+        addl %ebx, %eax
+        addl %ebx, %edx
+        addl %ebx, %edi
+        subl $ 4, %ecx
+        jge .$1_pixels_1a_1a_loop
+
+.$1_pixels_1a_1a_last3:
+        test $ 2, %ecx                  /* bits 0-1 still match the remaining count */
+        jz .$1_pixels_1a_1a_last1
+        movd (%eax), %mm2
+        movd (%edx), %mm3
+        $3
+        movd %mm1, (%edi)               /* write back the two pixels handled here */
+        addl $ 4, %eax
+        addl $ 4, %edx
+        addl $ 4, %edi
+
+.$1_pixels_1a_1a_last1:
+        test $ 1, %ecx
+        jz .$1_pixels_1a_1a_end
+
+        movw (%eax), %bx                /* one pixel is 16 bits: go through %bx */
+        movd %ebx, %mm2
+        movw (%edx), %bx
+        movd %ebx, %mm3
+        $3
+        movd %mm1, %ebx
+        movw %bx, (%edi)
+
+.$1_pixels_1a_1a_end:
+        $4
+        emms
+        popl %ebx
+        popl %edi
+        ret')
+
+/* min(a,b) = a - max(a-b, 0) */
+MMX_PIXEL_OP_3A_1A(`add', `', `
+        movq %mm2, %mm4
+        paddusb %mm3, %mm4              /* saturating a + b */
+        movq %mm0, %mm1
+        pandn %mm4, %mm1                /* keep the sum outside the alpha bytes */
+        movq %mm2, %mm4
+        psubusb %mm3, %mm4              /* max(a - b, 0) */
+        psubb %mm4, %mm2                /* min(a, b) */
+        pand %mm0, %mm2                 /* keep the minimum in the alpha bytes */
+        por %mm2, %mm1', `')
+
+MMX_PIXEL_OP_3A_1A(`substract', `', `
+        movq %mm2, %mm4
+        psubusb %mm3, %mm4              /* max(a - b, 0) */
+        movq %mm0, %mm1
+        pandn %mm4, %mm1
+        psubb %mm4, %mm2                /* min(a, b) */
+        pand %mm0, %mm2
+        por %mm2, %mm1', `')
+
+MMX_PIXEL_OP_3A_1A(`difference', `', `
+        movq %mm2, %mm4
+        movq %mm3, %mm5
+        psubusb %mm3, %mm4              /* max(a - b, 0) */
+        psubusb %mm2, %mm5              /* max(b - a, 0) */
+        movq %mm0, %mm1
+        paddb %mm5, %mm4                /* one of the two is zero: |a - b| */
+        pandn %mm4, %mm1
+        psubb %mm4, %mm2
+        pand %mm0, %mm2
+        por %mm2, %mm1', `')
+
+MMX_PIXEL_OP_3A_1A(`multiply', `
+        movq .mult_shift, %mm7
+        pxor %mm6, %mm6',`
+
+        movq %mm2, %mm1
+        punpcklbw %mm6, %mm1            /* low four bytes widened to words */
+        movq %mm3, %mm5
+        punpcklbw %mm6, %mm5
+        pmullw %mm5, %mm1
+        paddw %mm7, %mm1                /* t = a * b + 0x80 */
+        movq %mm1, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm5, %mm1
+        psrlw $ 8, %mm1                 /* (t + (t >> 8)) >> 8 */
+
+        movq %mm2, %mm4
+        punpckhbw %mm6, %mm4            /* same again for the high four bytes */
+        movq %mm3, %mm5
+        punpckhbw %mm6, %mm5
+        pmullw %mm5, %mm4
+        paddw %mm7, %mm4
+        movq %mm4, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm5, %mm4
+        psrlw $ 8, %mm4
+
+        packuswb %mm4, %mm1
+
+        movq %mm0, %mm4
+        pandn %mm1, %mm4
+        movq %mm4, %mm1
+
+        movq %mm2, %mm4
+        psubusb %mm3, %mm4
+        psubb %mm4, %mm2                /* min(a, b) for the alpha bytes */
+        pand %mm0, %mm2
+        por %mm2, %mm1', `')
+
+/* Could be perhaps more optimized */
+MMX_PIXEL_OP_3A_1A(`darken', `', `
+        movq %mm2, %mm4
+        psubusb %mm3, %mm4
+        psubb %mm4, %mm2                /* min(a, b) in every byte */
+        movq %mm2, %mm1', `')
+
+MMX_PIXEL_OP_3A_1A(`lighten', `', `
+        movq %mm2, %mm4
+        psubusb %mm3, %mm4              /* max(a - b, 0) */
+        paddb %mm4, %mm3                /* max(a, b) */
+        movq %mm0, %mm1
+        pandn %mm3, %mm1
+
+
psubb %mm4, %mm2
+        pand %mm0, %mm2
+        por %mm2, %mm1', `')
+
+MMX_PIXEL_OP_3A_1A(`screen', `
+        movq .mult_shift, %mm7
+        pxor %mm6, %mm6',`
+
+        pcmpeqb %mm4, %mm4
+        psubb %mm2, %mm4                /* %mm4 = 255 - src1 */
+        pcmpeqb %mm5, %mm5
+        psubb %mm3, %mm5                /* %mm5 = 255 - src2 */
+
+        movq %mm4, %mm1
+        punpcklbw %mm6, %mm1            /* low four bytes widened to words */
+        movq %mm5, %mm3
+        punpcklbw %mm6, %mm3
+        pmullw %mm3, %mm1
+        paddw %mm7, %mm1                /* t = product + 0x80 */
+        movq %mm1, %mm3
+        psrlw $ 8, %mm3
+        paddw %mm3, %mm1
+        psrlw $ 8, %mm1                 /* (t + (t >> 8)) >> 8 */
+
+        movq %mm4, %mm2
+        punpckhbw %mm6, %mm2            /* same again for the high four bytes */
+        movq %mm5, %mm3
+        punpckhbw %mm6, %mm3
+        pmullw %mm3, %mm2
+        paddw %mm7, %mm2
+        movq %mm2, %mm3
+        psrlw $ 8, %mm3
+        paddw %mm3, %mm2
+        psrlw $ 8, %mm2
+
+        packuswb %mm2, %mm1
+
+        pcmpeqb %mm3, %mm3
+        psubb %mm1, %mm3                /* 255 - product: the screen value */
+
+        movq %mm0, %mm1
+        pandn %mm3, %mm1                /* keep it outside the alpha bytes */
+
+        movq %mm4, %mm2                 /* reload 255 - src1; %mm2 held word products */
+        psubusb %mm5, %mm2
+        paddb %mm2, %mm5                /* max(255 - src1, 255 - src2) */
+        pcmpeqb %mm3, %mm3
+        psubb %mm5, %mm3                /* min(src1, src2) */
+
+        pand %mm0, %mm3                 /* alpha bytes take the minimum */
+        por %mm3, %mm1', `')
+
+.lower_ff: .int 0x00FF00FF, 0x00FF00FF
+
+MMX_PIXEL_OP_3A_1A(`overlay', `
+        movq .mult_shift, %mm7
+        pxor %mm6, %mm6 ',
+        `call op_overlay', `')
+
+op_overlay:
+        movq %mm2, %mm1
+        punpcklbw %mm6, %mm1
+        movq %mm3, %mm5
+        punpcklbw %mm6, %mm5
+        pmullw %mm5, %mm1
+        paddw %mm7, %mm1
+        movq %mm1, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm5, %mm1
+        psrlw $ 8, %mm1                 /* mult = src1 * src2 / 255, low half */
+
+        pcmpeqb %mm4, %mm4
+        psubb %mm2, %mm4
+        punpcklbw %mm6, %mm4
+        pcmpeqb %mm5, %mm5
+        psubb %mm3, %mm5
+        punpcklbw %mm6, %mm5
+        pmullw %mm5, %mm4
+        paddw %mm7, %mm4
+        movq %mm4, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm5, %mm4
+        psrlw $ 8, %mm4                 /* (255 - src1) * (255 - src2) / 255 */
+
+        movq .lower_ff, %mm5
+        psubw %mm4, %mm5                /* screen = 255 - that */
+
+        psubw %mm1, %mm5                /* screen - mult */
+        movq %mm2, %mm4
+        punpcklbw %mm6, %mm4
+        pmullw %mm4, %mm5
+        paddw %mm7, %mm5
+        movq %mm5, %mm4
+        psrlw $ 8, %mm4
+        paddw %mm4, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm1, %mm5                /* mult + src1 * (screen - mult) / 255 */
+
+        subl $ 8, %esp
+        movq %mm5, (%esp)               /* park the low half on the stack */
+
+        movq %mm2, %mm1
+        punpckhbw %mm6, %mm1
+        movq %mm3, %mm5
+        punpckhbw %mm6, %mm5
+        pmullw %mm5, %mm1
+        paddw %mm7, %mm1
+        movq %mm1, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm5, %mm1
+        psrlw $ 8, %mm1                 /* mult, high half */
+
+        pcmpeqb %mm4, %mm4
+        psubb %mm2, %mm4
+        punpckhbw %mm6, %mm4
+
pcmpeqb %mm5, %mm5
+        psubb %mm3, %mm5
+        punpckhbw %mm6, %mm5
+        pmullw %mm5, %mm4
+        paddw %mm7, %mm4
+        movq %mm4, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm5, %mm4
+        psrlw $ 8, %mm4                 /* (255 - src1) * (255 - src2) / 255, high half */
+
+        movq .lower_ff, %mm5
+        psubw %mm4, %mm5                /* screen = 255 - that */
+
+        psubw %mm1, %mm5                /* screen - mult */
+        movq %mm2, %mm4
+        punpckhbw %mm6, %mm4
+        pmullw %mm4, %mm5
+        paddw %mm7, %mm5
+        movq %mm5, %mm4
+        psrlw $ 8, %mm4
+        paddw %mm4, %mm5
+        psrlw $ 8, %mm5
+        paddw %mm1, %mm5                /* mult + src1 * (screen - mult) / 255 */
+
+        movq (%esp), %mm4               /* fetch the parked low half */
+        addl $ 8, %esp
+
+        packuswb %mm5, %mm4
+        movq %mm0, %mm1
+        pandn %mm4, %mm1                /* keep the result outside the alpha bytes */
+
+        movq %mm2, %mm4
+        psubusb %mm3, %mm4
+        psubb %mm4, %mm2                /* min(src1, src2) */
+        pand %mm0, %mm2                 /* alpha bytes take the minimum */
+        por %mm2, %mm1
+        ret
\ No newline at end of file
diff --git a/app/main.c b/app/main.c
index e4829acd21..62e86ad167 100644
--- a/app/main.c
+++ b/app/main.c
@@ -78,6 +78,12 @@ gboolean use_debug_handler = FALSE; gboolean console_messages = FALSE; gboolean restore_session = FALSE; gboolean double_speed = FALSE;
+gboolean use_mmx = FALSE;
+
+/* Implemented in paint_funcs_mmx.S.  TODO: move this prototype into a header. */
+#ifdef HAVE_ASM_MMX
+unsigned long intel_cpu_features(void);
+#endif
MessageHandlerType message_handler = CONSOLE;
@@ -149,6 +155,11 @@ main (int argc, use_shm = TRUE; #endif
+#ifdef HAVE_ASM_MMX
+  use_mmx = (intel_cpu_features() & (1UL << 23)) ? TRUE : FALSE;  /* bit 23 is read as the MMX flag */
+  fprintf(stderr, "MMX : %s\n", use_mmx ? "yes" : "no");
+#endif
+
batch_cmds = g_new (char *, argc); batch_cmds[0] = NULL;
diff --git a/app/paint-funcs/paint-funcs-simd.S b/app/paint-funcs/paint-funcs-simd.S
new file mode 100644
index 0000000000..81db2b1a4b
--- /dev/null
+++ b/app/paint-funcs/paint-funcs-simd.S
@@ -0,0 +1,5 @@
+#include "config.h"
+
+#ifdef HAVE_ASM_MMX
+#include <arch/i386/mmx/paint_funcs_mmx.S>  /* NOTE(review): operand was missing; path assumed from EXTRA_DIST - confirm */
+#endif
\ No newline at end of file
diff --git a/app/paint-funcs/paint-funcs.c b/app/paint-funcs/paint-funcs.c
index 41193ab5ca..3e6da1939c 100644
--- a/app/paint-funcs/paint-funcs.c
+++ b/app/paint-funcs/paint-funcs.c
@@ -58,7 +58,6 @@ #define INT_BLEND(a,b,alpha,tmp) (INT_MULT((a)-(b), alpha, tmp) + (b))
-
typedef enum { MinifyX_MinifyY,
@@ -153,7 +152,41 @@ static void apply_layer_mode_replace (guchar *src1, gboolean *affect); static void rotate_pointers (gpointer *p, guint32 n);
+/* MMX layer modes: only referenced when the assembly is actually built. */
+#ifdef HAVE_ASM_MMX
+#define USE_GCC_INTEL_MMX
+#endif
+
+#ifdef USE_GCC_INTEL_MMX
+extern gboolean use_mmx;  /* defined in main.c */
+/* Prototype of one assembly routine; regparm(3) puts src1, src2, count in registers. */
+#define MMX_PIXEL_OP(x) \
+void \
+x( \
+  const unsigned char *src1, \
+  const unsigned char *src2, \
+  unsigned count, \
+  unsigned char *dst) __attribute((regparm(3)));
+
+/* Declares both variants of an op: 4 bytes per pixel (3a) and 2 bytes per pixel (1a). */
+#define MMX_PIXEL_OP_3A_1A(op) \
+  MMX_PIXEL_OP(op##_pixels_3a_3a) \
+  MMX_PIXEL_OP(op##_pixels_1a_1a)
+
+/* Runs the MMX routine and returns if both sources have alpha and a supported size. */
+#define USE_MMX_PIXEL_OP_3A_1A(op) \
+  if (use_mmx && has_alpha1 && has_alpha2) \
+    { \
+      if (bytes1==2 && bytes2==2) \
+        { op##_pixels_1a_1a(src1, src2, length, dest); return; } \
+      if (bytes1==4 && bytes2==4) \
+        { op##_pixels_3a_3a(src1, src2, length, dest); return; } \
+    }
+#else
+#define MMX_PIXEL_OP_3A_1A(op)
+#define USE_MMX_PIXEL_OP_3A_1A(op)
+#endif
void
@@ -715,6 +748,7 @@ extract_alpha_pixels (const guchar *src, } }
+MMX_PIXEL_OP_3A_1A(darken)
void darken_pixels (const guchar *src1, const guchar *src2,
@@ -728,6 +762,8 @@ darken_pixels (const guchar *src1, gint b, alpha; guchar s1, s2;
+  USE_MMX_PIXEL_OP_3A_1A(darken)
+
alpha = (has_alpha1 ||
has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length--) @@ -750,7 +786,7 @@ darken_pixels (const guchar *src1, } } - +MMX_PIXEL_OP_3A_1A(lighten) void lighten_pixels (const guchar *src1, const guchar *src2, @@ -764,6 +800,8 @@ lighten_pixels (const guchar *src1, gint b, alpha; guchar s1, s2; + USE_MMX_PIXEL_OP_3A_1A(lighten) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length--) @@ -881,6 +919,7 @@ color_only_pixels (const guchar *src1, } } +MMX_PIXEL_OP_3A_1A(multiply) void multiply_pixels (const guchar *src1, const guchar *src2, @@ -894,6 +933,8 @@ multiply_pixels (const guchar *src1, gint alpha, b; gint tmp; + USE_MMX_PIXEL_OP_3A_1A(multiply) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; if (has_alpha1 && has_alpha2) @@ -973,6 +1014,8 @@ divide_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(screen) + void screen_pixels (const guchar *src1, const guchar *src2, @@ -986,6 +1029,8 @@ screen_pixels (const guchar *src1, gint alpha, b; gint tmp; + USE_MMX_PIXEL_OP_3A_1A(screen) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length --) @@ -1005,6 +1050,8 @@ screen_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(overlay) + void overlay_pixels (const guchar *src1, const guchar *src2, @@ -1153,6 +1200,8 @@ hardlight_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(add) + void add_pixels (const guchar *src1, const guchar *src2, @@ -1165,6 +1214,8 @@ add_pixels (const guchar *src1, { gint alpha, b; + USE_MMX_PIXEL_OP_3A_1A(add) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length --) @@ -1187,6 +1238,8 @@ add_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(substract) + void subtract_pixels (const guchar *src1, const guchar *src2, @@ -1200,6 +1253,8 @@ subtract_pixels (const guchar *src1, gint alpha, b; gint diff; + USE_MMX_PIXEL_OP_3A_1A(substract) + alpha = (has_alpha1 || has_alpha2) ? 
MAX (bytes1, bytes2) - 1 : bytes1; while (length --) @@ -1222,6 +1277,8 @@ subtract_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(difference) + void difference_pixels (const guchar *src1, const guchar *src2, @@ -1235,6 +1292,8 @@ difference_pixels (const guchar *src1, gint alpha, b; gint diff; + USE_MMX_PIXEL_OP_3A_1A(difference) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length --) diff --git a/app/paint_funcs.c b/app/paint_funcs.c index 41193ab5ca..3e6da1939c 100644 --- a/app/paint_funcs.c +++ b/app/paint_funcs.c @@ -58,7 +58,6 @@ #define INT_BLEND(a,b,alpha,tmp) (INT_MULT((a)-(b), alpha, tmp) + (b)) - typedef enum { MinifyX_MinifyY, @@ -153,7 +152,41 @@ static void apply_layer_mode_replace (guchar *src1, gboolean *affect); static void rotate_pointers (gpointer *p, guint32 n); +/* MMX stuff */ +extern gboolean use_mmx; +#define USE_GCC_INTEL_MMX + +#ifdef USE_GCC_INTEL_MMX +extern int use_mmx; + +#define MMX_PIXEL_OP(x) \ +void \ +x( \ + const unsigned char *src1, \ + const unsigned char *src2, \ + unsigned count, \ + unsigned char *dst) __attribute((regparm(3))); + +#define MMX_PIXEL_OP_3A_1A(op) \ + MMX_PIXEL_OP(op##_pixels_3a_3a) \ + MMX_PIXEL_OP(op##_pixels_1a_1a) + +#define USE_MMX_PIXEL_OP_3A_1A(op) \ + if (use_mmx && has_alpha1 && has_alpha2) \ + { \ + if (bytes1==2 && bytes2==2) \ + return op##_pixels_1a_1a(src1, src2, length, dest); \ + if (bytes1==4 && bytes2==4) \ + return op##_pixels_3a_3a(src1, src2, length, dest); \ + } \ + /*fprintf(stderr, "non-MMX: %s(%d, %d, %d, %d)\n", #op, \ + bytes1, bytes2, has_alpha1, has_alpha2);*/ +#else + +#define MMX_PIXEL_OP_3A_1A(op) +#define USE_MMX_PIXEL_OP_3A_1A(op) +#endif void @@ -715,6 +748,7 @@ extract_alpha_pixels (const guchar *src, } } +MMX_PIXEL_OP_3A_1A(darken) void darken_pixels (const guchar *src1, const guchar *src2, @@ -728,6 +762,8 @@ darken_pixels (const guchar *src1, gint b, alpha; guchar s1, s2; + USE_MMX_PIXEL_OP_3A_1A(darken) + alpha = (has_alpha1 || 
has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length--) @@ -750,7 +786,7 @@ darken_pixels (const guchar *src1, } } - +MMX_PIXEL_OP_3A_1A(lighten) void lighten_pixels (const guchar *src1, const guchar *src2, @@ -764,6 +800,8 @@ lighten_pixels (const guchar *src1, gint b, alpha; guchar s1, s2; + USE_MMX_PIXEL_OP_3A_1A(lighten) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length--) @@ -881,6 +919,7 @@ color_only_pixels (const guchar *src1, } } +MMX_PIXEL_OP_3A_1A(multiply) void multiply_pixels (const guchar *src1, const guchar *src2, @@ -894,6 +933,8 @@ multiply_pixels (const guchar *src1, gint alpha, b; gint tmp; + USE_MMX_PIXEL_OP_3A_1A(multiply) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; if (has_alpha1 && has_alpha2) @@ -973,6 +1014,8 @@ divide_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(screen) + void screen_pixels (const guchar *src1, const guchar *src2, @@ -986,6 +1029,8 @@ screen_pixels (const guchar *src1, gint alpha, b; gint tmp; + USE_MMX_PIXEL_OP_3A_1A(screen) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length --) @@ -1005,6 +1050,8 @@ screen_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(overlay) + void overlay_pixels (const guchar *src1, const guchar *src2, @@ -1153,6 +1200,8 @@ hardlight_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(add) + void add_pixels (const guchar *src1, const guchar *src2, @@ -1165,6 +1214,8 @@ add_pixels (const guchar *src1, { gint alpha, b; + USE_MMX_PIXEL_OP_3A_1A(add) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length --) @@ -1187,6 +1238,8 @@ add_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(substract) + void subtract_pixels (const guchar *src1, const guchar *src2, @@ -1200,6 +1253,8 @@ subtract_pixels (const guchar *src1, gint alpha, b; gint diff; + USE_MMX_PIXEL_OP_3A_1A(substract) + alpha = (has_alpha1 || has_alpha2) ? 
MAX (bytes1, bytes2) - 1 : bytes1; while (length --) @@ -1222,6 +1277,8 @@ subtract_pixels (const guchar *src1, } +MMX_PIXEL_OP_3A_1A(difference) + void difference_pixels (const guchar *src1, const guchar *src2, @@ -1235,6 +1292,8 @@ difference_pixels (const guchar *src1, gint alpha, b; gint diff; + USE_MMX_PIXEL_OP_3A_1A(difference) + alpha = (has_alpha1 || has_alpha2) ? MAX (bytes1, bytes2) - 1 : bytes1; while (length --) diff --git a/app/paint_funcs_simd.S b/app/paint_funcs_simd.S new file mode 100644 index 0000000000..81db2b1a4b --- /dev/null +++ b/app/paint_funcs_simd.S @@ -0,0 +1,5 @@ +#include "config.h" + +#ifdef HAVE_ASM_MMX +#include +#endif \ No newline at end of file diff --git a/configure.in b/configure.in index 0179b35c6d..9cfaf2a462 100644 --- a/configure.in +++ b/configure.in @@ -160,6 +160,50 @@ CPPFLAGS="$CPPFLAGS $GTK_CFLAGS" LDFLAGS="$LDFLAGS `echo $GTK_LIBS | sed 's/\(.*\)\(-lgtk.*\)/\1/'`" LIBS="$LIBS $GTK_LIBS" +dnl Test for MMX stuff +have_asm_mmx=false +AC_MSG_CHECKING([for Intel Pentium architecture (IA32)]) +if test "$host_cpu" = "i386" -o "$host_cpu" = "i486"\ + -o "$host_cpu" = "i586" -o "$host_cpu" = "i586"\ + -o "$host_cpu" = "i686" -o "$host_cpu" = "i786" ; +then + AC_MSG_RESULT(yes) + AC_MSG_CHECKING([for support for gcc-style register parameters on Intel]) + AC_TRY_COMPILE([], + [extern void x( + const unsigned char *src1, + const unsigned char *src2, + unsigned count, + unsigned char *dst) __attribute((regparm(3)));], + + [AC_MSG_RESULT(yes) + + AC_MSG_CHECKING([for support for MMX in assembly code]) + cat > conftest.S <&AC_FD_CC + cat conftest.S >&AC_FD_CC + rm -rf conftest.* ; + fi + ], + [AC_MSG_RESULT(no) + AC_MSG_WARN(*** C compiler does not support __attribute((regparm(3))), MMX code will not be built)]); +else + AC_MSG_RESULT(no) ; +fi +AM_CONDITIONAL(HAVE_ASM_MMX, test x$have_asm_mmx = xtrue) + dnl Test for Xmu if test -z "$LIBXMU"; then AC_CHECK_LIB(Xmu, XmuClientWindow,