/* gimp/app/composite/gimp-composite-x86.h */
/* GIMP - The GNU Image Manipulation Program
* Copyright (C) 1995 Spencer Kimball and Peter Mattis
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#if __GNUC__ >= 3
/*
 * mmx_low_bytes_to_words(src,dst,zero)
 *
 * Unpack the low four bytes of MMX register src into four 16-bit words
 * in dst.  zero must be an MMX register holding all zeroes; it supplies
 * the high byte of each resulting word.  src is left unmodified.
 */
#define mmx_low_bytes_to_words(src,dst,zero) \
"\tmovq %%"#src", %%"#dst"\n" \
"\tpunpcklbw %%"#zero", %%"#dst"\n"
/*
 * mmx_high_bytes_to_words(src,dst,zero)
 *
 * Unpack the high four bytes of MMX register src into four 16-bit words
 * in dst.  zero must be an MMX register holding all zeroes; it supplies
 * the high byte of each resulting word.  src is left unmodified.
 */
#define mmx_high_bytes_to_words(src,dst,zero) \
"\tmovq %%"#src", %%"#dst"\n" \
"\tpunpckhbw %%"#zero", %%"#dst"\n"
/*
 * xmm_low_bytes_to_words(src,dst,zero)
 *
 * SSE2 variant: unpack the low eight bytes of XMM register src into
 * eight 16-bit words in dst.  zero must be an XMM register holding all
 * zeroes.  src is left unmodified.
 */
#define xmm_low_bytes_to_words(src,dst,zero) \
"\tmovdqu %%"#src", %%"#dst"; " \
"\tpunpcklbw %%"#zero", %%"#dst"\n"
/*
 * xmm_high_bytes_to_words(src,dst,zero)
 *
 * SSE2 variant: unpack the high eight bytes of XMM register src into
 * eight 16-bit words in dst.  zero must be an XMM register holding all
 * zeroes.  src is left unmodified.
 */
#define xmm_high_bytes_to_words(src,dst,zero) \
"\tmovdqu %%"#src", %%"#dst"; " \
"\tpunpckhbw %%"#zero", %%"#dst"\n"
/*
 * mmx_int_mult(a,b,w128)
 *
 * Per-word INT_MULT: for each of the four 16-bit words computes the
 * round-to-nearest approximation of a*b/255 as
 *     t = a*b + 128;  result = ((t >> 8) + t) >> 8
 * w128 must be an MMX register holding 0x0080 in every word.
 *
 * NOTE(review): despite the old "a = INT_MULT(a,b)" comment, tracing
 * the instructions shows the result is left in b, and a is clobbered
 * (it ends up holding t).  Verify against callers.
 */
#define mmx_int_mult(a,b,w128) \
"\tpmullw %%"#b", %%"#a"; " \
"\tpaddw %%"#w128", %%"#a"; " \
"\tmovq %%"#a", %%"#b"; " \
"\tpsrlw $8, %%"#b"; " \
"\tpaddw %%"#a", %%"#b"; " \
"\tpsrlw $8, %%"#b"\n"
/*
 * sse2_int_mult(a,b,w128)
 *
 * SSE2 variant of mmx_int_mult, operating on eight 16-bit words per
 * XMM register: result = ((t >> 8) + t) >> 8 with t = a*b + 128, the
 * round-to-nearest approximation of a*b/255.  w128 must hold 0x0080 in
 * every word.  Result is left in b; a is clobbered.
 */
#define sse2_int_mult(a,b,w128) \
"\tpmullw %%"#b", %%"#a"; " \
"\tpaddw %%"#w128", %%"#a"; " \
"\tmovdqu %%"#a", %%"#b"; " \
"\tpsrlw $8, %%"#b"; " \
"\tpaddw %%"#a", %%"#b"; " \
"\tpsrlw $8, %%"#b"\n"
/*
 * pdivwX(dividend,divisor,quotient)
 *
 * Divide the two 16-bit words in the low dword of MMX register
 * `dividend` by the corresponding words of `divisor` (two independent
 * "divw" divisions, each with the high word of the dividend zeroed).
 * The two 16-bit quotients are stored in the low dword of `quotient`
 * (movd zeroes its upper half).  The high-order bit of each quotient
 * word is cleared ("btr $15") so a subsequent *signed* pack treats the
 * values as non-negative — i.e. adjusted for unsigned packing.
 *
 * Clobbers eax, ecx, edx and the flags.
 * Assumes each divisor word is nonzero and each quotient fits in
 * 16 bits, otherwise "divw" raises #DE.
 */
#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax\n" \
"movd %%" #divisor ",%%ecx\n" \
"xorl %%edx,%%edx\n" \
"divw %%cx\n" \
"roll $16, %%eax\n" \
"roll $16, %%ecx\n" \
"xorl %%edx,%%edx\n" \
"divw %%cx\n" \
"btr $15, %%eax\n" \
"roll $16, %%eax\n" \
"btr $15, %%eax\n" \
"movd %%eax,%%" #quotient "\n"
/*
 * pdivwqX(dividend,divisor,quotient)
 *
 * Divide all four 16-bit words of MMX register `dividend` by the
 * corresponding words of `divisor`, leaving the four quotient words in
 * `quotient`.  The low dword pair is divided first and stored via movd;
 * both operands are then shifted right 32 bits and the high pair is
 * divided, assembled in `divisor`, shifted back up and or-ed in.
 * No unsigned-packing adjustment: the high-order bit of each word is
 * left alone (contrast pdivwX/pdivwuqX).
 *
 * Destroys dividend and divisor; clobbers eax, ecx, edx and the flags —
 * the asm clobber list must include pdivwqX_clobber plus the divisor
 * and quotient registers.
 * Assumes nonzero divisor words and quotients that fit in 16 bits.
 */
#define pdivwqX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"movd %%eax,%%" #quotient "; " \
"psrlq $32,%%" #dividend ";" \
"psrlq $32,%%" #divisor ";" \
"movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"movd %%eax,%%" #divisor ";" \
"psllq $32,%%" #divisor ";" \
"por %%" #divisor ",%%" #quotient ";"
/* Clobber list for asm() statements expanding pdivwqX. */
#define pdivwqX_clobber "%eax", "%ecx", "%edx", "%cc"
/*
 * pdivwuqX(dividend,divisor,quotient)
 *
 * Quadword per-word divide like pdivwqX, but built from two pdivwX
 * expansions so the high-order bit of each quotient word is cleared
 * (adjusted for subsequent unsigned packing).  Destroys dividend and
 * divisor; clobbers eax, ecx, edx and flags (pdivwuqX_clobber).
 *
 * NOTE(review): pdivwX ends with "movd %%eax,%%quotient", so the second
 * pdivwX expansion here overwrites the low-half result stored by the
 * first one *before* it is merged — the final por then appears to leave
 * the high-half quotients duplicated in both dword lanes.  Contrast
 * pdivwqX, which routes its second half through `divisor` instead.
 * Suspected latent bug; cannot be fixed within this 3-register
 * interface (no scratch register) — verify against actual callers.
 */
#define pdivwuqX(dividend,divisor,quotient) \
pdivwX(dividend,divisor,quotient) \
"psrlq $32,%%" #dividend ";" \
"psrlq $32,%%" #divisor ";" \
pdivwX(dividend,divisor,quotient) \
"movd %%eax,%%" #divisor ";" \
"psllq $32,%%" #divisor ";" \
"por %%" #divisor ",%%" #quotient ";"
/* Clobber list for asm() statements expanding pdivwuqX. */
#define pdivwuqX_clobber pdivwqX_clobber
/*
 * xmm_pdivwqX(dividend,divisor,quotient,scratch)
 *
 * Intended SSE2 analogue of pdivwqX: divide all eight 16-bit words of
 * XMM register `dividend` by those of `divisor`, accumulating dword
 * pairs of quotients into `quotient` via `scratch`.  No
 * unsigned-packing adjustment.  Destroys dividend, divisor and
 * scratch; clobbers eax, ecx, edx and the flags.
 *
 * NOTE(review): on XMM registers PSRLQ/PSLLQ shift each 64-bit lane
 * independently, and shift counts of 64 or more clear the lane
 * entirely.  Consequences as written: "psllq $64"/"psllq $96" zero
 * `scratch` before the final por steps, and after the second
 * "psrlq $32" both source lanes are already zero, so the later movd
 * reads 0 and "divw" would fault on a zero divisor.  This macro looks
 * unusable as written (probably never instantiated); a working version
 * would need pshufd or pslldq/psrldq byte shifts.  Verify before use.
 */
#define xmm_pdivwqX(dividend,divisor,quotient,scratch) "movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"movd %%eax,%%" #quotient "; " \
"psrlq $32,%%" #divisor ";" \
"psrlq $32,%%" #dividend ";" \
"movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"movd %%eax,%%" #scratch ";" \
"psllq $32,%%" #scratch ";" \
"psrlq $32,%%" #divisor ";" \
"psrlq $32,%%" #dividend ";" \
"movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"movd %%eax,%%" #scratch ";" \
"psllq $64,%%" #scratch ";" \
"psrlq $32,%%" #divisor ";" \
"psrlq $32,%%" #dividend ";" \
"movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"movd %%eax,%%" #scratch ";" \
"psllq $96,%%" #scratch ";" \
"por %%" #scratch ",%%" #quotient ";"
/*
 * xmm_pdivwX(dividend,divisor,quotient)
 *
 * SSE2 naming of the pdivwX sequence: divide the two 16-bit words in
 * the low dword of `dividend` by those of `divisor`, store the two
 * quotients in the low dword of `quotient` (movd zeroes the rest of
 * the register), clearing bit 15 of each quotient word for subsequent
 * unsigned packing.  Clobbers eax, ecx, edx and the flags.  Assumes
 * nonzero divisor words and quotients that fit in 16 bits.
 */
#define xmm_pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"btr $15, %%eax; " \
"roll $16, %%eax; " \
"btr $15, %%eax; " \
"movd %%eax,%%" #quotient ";"
/*
 * xmm_pdivwuqX(dividend,divisor,quotient,scratch)
 *
 * Intended SSE2 analogue of pdivwuqX: per-word divide across all eight
 * words of an XMM register, with bit 15 of each quotient cleared for
 * unsigned packing.  Destroys dividend, divisor and scratch; clobbers
 * eax, ecx, edx and the flags.
 *
 * NOTE(review): several apparent problems as written —
 *  - "movd %%scratch,%%quotient" with two XMM operands is not a valid
 *    movd form (gpr/mem <-> xmm only); movq was probably intended.
 *  - PSRLQ/PSLLQ shift each 64-bit XMM lane independently, and counts
 *    of 64 or more clear the lane, so "psllq $64"/"psllq $96" zero
 *    `scratch`, and after the second "psrlq $32" the source lanes are
 *    already zero (a zero divisor would make "divw" fault).
 * Like xmm_pdivwqX, this looks unusable as written and is probably
 * never instantiated; verify against callers before relying on it.
 */
#define xmm_pdivwuqX(dividend,divisor,quotient,scratch) \
xmm_pdivwX(dividend,divisor,scratch) \
"movd %%"#scratch ",%%"#quotient ";" \
"psrlq $32,%%"#dividend ";" \
"psrlq $32,%%"#divisor ";" \
xmm_pdivwX(dividend,divisor,scratch) \
"psllq $32,%%"#scratch ";" \
"por %%"#scratch ",%%"#quotient ";" \
"psrlq $32,%%"#dividend ";" \
"psrlq $32,%%"#divisor ";" \
xmm_pdivwX(dividend,divisor,scratch) \
"psllq $64,%%"#scratch ";" \
"por %%"#scratch ",%%"#quotient ";" \
"psrlq $32,%%"#dividend ";" \
"psrlq $32,%%"#divisor ";" \
xmm_pdivwX(dividend,divisor,scratch) \
"psllq $96,%%"#scratch ";" \
"por %%"#scratch ",%%"#quotient
/* equivalent to the INT_MULT() macro in gimp-composite-generic.c */
/*
 * pmulwX(opr1,opr2,w128)
 *
 * Per-word multiply with /255 normalization, across the four 16-bit
 * words of an MMX register.  Result is left in opr2; opr1 is clobbered
 * (it ends up holding the intermediate t):
 *
 *   opr1 = opr1 * opr2 + w128          (t = a*b + 128)
 *   opr2 = opr1
 *   opr2 = ((opr2 >> 8) + opr1) >> 8   (round-to-nearest a*b/255)
 *
 * w128 must be an MMX register holding 0x0080 in every word.
 */
#define pmulwX(opr1,opr2,w128) \
"\tpmullw %%"#opr2", %%"#opr1"; " \
"\tpaddw %%"#w128", %%"#opr1"; " \
"\tmovq %%"#opr1", %%"#opr2"; " \
"\tpsrlw $8, %%"#opr2"; " \
"\tpaddw %%"#opr1", %%"#opr2"; " \
"\tpsrlw $8, %%"#opr2"\n"
/*
 * xmm_pmulwX(opr1,opr2,w128)
 *
 * SSE2 variant of pmulwX, operating on the eight 16-bit words of an
 * XMM register: opr2 = ((t >> 8) + t) >> 8 with t = opr1*opr2 + 128,
 * the round-to-nearest approximation of opr1*opr2/255.  Result is left
 * in opr2; opr1 is clobbered.  w128 must hold 0x0080 in every word.
 */
#define xmm_pmulwX(opr1,opr2,w128) \
"\tpmullw %%"#opr2", %%"#opr1"; " \
"\tpaddw %%"#w128", %%"#opr1"; " \
"\tmovdqu %%"#opr1", %%"#opr2"; " \
"\tpsrlw $8, %%"#opr2"; " \
"\tpaddw %%"#opr1", %%"#opr2"; " \
"\tpsrlw $8, %%"#opr2"\n"
/*
 * Fixed-width integer aliases used by the x86 composite code.
 *
 * uint32 was "unsigned long", which is 64 bits on LP64 platforms
 * (e.g. x86-64 Linux/macOS) and so did not match its intended 32-bit
 * width; "unsigned int" is 32 bits on every supported x86 ABI.
 */
typedef unsigned char uint8;             /* 8-bit  pixel channel        */
typedef unsigned short uint16;           /* 16-bit word                 */
typedef unsigned int uint32;             /* 32-bit dword (was "long")   */
typedef unsigned long long uint64;       /* 64-bit quadword (MMX)       */
/* 128-bit XMM-sized value, as two quadwords; the reserved-looking
 * member name __uint64 is kept for source compatibility. */
typedef struct { uint64 __uint64[2]; } uint128;
/*
 * 64-bit constant operands for the MMX code paths, each stored as two
 * 32-bit halves and defined in the corresponding .c file — presumably
 * gimp-composite-x86.c; the names suggest the 8-bit value+alpha ("va8")
 * pixel format (alpha mask, byte 255, word 1, word 255).  Confirm the
 * exact values against the defining translation unit.
 */
extern const guint32 va8_alpha_mask[2];
extern const guint32 va8_b255[2];
extern const guint32 va8_w1[2];
extern const guint32 va8_w255[2];
#endif /* __GNUC__ >= 3 */