gimp/app/composite/gimp-composite-x86.h

/* -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*-
 *
 * The GIMP -- an image manipulation program
 * Copyright (C) 1995 Spencer Kimball and Peter Mattis
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#if __GNUC__ >= 3

/*
 * Convert the low 8bit bytes of src to 16bit words in dst.
 *
 * Expands to a fragment of an extended-asm template (register names are
 * emitted with the "%%" prefix).  The fragment copies src to dst with
 * movq and then interleaves the low-order bytes of dst with the
 * low-order bytes of the `zero' register (punpcklbw).
 *
 * src is not modified, dst is overwritten.  The result is a
 * zero-extension only if the register passed as `zero' really holds
 * all-zero bits; this macro does not clear it.
 */
#define mmx_low_bytes_to_words(src,dst,zero) \
         "\tmovq      %%"#src", %%"#dst"; " \
         "\tpunpcklbw %%"#zero", %%"#dst"\n"

/*
 * Convert the high 8bit bytes of src to 16bit words in dst.
 *
 * Same as the low-byte variant, but punpckhbw interleaves the
 * high-order bytes of dst (a copy of src) with the high-order bytes of
 * the `zero' register.
 *
 * src is not modified, dst is overwritten.  The caller must make sure
 * the register passed as `zero' holds all-zero bits.
 */
#define mmx_high_bytes_to_words(src,dst,zero) \
         "\tmovq      %%"#src", %%"#dst"; " \
         "\tpunpckhbw %%"#zero", %%"#dst"\n"

/*
 * 128-bit counterpart of mmx_low_bytes_to_words: copies src to dst
 * with movdqu, then interleaves the low-order bytes of dst with the
 * low-order bytes of the `zero' register (punpcklbw).
 *
 * src is not modified, dst is overwritten.  The caller must make sure
 * the register passed as `zero' holds all-zero bits.
 */
#define xmm_low_bytes_to_words(src,dst,zero) \
         "\tmovdqu     %%"#src", %%"#dst"; " \
         "\tpunpcklbw %%"#zero", %%"#dst"\n"

/*
 * 128-bit counterpart of mmx_high_bytes_to_words: copies src to dst
 * with movdqu, then interleaves the high-order bytes of dst with the
 * high-order bytes of the `zero' register (punpckhbw).
 *
 * src is not modified, dst is overwritten.  The caller must make sure
 * the register passed as `zero' holds all-zero bits.
 */
#define xmm_high_bytes_to_words(src,dst,zero) \
         "\tmovdqu     %%"#src", %%"#dst"; " \
         "\tpunpckhbw %%"#zero", %%"#dst"\n"

/*
 * b = INT_MULT(a,b), computed independently on every 16bit word.
 *
 * Note that the result is left in b, not in a:
 *
 *   a = a * b + w128        (low 16 bits of each product are kept)
 *   b = ((a >> 8) + a) >> 8
 *
 * a is clobbered (it holds the intermediate a * b + w128 on exit).
 * w128 is only read; presumably it holds 128 in every word --
 * TODO confirm against the callers.
 */
#define mmx_int_mult(a,b,w128) \
                  "\tpmullw    %%"#b",    %%"#a"; " \
                  "\tpaddw     %%"#w128", %%"#a"; " \
                  "\tmovq      %%"#a",    %%"#b"; " \
                  "\tpsrlw     $8,        %%"#b"; " \
                  "\tpaddw     %%"#a",    %%"#b"; " \
                  "\tpsrlw     $8,        %%"#b"\n"

/*
 * Same instruction sequence as mmx_int_mult, except that the register
 * copy uses movdqu instead of movq:
 *
 *   a = a * b + w128        (low 16 bits of each product are kept)
 *   b = ((a >> 8) + a) >> 8
 *
 * The result is left in b; a is clobbered; w128 is only read.
 */
#define sse2_int_mult(a,b,w128) \
                  "\tpmullw    %%"#b",    %%"#a"; " \
                  "\tpaddw     %%"#w128", %%"#a"; " \
                  "\tmovdqu    %%"#a",    %%"#b"; " \
                  "\tpsrlw     $8,        %%"#b"; " \
                  "\tpaddw     %%"#a",    %%"#b"; " \
                  "\tpsrlw     $8,        %%"#b"\n"

/*
 * Double-word divide.  Adjusted for subsequent unsigned packing
 * (high-order bit of each word is cleared)
 * Clobbers eax, ecx edx
 *
 * Divides the two 16bit words held in the low 32 bits of `dividend' by
 * the corresponding words of `divisor' using the scalar divw
 * instruction:
 *
 *   - both low dwords are moved to eax/ecx and the low words divided;
 *   - eax and ecx are rotated by 16 so the high words can be divided;
 *   - bit 15 of each quotient word is cleared with btr, and eax is
 *     rotated back so the words are in their original order;
 *   - eax is moved into `quotient' with movd.
 *
 * edx is zeroed before each divw, so every division is 16bit / 16bit.
 * divw raises a divide error if a divisor word is zero; nothing here
 * guards against that.  The condition flags are modified as well.
 */
#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
                                          "movd %%" #divisor  ",%%ecx; " \
                                          "xorl %%edx,%%edx; "           \
                                          "divw %%cx; "                  \
                                          "roll $16, %%eax; "            \
                                          "roll $16, %%ecx; "            \
                                          "xorl %%edx,%%edx; "           \
                                          "divw %%cx; "                  \
                                          "btr $15, %%eax; "             \
                                          "roll $16, %%eax; "            \
                                          "btr $15, %%eax; "             \
                                          "movd %%eax,%%" #quotient ";"


/*
 * Quadword divide.  No adjustment for subsequent unsigned packing
 * (high-order bit of each word is left alone)
 * clobber list must include: "%eax", "%ecx", "%edx", divisor quotient
 *
 * Divides the four 16bit words of `dividend' by the corresponding
 * words of `divisor', one word at a time with the scalar divw
 * instruction, and assembles the four quotients in `quotient'.
 *
 * The low dword is processed first and stored in `quotient' with movd.
 * Both inputs are then shifted right by 32 bits, the high dword is
 * processed the same way, and the result is moved into `divisor',
 * shifted back up by 32 bits and OR-ed into `quotient'.
 *
 * Consequently `dividend' and `divisor' are both destroyed, and
 * `quotient' has to be a register distinct from both of them.
 * divw raises a divide error if a divisor word is zero.
 *
 * pdivwqX_clobber lists the general purpose registers and flags that
 * the sequence modifies, for use in the asm clobber list.
 */
#define pdivwqX(dividend,divisor,quotient) "movd   %%" #dividend ",%%eax; " \
                                          "movd   %%" #divisor  ",%%ecx; " \
                                          "xorl   %%edx,%%edx; "           \
                                          "divw   %%cx; "                  \
                                          "roll   $16, %%eax; "            \
                                          "roll   $16, %%ecx; "            \
                                          "xorl   %%edx,%%edx; "           \
                                          "divw   %%cx; "                  \
                                          "roll   $16, %%eax; "            \
                                          "movd   %%eax,%%" #quotient "; " \
                                          "psrlq $32,%%" #dividend ";"     \
                                          "psrlq $32,%%" #divisor ";"      \
                                          "movd   %%" #dividend ",%%eax; " \
                                          "movd   %%" #divisor  ",%%ecx; " \
                                          "xorl   %%edx,%%edx; "           \
                                          "divw   %%cx; "                  \
                                          "roll   $16, %%eax; "            \
                                          "roll   $16, %%ecx; "            \
                                          "xorl   %%edx,%%edx; "           \
                                          "divw   %%cx; "                  \
                                          "roll   $16, %%eax; "            \
                                          "movd   %%eax,%%" #divisor ";"   \
                                          "psllq  $32,%%" #divisor ";"     \
                                          "por    %%" #divisor ",%%" #quotient ";"
#define pdivwqX_clobber "%eax", "%ecx", "%edx", "%cc"

/*
 * Quadword divide.  Adjusted for subsequent unsigned packing
 * (high-order bit of each word is cleared)
 *
 * Divides the four 16bit words of `dividend' by the corresponding
 * words of `divisor' and leaves the four quotients, each with bit 15
 * cleared, in `quotient'.
 *
 * The low dword is divided with pdivwX straight into `quotient'.  Both
 * inputs are then shifted right by 32 bits and the high dword is
 * divided into `divisor' -- NOT into `quotient', which would overwrite
 * the low-dword result that was just stored there.  pdivwX reads its
 * divisor before it writes its destination, so reusing `divisor' as
 * the destination is safe.  Finally the high result is shifted back up
 * by 32 bits and OR-ed into `quotient'.
 *
 * `dividend' and `divisor' are destroyed; `quotient' must be a register
 * distinct from both.  eax, ecx, edx and the flags are clobbered (see
 * pdivwuqX_clobber).  divw raises a divide error if a divisor word is
 * zero.
 */
#define pdivwuqX(dividend,divisor,quotient) \
                                          pdivwX(dividend,divisor,quotient) \
                                            "psrlq  $32,%%" #dividend ";"   \
                                            "psrlq  $32,%%" #divisor ";"    \
                                          pdivwX(dividend,divisor,divisor)  \
                                            "psllq  $32,%%" #divisor ";"    \
                                            "por    %%" #divisor ",%%" #quotient ";"
#define pdivwuqX_clobber pdivwqX_clobber

/*
 * Helper for xmm_pdivwqX: divides the two 16bit words held in the low
 * 32 bits of `dividend' by the corresponding words of `divisor' with
 * the scalar divw instruction and leaves both quotients, in their
 * original word order, in eax.  Clobbers eax, ecx, edx and the flags.
 */
#define xmm_pdivwqX_dword(dividend,divisor) \
                  "movd   %%" #dividend ",%%eax; " \
                  "movd   %%" #divisor  ",%%ecx; " \
                  "xorl   %%edx,%%edx; "           \
                  "divw   %%cx; "                  \
                  "roll   $16, %%eax; "            \
                  "roll   $16, %%ecx; "            \
                  "xorl   %%edx,%%edx; "           \
                  "divw   %%cx; "                  \
                  "roll   $16, %%eax; "

/*
 * Double-quadword divide.  No adjustment for subsequent unsigned
 * packing (high-order bit of each word is left alone).
 *
 * Divides the eight 16bit words of the 128-bit register `dividend' by
 * the corresponding words of `divisor' and assembles the eight
 * quotients in `quotient'.
 *
 * The register is processed one dword at a time.  Moving on to the
 * next dword and placing a partial result both have to shift across
 * the whole 128-bit register, so the byte shifts psrldq/pslldq are
 * used: psrlq/psllq shift the two 64-bit halves independently (and
 * yield zero for counts above 63), which would never expose the upper
 * two dwords and would discard their results.  Every partial result is
 * OR-ed into `quotient' before `scratch' is reused.
 *
 * `dividend', `divisor' and `scratch' are destroyed; `quotient' must be
 * distinct from all three.  eax, ecx, edx and the flags are clobbered.
 * divw raises a divide error if a divisor word is zero.
 */
#define xmm_pdivwqX(dividend,divisor,quotient,scratch) \
                  xmm_pdivwqX_dword(dividend,divisor)           \
                  "movd   %%eax,%%" #quotient "; "              \
                  "psrldq $4,%%" #divisor "; "                  \
                  "psrldq $4,%%" #dividend "; "                 \
                  xmm_pdivwqX_dword(dividend,divisor)           \
                  "movd   %%eax,%%" #scratch "; "               \
                  "pslldq $4,%%" #scratch "; "                  \
                  "por    %%" #scratch ",%%" #quotient "; "     \
                  "psrldq $4,%%" #divisor "; "                  \
                  "psrldq $4,%%" #dividend "; "                 \
                  xmm_pdivwqX_dword(dividend,divisor)           \
                  "movd   %%eax,%%" #scratch "; "               \
                  "pslldq $8,%%" #scratch "; "                  \
                  "por    %%" #scratch ",%%" #quotient "; "     \
                  "psrldq $4,%%" #divisor "; "                  \
                  "psrldq $4,%%" #dividend "; "                 \
                  xmm_pdivwqX_dword(dividend,divisor)           \
                  "movd   %%eax,%%" #scratch "; "               \
                  "pslldq $12,%%" #scratch "; "                 \
                  "por    %%" #scratch ",%%" #quotient ";"

/*
 * Double-word divide on the low 32 bits of a register, adjusted for
 * subsequent unsigned packing (bit 15 of each quotient word is
 * cleared).  The instruction sequence is identical to pdivwX.
 *
 * The two 16bit words in the low dword of `dividend' are divided by the
 * corresponding words of `divisor' with the scalar divw instruction
 * (edx is zeroed before each divide), and the pair of quotients is
 * moved into `quotient' with movd.
 *
 * Clobbers eax, ecx, edx and the flags.  divw raises a divide error if
 * a divisor word is zero.
 */
#define xmm_pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
                                              "movd %%" #divisor  ",%%ecx; " \
                                              "xorl %%edx,%%edx; "           \
                                              "divw %%cx; "                  \
                                              "roll $16, %%eax; "            \
                                              "roll $16, %%ecx; "            \
                                              "xorl %%edx,%%edx; "           \
                                              "divw %%cx; "                  \
                                              "btr $15, %%eax; "             \
                                              "roll $16, %%eax; "            \
                                              "btr $15, %%eax; "             \
                                              "movd %%eax,%%" #quotient ";"

/*
 * Double-quadword divide.  Adjusted for subsequent unsigned packing
 * (high-order bit of each word is cleared).
 *
 * Divides the eight 16bit words of the 128-bit register `dividend' by
 * the corresponding words of `divisor' and leaves the eight quotients,
 * each with bit 15 cleared, in `quotient'.
 *
 * The first dword is divided straight into `quotient' (the movd at the
 * end of xmm_pdivwX zeroes the rest of the register; there is no
 * register-to-register form of movd that could copy it from
 * `scratch').  The remaining three dwords are divided into `scratch',
 * moved to their final position and OR-ed into `quotient'.
 *
 * All repositioning uses the byte shifts psrldq/pslldq, which operate
 * on the whole 128-bit register.  psrlq/psllq shift the two 64-bit
 * halves independently and produce zero for counts above 63, so they
 * cannot bring the upper dwords down or place results above bit 63.
 *
 * `dividend', `divisor' and `scratch' are destroyed; `quotient' must be
 * distinct from all three.  eax, ecx, edx and the flags are clobbered.
 * divw raises a divide error if a divisor word is zero.
 */
#define xmm_pdivwuqX(dividend,divisor,quotient,scratch) \
                                          xmm_pdivwX(dividend,divisor,quotient)     \
                                          "psrldq $4,%%"#dividend               ";" \
                                          "psrldq $4,%%"#divisor                ";" \
                                          xmm_pdivwX(dividend,divisor,scratch)      \
                                            "pslldq $4,%%"#scratch              ";" \
                                            "por    %%"#scratch ",%%"#quotient  ";" \
                                          "psrldq $4,%%"#dividend               ";" \
                                          "psrldq $4,%%"#divisor                ";" \
                                          xmm_pdivwX(dividend,divisor,scratch)      \
                                            "pslldq $8,%%"#scratch              ";" \
                                            "por    %%"#scratch ",%%"#quotient  ";" \
                                          "psrldq $4,%%"#dividend               ";" \
                                          "psrldq $4,%%"#divisor                ";" \
                                          xmm_pdivwX(dividend,divisor,scratch)      \
                                            "pslldq $12,%%"#scratch             ";" \
                                            "por    %%"#scratch ",%%"#quotient  ";"

/* equivalent to the INT_MULT() macro in gimp-composite-generic.c */
/*
 * opr2 = INT_MULT(opr1, opr2, t)
 *
 * Operates across quad-words using x86 word (16bit) value.
 * Result is left in opr2
 *
 * opr1 = opr1 * opr2 + w128
 * opr2 = opr1
 * opr2 = ((opr2 >> 8) + opr1) >> 8
 *
 * The multiply is pmullw, so only the low 16 bits of each word product
 * are kept.  opr1 is clobbered: on exit it holds the intermediate
 * opr1 * opr2 + w128.  w128 is only read; presumably it holds 128 in
 * every word -- TODO confirm against the callers.
 */
#define pmulwX(opr1,opr2,w128) \
                  "\tpmullw    %%"#opr2", %%"#opr1"; " \
                  "\tpaddw     %%"#w128", %%"#opr1"; " \
                  "\tmovq      %%"#opr1", %%"#opr2"; " \
                  "\tpsrlw     $8,        %%"#opr2"; " \
                  "\tpaddw     %%"#opr1", %%"#opr2"; " \
                  "\tpsrlw     $8,        %%"#opr2"\n"

/*
 * Same instruction sequence as pmulwX, except that the register copy
 * uses movdqu instead of movq:
 *
 *   opr1 = opr1 * opr2 + w128   (low 16 bits of each product are kept)
 *   opr2 = ((opr1 >> 8) + opr1) >> 8
 *
 * The result is left in opr2; opr1 is clobbered; w128 is only read.
 */
#define xmm_pmulwX(opr1,opr2,w128) \
                  "\tpmullw    %%"#opr2", %%"#opr1"; " \
                  "\tpaddw     %%"#w128", %%"#opr1"; " \
                  "\tmovdqu    %%"#opr1", %%"#opr2"; " \
                  "\tpsrlw     $8,        %%"#opr2"; " \
                  "\tpaddw     %%"#opr1", %%"#opr2"; " \
                  "\tpsrlw     $8,        %%"#opr2"\n"

/*
 * Fixed-width unsigned integer shorthands.
 *
 * uint32 is `unsigned int', not `unsigned long': on LP64 targets such
 * as x86-64 `unsigned long' is 64 bits wide, which would make uint32
 * twice as large as its name promises.  `unsigned int' is 32 bits on
 * both the 32-bit and the 64-bit x86 ABIs.
 *
 * uint128 is a plain pair of 64-bit values (128 bits of storage).
 */
typedef unsigned char      uint8;
typedef unsigned short     uint16;
typedef unsigned int       uint32;
typedef unsigned long long uint64;
typedef struct { uint64 __uint64[2]; } uint128;

/*
 * Constant tables used by the x86 compositing code.  They are only
 * declared here; every table is an array of two guint32 values.
 * NOTE(review): the names suggest per-byte (b) and per-word (w)
 * patterns for RGBA8 and VA8 pixels -- confirm against the definitions.
 */
extern const guint32 rgba8_alpha_mask_64[2],
                     rgba8_b1_64[2],
                     rgba8_b255_64[2],
                     rgba8_w1_64[2],
                     rgba8_w2_64[2],
                     rgba8_w128_64[2],
                     rgba8_w256_64[2],
                     rgba8_w255_64[2];

extern const guint32 va8_alpha_mask[2],
                     va8_b255[2],
                     va8_w1[2],
                     va8_w255[2];
#endif /* __GNUC__ >= 3 */