Merged changes from trunk to your_branch:svn merge -r 5441:5455 https://subversion.assembla.com/svn/qmcdev/trunk

git-svn-id: https://subversion.assembla.com/svn/qmcdev/branches/OptBF@5458 e5b18d87-469d-4833-9cc0-8cdfa06e9491
This commit is contained in:
Jeremy McMinis 2012-03-24 21:10:38 +00:00
commit 2f9b51a9c2
156 changed files with 84700 additions and 41 deletions

View File

@ -457,11 +457,9 @@ INCLUDE_DIRECTORIES( ${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src)
# - if everything fails, do not use it
######################################################################
if(HAVE_EINSPLINE)
if(EINSPLINE_HOME)
SUBDIRS(src/einspline)
INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR}/include)
SET(QMC_UTIL_LIBS ${QMC_UTIL_LIBS} einspline)
endif()
else()
INCLUDE(${PROJECT_CMAKE}/FindEinspline.cmake)
if(EINSPLINE_FOUND)

48
config/RosaGNU.cmake Normal file
View File

@ -0,0 +1,48 @@
# Toolchain settings for a Cray "XK6" system using the Cray compiler
# wrappers (cc/CC) with GNU-style flags.  Everything below is hard-wired to
# one site's install paths and module versions.
SET(CMAKE_SYSTEM_PROCESSOR "XK6")
#2011-12-06
# Compiler wrappers from the xt-asyncpe 5.06 installation.
set(CMAKE_C_COMPILER /opt/cray/xt-asyncpe/5.06/bin/cc)
set(CMAKE_CXX_COMPILER /opt/cray/xt-asyncpe/5.06/bin/CC)
# Preprocessor definitions that are added to the C++ flags only (see below).
set(GNU_OPTS "-DADD_ -DINLINE_ALL=inline")
# Optimisation / OpenMP flags shared by the C and C++ compilers.
set(GNU_FLAGS "-fopenmp -O3 -Drestrict=__restrict__ -finline-limit=1000 -fstrict-aliasing -funroll-all-loops -Wno-deprecated ")
# Earlier XT_FLAGS variants, kept for reference.
#set(XT_FLAGS "-march=amdfam10 -msse3 -D_CRAYMPI")
#set(XT_FLAGS " -msse3 -D_CRAYMPI")
# Original note: "interlagos bdver1 but without it better".
# NOTE(review): -march=bdver1 is nevertheless enabled on the next line;
# confirm which variant is actually intended.
set(XT_FLAGS "-march=bdver1 -msse3 -D_CRAYMPI")
# Final flag strings: C++ additionally gets the template depth limit and
# GNU_OPTS; C is compiled as C99.
set(CMAKE_CXX_FLAGS "${XT_FLAGS} ${GNU_FLAGS} -ftemplate-depth-60 ${GNU_OPTS}")
set(CMAKE_C_FLAGS "${XT_FLAGS} ${GNU_FLAGS} -std=c99")
# Feature switches read elsewhere in the project's CMake files.
SET(QMC_BUILD_STATIC 1)
SET(ENABLE_OPENMP 1)
SET(HAVE_MPI 1)
SET(HAVE_SSE 1)
SET(HAVE_SSE2 1)
SET(HAVE_SSE3 1)
SET(HAVE_SSSE3 1)
SET(USE_PREFETCH 1)
SET(PREFETCH_AHEAD 12)
# find_program() ignores CMAKE_FIND_ROOT_PATH; find_library() and
# find_path()/find_file() search only inside it.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
# Force static linking: for every link type both the "static" and the
# "dynamic" link-mode flag variables are set to static options.
set(CMAKE_SHARED_LINKER_FLAGS "")
FOREACH(type SHARED_LIBRARY SHARED_MODULE EXE)
SET(CMAKE_${type}_LINK_STATIC_C_FLAGS "-Wl,-Bstatic")
SET(CMAKE_${type}_LINK_DYNAMIC_C_FLAGS "-static")
SET(CMAKE_${type}_LINK_STATIC_CXX_FLAGS "-Wl,-Bstatic")
SET(CMAKE_${type}_LINK_DYNAMIC_CXX_FLAGS "-static")
ENDFOREACH(type)
# Install prefixes searched for HDF5, FFTW, Boost and libxml2.
# NOTE(review): two different Boost prefixes are listed; presumably only one
# exists on any given machine.
set(CMAKE_FIND_ROOT_PATH
/opt/cray/hdf5/1.8.7/gnu/46
/opt/fftw/3.3.0.0/interlagos
/sw/xk6/boost/1.44.0/cle4.0_gnu4.5.3
/users/jnkim/xk6/libxml2
/apps/rosa/boost/1.47/gnu_453
)
# EINSPLINE_HOME is deliberately left unset here while HAVE_EINSPLINE is on.
#set(EINSPLINE_HOME /ccs/proj/mat034/jnkim/share/einspline)
set(HAVE_EINSPLINE 1)
set(HAVE_EINSPLINE_EXT 0)
# Link the static zlib archive into every target defined after this point.
link_libraries(/usr/lib64/libz.a)

View File

View File

@ -1817,7 +1817,7 @@ namespace qmcplusplus {
}
int psiIndex(0);
for (int j=0; j<NumValenceOrbs; j++) {
cerr<<psiIndex<<" "<<i<<" "<<j<<endl;
//cerr<<psiIndex<<" "<<i<<" "<<j<<endl;
psi(i,psiIndex)=real(StorageValueVector[j]);
for (int n=0; n<OHMMS_DIM; n++)
dpsi(i,psiIndex)[n] = real(StorageGradVector[j][n]);

View File

@ -46,7 +46,6 @@ namespace qmcplusplus
BackflowBuilder::~BackflowBuilder()
{
delete myHandler;
}
bool BackflowBuilder::put(xmlNodePtr cur)

View File

@ -25,71 +25,71 @@ SET(HFILES
bspline_base_cuda.h
)
FOREACH(a ${HFILES})
#INSTALL(FILES ${EINSPLINE_HOME}/src/${a} DESTINATION ${PROJECT_BINARY_DIR}/include/einspline)
configure_file(${EINSPLINE_HOME}/src/${a} ${PROJECT_BINARY_DIR}/include/einspline/${a} COPYONLY)
ENDFOREACH()
#FOREACH(a ${HFILES})
# #INSTALL(FILES ${EINSPLINE_HOME}/src/${a} DESTINATION ${PROJECT_BINARY_DIR}/include/einspline)
# configure_file(${EINSPLINE_HOME}/src/${a} ${PROJECT_BINARY_DIR}/include/einspline/${a} COPYONLY)
#ENDFOREACH()
set(SRCS )
SET(SRCS ${SRCS}
${EINSPLINE_HOME}/src/bspline_create.c
${EINSPLINE_HOME}/src/bspline_data.c
${EINSPLINE_HOME}/src/multi_bspline_create.c
${EINSPLINE_HOME}/src/multi_nubspline_create.c
${EINSPLINE_HOME}/src/nubspline_create.c
${EINSPLINE_HOME}/src/nubasis.c
${EINSPLINE_HOME}/src/nugrid.c
bspline_create.c
bspline_data.c
multi_bspline_create.c
multi_nubspline_create.c
nubspline_create.c
nubasis.c
nugrid.c
)
IF(HAVE_SSE)
SET(SRCS ${SRCS}
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_s.c
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_c.c
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_s_cpp.cc
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_c_cpp.cc
multi_bspline_eval_sse_s.c
multi_bspline_eval_sse_c.c
multi_bspline_eval_sse_s_cpp.cc
multi_bspline_eval_sse_c_cpp.cc
)
else()
SET(SRCS ${SRCS}
${EINSPLINE_HOME}/src/multi_bspline_eval_std_s.c
${EINSPLINE_HOME}/src/multi_bspline_eval_std_c.c
${EINSPLINE_HOME}/src/multi_bspline_eval_std_s_cpp.cc
${EINSPLINE_HOME}/src/multi_bspline_eval_std_c_cpp.cc
multi_bspline_eval_std_s.c
multi_bspline_eval_std_c.c
multi_bspline_eval_std_s_cpp.cc
multi_bspline_eval_std_c_cpp.cc
)
endif()
IF(HAVE_SSE2)
SET(SRCS ${SRCS}
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_d.c
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_z.c
${EINSPLINE_HOME}/src/multi_nubspline_eval_sse_z.c
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_d_cpp.cc
${EINSPLINE_HOME}/src/multi_bspline_eval_sse_z_cpp.cc
${EINSPLINE_HOME}/src/multi_nubspline_eval_sse_z_cpp.cc
multi_bspline_eval_sse_d.c
multi_bspline_eval_sse_z.c
multi_nubspline_eval_sse_z.c
multi_bspline_eval_sse_d_cpp.cc
multi_bspline_eval_sse_z_cpp.cc
multi_nubspline_eval_sse_z_cpp.cc
)
else()
SET(SRCS ${SRCS}
${EINSPLINE_HOME}/src/multi_bspline_eval_std_d.c
${EINSPLINE_HOME}/src/multi_bspline_eval_std_z.c
${EINSPLINE_HOME}/src/multi_nubspline_eval_std_z.c
${EINSPLINE_HOME}/src/multi_bspline_eval_std_d_cpp.cc
${EINSPLINE_HOME}/src/multi_bspline_eval_std_z_cpp.cc
${EINSPLINE_HOME}/src/multi_nubspline_eval_std_z_cpp.cc
multi_bspline_eval_std_d.c
multi_bspline_eval_std_z.c
multi_nubspline_eval_std_z.c
multi_bspline_eval_std_d_cpp.cc
multi_bspline_eval_std_z_cpp.cc
multi_nubspline_eval_std_z_cpp.cc
)
endif()
if(HAVE_CUDA)
SET(SRCS ${SRCS}
${EINSPLINE_HOME}/src/multi_bspline_create_cuda.cu
${EINSPLINE_HOME}/src/bspline_create_cuda.cu
multi_bspline_create_cuda.cu
bspline_create_cuda.cu
)
CUDA_ADD_LIBRARY(einspline ${SRCS})
else()
ADD_LIBRARY(einspline ${SRCS})
endif()
ADD_EXECUTABLE(time_multi ${EINSPLINE_HOME}/src/time_multi_new.c)
ADD_EXECUTABLE(time_multi time_multi_new.c)
target_link_libraries(time_multi einspline)
#add_dependencies(time_multi ${PROJECT_BINARY_HOME}/include/einspline/bspline.h)

844
src/einspline/TestBspline.c Normal file
View File

@ -0,0 +1,844 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "bspline.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
double drand48();
// Builds a single-precision 1D uniform spline from 11 fixed samples on
// [1, 3] (DERIV2 boundary codes with values +10 / -10) and writes x plus
// the val/grad/lapl outputs of eval_UBspline_1d_s_vgl to "1dSpline.dat",
// one row per 0.001 step in x.
// NOTE(review): the spline object is never released in this function.
void
Test_1d_s()
{
  Ugrid grid;
  grid.start = 1.0;
  grid.end   = 3.0;
  grid.num   = 11;
  float data[] = { 3.0, -4.0, 2.0, 1.0, -2.0, 0.0, 3.0, 2.0, 0.5, 1.0, 3.0 };
  BCtype_s bc;
  bc.lCode = DERIV2; bc.lVal = 10.0;
  bc.rCode = DERIV2; bc.rVal = -10.0;

  FILE *fout = fopen ("1dSpline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_1d_s: cannot open 1dSpline.dat for writing.\n");
    return;
  }

  UBspline_1d_s *spline = (UBspline_1d_s*) create_UBspline_1d_s (grid, bc, data);
  // The upper bound is slightly past 3.0 so that accumulated rounding in x
  // does not drop the final sample.
  for (double x=1.0; x<=3.00001; x+=0.001) {
    float val, grad, lapl;
    eval_UBspline_1d_s_vgl (spline, x, &val, &grad, &lapl);
    fprintf (fout, "%1.5f %20.14f %20.14f %20.14f\n", x, val, grad, lapl);
  }
  fclose (fout);
}
// Builds a double-precision 1D uniform spline on [1, 3] with grid.num = 1000
// from random samples in [-2, 2) (DERIV1 on the left, DERIV2 on the right)
// and writes x plus the val/grad/lapl outputs to "Spline_1d_d.dat".
// NOTE(review): the spline object is never released in this function.
void
Test_1d_d()
{
  Ugrid grid;
  grid.start = 1.0;
  grid.end   = 3.0;
  grid.num   = 1000;
  // double data[] = { 3.0, -4.0, 2.0, 1.0, -2.0, 0.0, 3.0, 2.0, 0.5, 1.0, 3.0 };
  // NOTE(review): 10000 samples are generated although grid.num is 1000;
  // presumably only the first grid.num entries are read by the spline.
  double data[10000];
  for (int i=0; i<10000; i++)
    data[i] = -2.0 + 4.0*drand48();
  BCtype_d bc;
  bc.lCode = DERIV1; bc.lVal = 10.0;
  bc.rCode = DERIV2; bc.rVal = -10.0;

  FILE *fout = fopen ("Spline_1d_d.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_1d_d: cannot open Spline_1d_d.dat for writing.\n");
    return;
  }

  UBspline_1d_d *spline =
    (UBspline_1d_d*) create_UBspline_1d_d (grid, bc, data);
  for (double x=1.0; x<=3.00001; x+=0.001) {
    double val, grad, lapl;
    eval_UBspline_1d_d_vgl (spline, x, &val, &grad, &lapl);
    fprintf (fout, "%1.5f %20.14f %20.14f %20.14f\n", x, val, grad, lapl);
  }
  fclose (fout);
}
void
Test_1d_d_antiperiodic()
{
Ugrid grid;
grid.start = 1.0;
grid.end = 3.0;
grid.num = 10;
// double data[] = { 3.0, -4.0, 2.0, 1.0, -2.0, 0.0, 3.0, 2.0, 0.5, 1.0, 3.0 };
double data[10];
for (int i=0; i<10; i++)
data[i] = -2.0 + 4.0*drand48();
BCtype_d bc;
bc.lCode = ANTIPERIODIC;
FILE *fout = fopen ("Spline_1d_d_antiperiodic.dat", "w");
UBspline_1d_d *spline =
(UBspline_1d_d*) create_UBspline_1d_d (grid, bc, data);
for (double x=1.0; x<=5.00001; x+=0.001) {
double val, grad, lapl;
double xp = x;
double sign = 1.0;
while (xp >= grid.end) {
xp -= (grid.end-grid.start);
sign *= -1.0;
}
eval_UBspline_1d_d_vgl (spline, xp, &val, &grad, &lapl);
fprintf (fout, "%1.5f %20.14f %20.14f %20.14f\n", x, sign*val, sign*grad, sign*lapl);
}
double val, grad, lapl;
double x = grid.start + (grid.end-grid.start) * (double)1/(double)grid.num;
eval_UBspline_1d_d_vgl (spline, x, &val, &grad, &lapl);
fclose (fout);
}
// Times 100,000,000 calls to eval_UBspline_1d_s_vgl on an 11-point
// single-precision spline.  A loop that only draws the random abscissa is
// timed first and its cost subtracted, so the printed figure excludes the
// random-number generation.
void
Speed_1d_s()
{
  const int num_evals = 100000000;

  Ugrid grid;
  grid.start = 1.0;
  grid.end   = 3.0;
  grid.num   = 11;
  float data[] = { 3.0, -4.0, 2.0, 1.0, -2.0, 0.0, 3.0, 2.0, 0.5, 1.0, 3.0 };
  BCtype_s bc;
  bc.lCode = DERIV2; bc.lVal = 10.0;
  bc.rCode = DERIV2; bc.rVal = -10.0;
  UBspline_1d_s *spline = (UBspline_1d_s*) create_UBspline_1d_s (grid, bc, data);
  float val, grad, lapl;

  // Baseline: random-number generation only.
  clock_t rng_begin = clock();
  for (int n=0; n<num_evals; n++) {
    double x = grid.start + 0.99999*drand48()*(grid.end-grid.start);
  }
  clock_t rng_finish = clock();

  // Measured loop: random-number generation plus spline evaluation.
  clock_t eval_begin = clock();
  for (int n=0; n<num_evals; n++) {
    double x = grid.start + 0.99999*drand48()*(grid.end-grid.start);
    eval_UBspline_1d_s_vgl (spline, x, &val, &grad, &lapl);
  }
  clock_t eval_finish = clock();

  fprintf (stderr, "100,000,000 evalations in %f seconds.\n",
           (double)(eval_finish-eval_begin-(rng_finish-rng_begin))/(double)CLOCKS_PER_SEC);
}
// Builds a 30x30 single-precision periodic spline on [1,3]x[1,3] from
// random samples in [-1, 1), dumps the value on a 0.005-spaced sweep to
// "2dspline.dat", then prints the input sample at grid point (5, 7) next to
// the spline value evaluated at that point.
// NOTE(review): the sweep uses x <= end, whereas Test_3d_s and Test_3d_c
// keep strictly inside the grid; confirm the evaluator accepts x == end.
// NOTE(review): the spline object is never released in this function.
void
Test_2d_s()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 30;

  float *data = malloc (x_grid.num * y_grid.num * sizeof(float));
  if (data == NULL) {
    fprintf (stderr, "Test_2d_s: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) = -1.0 + 2.0*drand48();

  BCtype_s x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.lVal = 10.0;
  x_bc.rCode = PERIODIC; x_bc.rVal = -10.0;
  y_bc.lCode = PERIODIC; y_bc.lVal = 10.0;
  y_bc.rCode = PERIODIC; y_bc.rVal = -10.0;
  UBspline_2d_s *spline = (UBspline_2d_s*) create_UBspline_2d_s (x_grid, y_grid, x_bc, y_bc, data);

  FILE *fout = fopen ("2dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_2d_s: cannot open 2dspline.dat for writing.\n");
    free (data);
    return;
  }
  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
      float val, grad[2], hess[4];
      eval_UBspline_2d_s_vgh (spline, x, y, &val, grad, hess);
      fprintf (fout, "%20.14f ", val);
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample at one grid point.
  int ix=5;
  int iy=7;
  float exval = data[ix*y_grid.num+iy];
  double x = x_grid.start + (double)ix * spline->x_grid.delta;
  double y = y_grid.start + (double)iy * spline->y_grid.delta;
  float spval, grad[2], hess[4];
  eval_UBspline_2d_s_vgh (spline, x, y, &spval, grad, hess);
  fprintf (stderr, "exval = %20.15f spval = %20.15f\n", exval, spval);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 10,000,000 calls to eval_UBspline_2d_s_vgh on a 300x300
// single-precision periodic spline built from random samples in [-1, 1).
// The cost of drawing the random coordinates is measured in a separate
// loop and subtracted from the reported time.
// NOTE(review): the spline object is never released in this function.
void
Speed_2d_s()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 300;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 300;

  float *data = malloc (x_grid.num * y_grid.num * sizeof(float));
  if (data == NULL) {
    fprintf (stderr, "Speed_2d_s: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) = -1.0 + 2.0*drand48();

  BCtype_s x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  UBspline_2d_s *spline = (UBspline_2d_s*) create_UBspline_2d_s (x_grid, y_grid, x_bc, y_bc, data);
  float val, grad[2], hess[4];
  clock_t start, end, rstart, rend;

  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
  }
  rend = clock();

  // Measured loop: the 0.9999 factor keeps every point inside the grid.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    eval_UBspline_2d_s_vgh (spline, x, y, &val, grad, hess);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Builds a 30x30 single-precision complex periodic spline on [1,3]x[1,3]
// from random samples, dumps real and imaginary parts of the value on a
// 0.005-spaced sweep to "2dspline.dat", then prints the input sample at
// grid point (5, 7) next to the spline value evaluated there.
// NOTE(review): the spline object is never released in this function.
void
Test_2d_c()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 30;

  complex_float *data = malloc (x_grid.num * y_grid.num * sizeof(complex_float));
  if (data == NULL) {
    fprintf (stderr, "Test_2d_c: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) =
        -1.0 + 2.0*drand48() + 1.0fI*(-1.0 + 2.0*drand48());

  BCtype_c x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  UBspline_2d_c *spline = (UBspline_2d_c*) create_UBspline_2d_c (x_grid, y_grid, x_bc, y_bc, data);

  FILE *fout = fopen ("2dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_2d_c: cannot open 2dspline.dat for writing.\n");
    free (data);
    return;
  }
  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
      complex_float val, grad[2], hess[4];
      eval_UBspline_2d_c_vgh (spline, x, y, &val, grad, hess);
      fprintf (fout, "%20.14f %20.15f ", crealf(val), cimagf(val));
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample at one grid point.
  int ix=5;
  int iy=7;
  complex_float exval = data[ix*y_grid.num+iy];
  double x = x_grid.start + (double)ix * spline->x_grid.delta;
  double y = y_grid.start + (double)iy * spline->y_grid.delta;
  complex_float spval, grad[2], hess[4];
  eval_UBspline_2d_c_vgh (spline, x, y, &spval, grad, hess);
  // crealf/cimagf throughout: every value printed here is a complex_float.
  fprintf (stderr, "exval = (%20.15f + %20.15fi) spval = (%20.15f + %20.15fi)\n",
           crealf(exval), cimagf(exval), crealf(spval), cimagf(spval));

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 10,000,000 calls to eval_UBspline_2d_c_vgh on a 300x300
// single-precision complex periodic spline built from random samples.
// The cost of drawing the random coordinates is measured in a separate
// loop and subtracted from the reported time.
// NOTE(review): the spline object is never released in this function.
void
Speed_2d_c()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 300;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 300;

  complex_float *data = malloc (x_grid.num * y_grid.num * sizeof(complex_float));
  if (data == NULL) {
    fprintf (stderr, "Speed_2d_c: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) =
        -1.0 + 2.0*drand48() + 1.0fI*(-1.0 + 2.0*drand48());

  BCtype_c x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  UBspline_2d_c *spline = (UBspline_2d_c*) create_UBspline_2d_c (x_grid, y_grid, x_bc, y_bc, data);
  complex_float val, grad[2], hess[4];
  clock_t start, end, rstart, rend;

  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
  }
  rend = clock();

  // Measured loop: the 0.9999 factor keeps every point inside the grid.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    eval_UBspline_2d_c_vgh (spline, x, y, &val, grad, hess);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Builds a 30x30 double-precision periodic spline on [1,3]x[1,3] from
// random samples in [-1, 1), dumps the value on a 0.005-spaced sweep to
// "2dspline.dat", then prints the input sample at grid point (5, 7) next to
// the spline value evaluated at that point.
// NOTE(review): the spline object is never released in this function.
void
Test_2d_d()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 30;

  double *data = malloc (x_grid.num * y_grid.num * sizeof(double));
  if (data == NULL) {
    fprintf (stderr, "Test_2d_d: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) = -1.0 + 2.0*drand48();

  BCtype_d x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  UBspline_2d_d *spline =
    create_UBspline_2d_d (x_grid, y_grid, x_bc, y_bc, data);

  FILE *fout = fopen ("2dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_2d_d: cannot open 2dspline.dat for writing.\n");
    free (data);
    return;
  }
  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
      double val, grad[2], hess[4];
      eval_UBspline_2d_d_vgh (spline, x, y, &val, grad, hess);
      fprintf (fout, "%20.14f ", val);
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample at one grid point.
  int ix=5;
  int iy=7;
  double exval = data[ix*y_grid.num+iy];
  double x = x_grid.start + (double)ix * spline->x_grid.delta;
  double y = y_grid.start + (double)iy * spline->y_grid.delta;
  double spval, grad[2], hess[4];
  eval_UBspline_2d_d_vgh (spline, x, y, &spval, grad, hess);
  fprintf (stderr, "exval = %20.15f spval = %20.15f\n", exval, spval);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 100,000,000 calls to eval_UBspline_2d_d_vgh on a 300x300
// double-precision periodic spline built from random samples in [-1, 1).
// The cost of drawing the random coordinates is measured in a separate
// loop and subtracted from the reported time.
// NOTE(review): the spline object is never released in this function.
void
Speed_2d_d()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 300;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 300;

  double *data = malloc (x_grid.num * y_grid.num * sizeof(double));
  if (data == NULL) {
    fprintf (stderr, "Speed_2d_d: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) = -1.0 + 2.0*drand48();

  BCtype_d x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  UBspline_2d_d *spline = (UBspline_2d_d*) create_UBspline_2d_d (x_grid, y_grid, x_bc, y_bc, data);
  double val, grad[2], hess[4];
  clock_t start, end, rstart, rend;

  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<100000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
  }
  rend = clock();

  // Measured loop: the 0.9999 factor keeps every point inside the grid.
  start = clock();
  for (int i=0; i<100000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    eval_UBspline_2d_d_vgh (spline, x, y, &val, grad, hess);
  }
  end = clock();
  // The reported count must match the loop bound above (1e8, not 1e7).
  fprintf (stderr, "100,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Builds a 30x30 double-precision complex periodic spline on [1,3]x[1,3]
// from random samples, dumps real and imaginary parts of the value on a
// 0.005-spaced sweep to "2dspline.dat", then prints the input sample at
// grid point (5, 7) next to the spline value evaluated there.
// NOTE(review): the spline object is never released in this function.
void
Test_2d_z()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 30;

  complex_double *data = malloc (x_grid.num * y_grid.num * sizeof(complex_double));
  if (data == NULL) {
    fprintf (stderr, "Test_2d_z: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) =
        -1.0 + 2.0*drand48() + 1.0I*(-1.0 + 2.0*drand48());

  BCtype_z x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  UBspline_2d_z *spline =
    create_UBspline_2d_z (x_grid, y_grid, x_bc, y_bc, data);

  FILE *fout = fopen ("2dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_2d_z: cannot open 2dspline.dat for writing.\n");
    free (data);
    return;
  }
  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
      complex_double val, grad[2], hess[4];
      eval_UBspline_2d_z_vgh (spline, x, y, &val, grad, hess);
      fprintf (fout, "%20.14f %20.14f ", creal(val), cimag(val));
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample at one grid point.
  int ix=5;
  int iy=7;
  complex_double exval = data[ix*y_grid.num+iy];
  double x = x_grid.start + (double)ix * spline->x_grid.delta;
  double y = y_grid.start + (double)iy * spline->y_grid.delta;
  complex_double spval, grad[2], hess[4];
  eval_UBspline_2d_z_vgh (spline, x, y, &spval, grad, hess);
  fprintf (stderr, "exval = (%20.15f + %20.15fi) spval = (%20.15f + %20.15fi)\n",
           creal(exval), cimag(exval), creal(spval), cimag(spval));

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 100,000,000 calls to eval_UBspline_2d_z_vgh on a 300x300
// double-precision complex periodic spline built from random samples.
// The cost of drawing the random coordinates is measured in a separate
// loop and subtracted from the reported time.
// NOTE(review): the spline object is never released in this function.
void
Speed_2d_z()
{
  Ugrid x_grid, y_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 300;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 300;

  complex_double *data = malloc (x_grid.num * y_grid.num * sizeof(complex_double));
  if (data == NULL) {
    fprintf (stderr, "Speed_2d_z: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      *(data + ix*y_grid.num + iy) =
        -1.0 + 2.0*drand48() + 1.0I*(-1.0 + 2.0*drand48());

  BCtype_z x_bc, y_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  UBspline_2d_z *spline = (UBspline_2d_z*) create_UBspline_2d_z (x_grid, y_grid, x_bc, y_bc, data);
  complex_double val, grad[2], hess[4];
  clock_t start, end, rstart, rend;

  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<100000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
  }
  rend = clock();

  // Measured loop: the 0.9999 factor keeps every point inside the grid.
  start = clock();
  for (int i=0; i<100000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    eval_UBspline_2d_z_vgh (spline, x, y, &val, grad, hess);
  }
  end = clock();
  fprintf (stderr, "100,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Builds a 30x30x30 single-precision periodic spline on [1, 3.0001]^3 from
// random samples in [-1, 1), dumps the value on the plane z = 1.92341
// (0.005 spacing in x and y) to "3dspline.dat", then prints the input
// sample at grid point (9, 19, 24) next to the spline value evaluated
// 1e-6 beyond that point in each direction.
// NOTE(review): the spline object is never released in this function.
void
Test_3d_s()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.0001; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.0001; y_grid.num = 30;
  z_grid.start = 1.0; z_grid.end = 3.0001; z_grid.num = 30;

  float *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(float));
  if (data == NULL) {
    fprintf (stderr, "Test_3d_s: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) = -1.0 + 2.0*drand48();

  BCtype_s x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_s *spline = (UBspline_3d_s*) create_UBspline_3d_s
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);

  double z = 1.92341;
  FILE *fout = fopen ("3dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_3d_s: cannot open 3dspline.dat for writing.\n");
    free (data);
    return;
  }
  // Strict '<' keeps every sweep point inside the grid.
  for (double x=x_grid.start; x<x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<y_grid.end; y+=0.005) {
      float val, grad[3], hess[9];
      eval_UBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
      fprintf (fout, "%20.14f ", val);
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample near one grid point.
  int ix=9; int iy=19; int iz = 24;
  float exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
  double x = x_grid.start + (double)ix * spline->x_grid.delta + 0.000001;
  double y = y_grid.start + (double)iy * spline->y_grid.delta + 0.000001;
  z = z_grid.start + (double)iz * spline->z_grid.delta + 0.000001;
  float spval, grad[3], hess[9];
  eval_UBspline_3d_s_vgh (spline, x, y, z, &spval, grad, hess);
  fprintf (stderr, "exval = %20.15f spval = %20.15f\n", exval, spval);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 10,000,000 calls to eval_UBspline_3d_s_vgh on a 200x200x200
// single-precision periodic spline built from random samples in [-1, 1).
// The cost of drawing the random coordinates is measured in a separate
// loop and subtracted from the reported time.
// NOTE(review): the spline object is never released in this function.
void
Speed_3d_s()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 200;
  y_grid.start = 1.0; y_grid.end = 5.0; y_grid.num = 200;
  z_grid.start = 1.0; z_grid.end = 7.0; z_grid.num = 200;

  // 8e6 floats (32 MB): large enough that allocation failure is plausible.
  float *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(float));
  if (data == NULL) {
    fprintf (stderr, "Speed_3d_s: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) = -1.0 + 2.0*drand48();

  BCtype_s x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_s *spline = (UBspline_3d_s*) create_UBspline_3d_s
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
  float val, grad[3], hess[9];
  clock_t start, end, rstart, rend;

  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
  }
  rend = clock();

  // Measured loop: the 0.9999 factor keeps every point inside the grid.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
    eval_UBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Builds a 30x30x30 double-precision periodic spline on [1, 3]^3 from
// random samples in [-1, 1), dumps the value on the plane z = 1.92341
// (0.005 spacing in x and y) to "3dspline.dat", then prints the input
// sample at grid point (9, 19, 24) next to the spline value evaluated there.
// NOTE(review): the sweep uses x <= end, whereas Test_3d_s and Test_3d_c
// keep strictly inside the grid; confirm the evaluator accepts x == end.
// NOTE(review): the spline object is never released in this function.
void
Test_3d_d()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 30;
  z_grid.start = 1.0; z_grid.end = 3.0; z_grid.num = 30;

  double *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(double));
  if (data == NULL) {
    fprintf (stderr, "Test_3d_d: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) = -1.0 + 2.0*drand48();

  BCtype_d x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_d *spline = (UBspline_3d_d*) create_UBspline_3d_d
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);

  double z = 1.92341;
  FILE *fout = fopen ("3dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_3d_d: cannot open 3dspline.dat for writing.\n");
    free (data);
    return;
  }
  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
      double val, grad[3], hess[9];
      eval_UBspline_3d_d_vgh (spline, x, y, z, &val, grad, hess);
      fprintf (fout, "%23.17f ", val);
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample at one grid point.
  int ix=9; int iy=19; int iz = 24;
  double exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
  double x = x_grid.start + (double)ix * spline->x_grid.delta;
  double y = y_grid.start + (double)iy * spline->y_grid.delta;
  z = z_grid.start + (double)iz * spline->z_grid.delta;
  double spval, grad[3], hess[9];
  eval_UBspline_3d_d_vgh (spline, x, y, z, &spval, grad, hess);
  fprintf (stderr, "exval = %23.17f spval = %23.17f\n", exval, spval);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 10,000,000 calls to eval_UBspline_3d_d_vgh on a 200x200x200
// double-precision periodic spline built from random samples in [-1, 1).
// The cost of drawing the random coordinates is measured in a separate
// loop and subtracted from the reported time.
// NOTE(review): the spline object is never released in this function.
void
Speed_3d_d()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 200;
  y_grid.start = 1.0; y_grid.end = 5.0; y_grid.num = 200;
  z_grid.start = 1.0; z_grid.end = 7.0; z_grid.num = 200;

  // 8e6 doubles (64 MB): large enough that allocation failure is plausible.
  double *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(double));
  if (data == NULL) {
    fprintf (stderr, "Speed_3d_d: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) = -1.0 + 2.0*drand48();

  BCtype_d x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_d *spline = (UBspline_3d_d*) create_UBspline_3d_d
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
  double val, grad[3], hess[9];
  clock_t start, end, rstart, rend;

  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
  }
  rend = clock();

  // Measured loop: the 0.9999 factor keeps every point inside the grid.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
    eval_UBspline_3d_d_vgh (spline, x, y, z, &val, grad, hess);
    // eval_UBspline_3d_d (spline, x, y, z, &val);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Builds a 30x30x30 single-precision complex periodic spline on [1, 3]^3
// from random samples, dumps real and imaginary parts of the value on the
// plane z = 1.92341 to "3dspline.dat", then prints the input sample at grid
// point (9, 18, 24) next to the spline value evaluated there.
// NOTE(review): the spline object is never released in this function.
void
Test_3d_c()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.0; y_grid.num = 30;
  z_grid.start = 1.0; z_grid.end = 3.0; z_grid.num = 30;

  complex_float *data =
    malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_float));
  if (data == NULL) {
    fprintf (stderr, "Test_3d_c: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) =
          (-1.0 + 2.0*drand48()) + (-1.0 + 2.0*drand48())*1.0fI;

  BCtype_c x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_c *spline = create_UBspline_3d_c
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);

  double z = 1.92341;
  FILE *fout = fopen ("3dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_3d_c: cannot open 3dspline.dat for writing.\n");
    free (data);
    return;
  }
  // Strict '<' (and the 0.99999 factor on x) keeps the sweep inside the grid.
  for (double x=x_grid.start; x<0.99999*x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<y_grid.end; y+=0.005) {
      complex_float val, grad[3], hess[9];
      eval_UBspline_3d_c_vgh (spline, x, y, z, &val, grad, hess);
      fprintf (fout, "%23.17f %23.17f ", crealf(val), cimagf(val));
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample at one grid point.
  int ix=9; int iy=18; int iz = 24;
  complex_float exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
  double x = x_grid.start + (double)ix * spline->x_grid.delta;
  double y = y_grid.start + (double)iy * spline->y_grid.delta;
  z = z_grid.start + (double)iz * spline->z_grid.delta;
  complex_float spval, grad[3], hess[9];
  eval_UBspline_3d_c_vgh (spline, x, y, z, &spval, grad, hess);
  fprintf (stderr, "exval = (%23.17f + %23.17fi)\nspval = (%23.17f + %23.17fi)\n",
           crealf(exval), cimagf(exval), crealf(spval), cimagf(spval));

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 10,000,000 calls to eval_UBspline_3d_c_vgh on a 200x200x200
// single-precision complex periodic spline built from random samples.
// The cost of drawing the random coordinates is measured in a separate
// loop and subtracted from the reported time.
// NOTE(review): the spline object is never released in this function.
void
Speed_3d_c()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 200;
  y_grid.start = 1.0; y_grid.end = 5.0; y_grid.num = 200;
  z_grid.start = 1.0; z_grid.end = 7.0; z_grid.num = 200;

  // 8e6 complex samples: large enough that allocation failure is plausible.
  complex_float *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_float));
  if (data == NULL) {
    fprintf (stderr, "Speed_3d_c: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) =
          (-1.0 + 2.0*drand48()) + (-1.0 + 2.0*drand48())*1.0fI;

  BCtype_c x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_c *spline = (UBspline_3d_c*) create_UBspline_3d_c
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
  complex_float val, grad[3], hess[9];
  clock_t start, end, rstart, rend;

  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
  }
  rend = clock();

  // Measured loop: the 0.9999 factor keeps every point inside the grid.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
    eval_UBspline_3d_c_vgh (spline, x, y, z, &val, grad, hess);
    //eval_UBspline_3d_c (spline, x, y, z, &val);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Builds a 30x30x30 double-precision complex periodic spline on
// [1,3.4]x[1,3.7]x[1,3.9] from random samples, dumps real and imaginary
// parts of hess[4] on the plane z = 1.92341 to "3dspline.dat", then prints
// the input sample at grid point (9, 19, 25) next to the spline value
// evaluated there.
// NOTE(review): the spline object is never released in this function.
void
Test_3d_z()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.4; x_grid.num = 30;
  y_grid.start = 1.0; y_grid.end = 3.7; y_grid.num = 30;
  z_grid.start = 1.0; z_grid.end = 3.9; z_grid.num = 30;

  complex_double *data =
    malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_double));
  if (data == NULL) {
    fprintf (stderr, "Test_3d_z: out of memory allocating spline data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) =
          (-1.0 + 2.0*drand48()) + (-1.0 + 2.0*drand48())*1.0fI;

  BCtype_z x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_z *spline = create_UBspline_3d_z
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);

  double z = 1.92341;
  FILE *fout = fopen ("3dspline.dat", "w");
  // fprintf on a NULL stream is undefined; bail out if the file is unwritable.
  if (fout == NULL) {
    fprintf (stderr, "Test_3d_z: cannot open 3dspline.dat for writing.\n");
    free (data);
    return;
  }
  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
      complex_double val, grad[3], hess[9];
      eval_UBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
      // creal/cimag (not the float variants) so the double-precision
      // result is not truncated before being printed with 19 digits.
      fprintf (fout, "%23.19f %23.19f ", creal(hess[4]), cimag(hess[4]));
    }
    fprintf (fout, "\n");
  }
  fclose (fout);

  // Compare the spline against the raw sample at one grid point.
  int ix=9; int iy=19; int iz = 25;
  complex_double exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
  double x = x_grid.start + (double)ix * spline->x_grid.delta;
  double y = y_grid.start + (double)iy * spline->y_grid.delta;
  z = z_grid.start + (double)iz * spline->z_grid.delta;
  complex_double spval, grad[3], hess[9];
  eval_UBspline_3d_z_vgh (spline, x, y, z, &spval, grad, hess);
  // Double-precision accessors keep the comparison meaningful beyond ~7 digits.
  fprintf (stderr, "exval = (%23.19f + %23.19fi)\nspval = (%23.17f + %23.17fi)\n",
           creal(exval), cimag(exval), creal(spval), cimag(spval));

  // Nothing uses the samples (or the spline) after this point.
  free (data);
}
// Times 10,000,000 value/gradient/Hessian evaluations of a 200x200x200
// periodic double-complex uniform B-spline at random points.  A first loop
// that only generates the random coordinates is timed separately and its
// duration is subtracted from the reported figure.
void
Speed_3d_z()
{
  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 1.0; x_grid.end = 3.0; x_grid.num = 200;
  y_grid.start = 1.0; y_grid.end = 5.0; y_grid.num = 200;
  z_grid.start = 1.0; z_grid.end = 7.0; z_grid.num = 200;
  complex_double *data =
    malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_double));
  if (data == NULL) {
    // The input array is large (200^3 complex doubles); bail out cleanly.
    fprintf (stderr, "Speed_3d_z: could not allocate input data.\n");
    return;
  }
  for (int ix=0; ix<x_grid.num; ix++)
    for (int iy=0; iy<y_grid.num; iy++)
      for (int iz=0; iz<z_grid.num; iz++)
        *(data + ((ix*y_grid.num) + iy)*z_grid.num + iz) =
          (-1.0 + 2.0*drand48()) + (-1.0 + 2.0*drand48())*1.0fI;
  BCtype_z x_bc, y_bc, z_bc;
  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
  UBspline_3d_z *spline = (UBspline_3d_z*) create_UBspline_3d_z
    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
  complex_double val, grad[3], hess[9];
  clock_t start, end, rstart, rend;
  // Baseline: cost of generating the random points alone.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
  }
  rend = clock();
  // Measured loop: same point generation plus the spline evaluation.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
    eval_UBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
  // The input array is not referenced again in this function.
  free (data);
}
// When the build defines F77_DUMMY_MAIN, emit a function with that name which
// simply returns 1; it gets C linkage when compiled as C++.
// NOTE(review): presumably the macro comes from an autoconf Fortran check
// (AC_F77_DUMMY_MAIN) to satisfy Fortran runtime libraries -- confirm.
#ifdef F77_DUMMY_MAIN
# ifdef __cplusplus
extern "C"
# endif
int F77_DUMMY_MAIN() { return 1; }
#endif
// Test driver.  Runs every Test_* routine in order of dimension and data type;
// the Speed_* benchmarks are disabled except for the 3D double-complex one.
int main(void)
{
  /* 1D */
  Test_1d_s();
  Test_1d_d();
  Test_1d_d_antiperiodic();
  // Speed_1d_s();

  /* 2D */
  Test_2d_s();
  // Speed_2d_s();
  Test_2d_c();
  // Speed_2d_c();
  Test_2d_d();
  // Speed_2d_d();
  Test_2d_z();
  // Speed_2d_z();

  /* 3D */
  Test_3d_s();
  // Speed_3d_s();
  Test_3d_d();
  // Speed_3d_d();
  Test_3d_c();
  // Speed_3d_c();
  Test_3d_z();
  Speed_3d_z();

  return 0;
}

View File

@ -0,0 +1,686 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "nubspline.h"
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#ifndef M_PI
#define M_PI 3.1415926535897932384626433
#endif
double drand48();
// Writes a colored verdict to stderr using ANSI escape sequences (0x1B is
// ESC): green "Passed" when pass is true, red "Failed" otherwise.
void
PrintPassFail(bool pass)
{
  if (!pass) {
    // Red "Failed"
    fprintf (stderr, "%c[31mFailed%c[0m\n", 0x1B, 0x1B);
    return;
  }
  // Green "Passed"
  fprintf (stderr, "%c[32mPassed%c[0m\n", 0x1B, 0x1B);
}
// Prints "<name>:" to stderr, pads with spaces so the verdict starts in a
// fixed column (57 characters after the start of the name), then prints the
// colored pass/fail verdict.
//   name - label of the check; only read, so it is taken as const
//   pass - result of the check
void PrintTest (const char *name, bool pass)
{
  int n = (int) strlen (name);
  // Names of 57 characters or more get no padding at all.
  int pad = (n < 57) ? 57 - n : 0;
  fprintf (stderr, "%s:%*s", name, pad, "");
  PrintPassFail (pass);
}
// Checks the reverse map of a center grid: for 10,000 random points in
// [-5, 7) the returned index lo must bracket the point, i.e.
// points[lo] <= x <= points[lo+1].  Failures are recorded in the return value
// (rather than aborting through assert) so the verdict printed is meaningful
// and the check is not compiled out under NDEBUG.
// Returns true when every sample was bracketed correctly.
bool
TestCenterGrid()
{
  fprintf (stderr, "Testing CenterGrid: ");
  bool passed = true;
  NUgrid* grid = create_center_grid (-5.0, 7.0, 6.0, 200);
  for (int i=0; i<10000; i++) {
    double x = -5.0+12.0*drand48();
    int lo = (*grid->reverse_map)(grid, x);
    passed = passed && (x >= grid->points[lo]);
    passed = passed && (x <= grid->points[lo+1]);
  }
  PrintPassFail (passed);
  return passed;
}
// Checks the reverse map of a general grid built from the points of a center
// grid: each of 10,000 random samples in [-5, 7) must lie between
// points[lo] and points[lo+1] for the index lo the map returns.
// Prints the verdict and returns true when all samples were bracketed.
bool
TestGeneralGrid()
{
  fprintf (stderr, "Testing GeneralGrid: ");
  bool ok = true;
  NUgrid* reference = create_center_grid (-5.0, 7.0, 6.0, 200);
  NUgrid* general = create_general_grid (reference->points, 200);
  int trial = 0;
  while (trial < 10000) {
    const double x = -5.0+12.0*drand48();
    const int lo = (*general->reverse_map)(general, x);
    // Once a sample has failed the verdict stays false.
    if (ok && !(x >= general->points[lo] && x <= general->points[lo+1]))
      ok = false;
    trial++;
  }
  PrintPassFail (ok);
  return ok;
}
bool
close_float (float x, float y)
{
float max = fmaxf (x, y);
return (fabs(x-y)/max < 1.0e-5);
}
// Exercises single-precision 1D non-uniform B-splines on a 200-point center
// grid over [-5, 7] with random data, under three boundary conditions:
//   1. periodic            - value, first and second derivative must match at
//                            both ends;
//   2. fixed 1st derivative - the end-point gradients must equal the requested
//                            values (1.5 and -0.3);
//   3. fixed 2nd derivative - the end-point second derivatives must equal the
//                            requested values (1.5 and -0.3).
// For each spline the value at grid point 26 must reproduce data[26].
// Returns true when all checks passed.
bool
TestNUB_1d_s()
{
  double start = -5.0;
  double end = 7.0;
  int N = 200;
  NUgrid* grid = create_center_grid (start, end, 6.0, N);
  bool passed = true;
  float data[N];
  for (int i=0; i<N; i++)
    data[i] = -1.0 + 2.0*drand48();
  BCtype_s bc;

  // Create spline with PBC
  fprintf (stderr, "Testing 1D single-precision periodic boundary conditions:\n");
  bc.lCode = PERIODIC; bc.rCode = PERIODIC;
  NUBspline_1d_s *periodic = create_NUBspline_1d_s (grid, bc, data);
  float sval, sgrad, slapl, eval, egrad, elapl;
  eval_NUBspline_1d_s_vgl (periodic, start, &sval, &sgrad, &slapl);
  eval_NUBspline_1d_s_vgl (periodic, end  , &eval, &egrad, &elapl);
  bool v_passed, grad_passed, lapl_passed;
  v_passed = close_float (sval, eval);
  grad_passed = close_float (sgrad, egrad);
  lapl_passed = close_float (slapl, elapl);
  PrintTest ("Value", v_passed);
  PrintTest ("First derivative", grad_passed);
  PrintTest ("Second derivative", lapl_passed);
  passed = passed && v_passed && grad_passed && lapl_passed;
  double x = grid->points[26];
  float val;
  eval_NUBspline_1d_s (periodic, x, &val);
  bool interp_passed = close_float (val, data[26]);
  PrintTest ("Interpolation", interp_passed);
  passed = passed && interp_passed;

  // Create spline with fixed first derivative:
  bc.lCode = DERIV1; bc.lVal = 1.5;
  bc.rCode = DERIV1; bc.rVal = -0.3;
  NUBspline_1d_s *fixed_first = create_NUBspline_1d_s (grid, bc, data);
  fprintf (stderr, "Testing 1D single-precsion fixed first derivative boundary conditions: \n");
  eval_NUBspline_1d_s_vg (fixed_first, start, &sval, &sgrad);
  eval_NUBspline_1d_s_vg (fixed_first, end, &eval, &egrad);
  bool bc_passed = close_float (sgrad, 1.5) && close_float (egrad, -0.3);
  PrintTest ("Boundary conditions", bc_passed);
  x = grid->points[26];
  // Check interpolation on the spline under test, not the periodic one.
  eval_NUBspline_1d_s (fixed_first, x, &val);
  interp_passed = close_float (val, data[26]);
  PrintTest ("Interpolation", interp_passed);
  passed = passed && interp_passed && bc_passed;

  // Create spline with fixed second derivative:
  bc.lCode = DERIV2; bc.lVal = 1.5;
  bc.rCode = DERIV2; bc.rVal = -0.3;
  NUBspline_1d_s *fixed_second = create_NUBspline_1d_s (grid, bc, data);
  fprintf (stderr, "Testing 1d_s fixed second derivative boundary conditions: \n");
  eval_NUBspline_1d_s_vgl (fixed_second, start, &sval, &sgrad, &slapl);
  eval_NUBspline_1d_s_vgl (fixed_second, end, &eval, &egrad, &elapl);
  bc_passed = close_float (slapl, 1.5) && close_float (elapl, -0.3);
  fprintf (stderr, "slapl = %1.8f elapl = %1.8f\n", slapl, elapl);
  PrintTest ("Boundary conditions", bc_passed);
  x = grid->points[26];
  // Check interpolation on the spline under test, not the periodic one.
  eval_NUBspline_1d_s (fixed_second, x, &val);
  interp_passed = close_float (val, data[26]);
  PrintTest ("Interpolation", interp_passed);
  passed = passed && interp_passed && bc_passed;
  return passed;
}
// Benchmarks the reverse map of a 2000-point center grid against a general
// grid built from the same points, using 100,000,000 random lookups each.
// The time of a loop that only draws the random numbers is subtracted from
// both measurements.  The returned indices are summed and printed so the
// lookups have an observable result; the sums are accumulated in long long
// because the total exceeds the range of int.
void
GridSpeedTest()
{
  NUgrid* centgrid = create_center_grid (-5.0, 7.0, 6.0, 2000);
  NUgrid* gengrid = create_general_grid (centgrid->points, 2000);
  long long centsum=0, gensum=0;
  clock_t rstart, rend, cstart, cend, gstart, gend;
  // Baseline: random-number generation only.
  rstart = clock();
  for (int i=0; i<100000000; i++) {
    double x = -5.0 + 12.0*drand48();
  }
  rend = clock();
  cstart = clock();
  for (int i=0; i<100000000; i++) {
    double x = -5.0 + 12.0*drand48();
    centsum += (*centgrid->reverse_map)(centgrid, x);
  }
  cend = clock();
  gstart = clock();
  for (int i=0; i<100000000; i++) {
    double x = -5.0 + 12.0*drand48();
    gensum += (*gengrid->reverse_map)(gengrid, x);
  }
  gend = clock();
  double cent_time = (double)(cend-cstart+rstart-rend)/(double)CLOCKS_PER_SEC;
  double gen_time = (double)(gend-gstart+rstart-rend)/(double)CLOCKS_PER_SEC;
  fprintf (stderr, "%lld %lld\n", centsum, gensum);
  fprintf (stderr, "center_grid time = %1.3f s.\n", cent_time);
  fprintf (stderr, "general_grid time = %1.3f s.\n", gen_time);
}
// Dumps the four basis-function values returned by get_NUBasis_funcs_d on a
// 20-point center grid, scanning x from -5 to 7 in steps of 0.001.  Each line
// written to stderr holds x followed by the four values.
void
TestNUBasis()
{
  NUgrid* grid = create_center_grid (-5.0, 7.0, 10.0, 20);
  NUBasis* nub = create_NUBasis (grid, true);
  double b[4];
  double x = -5.0;
  while (x <= 7.0) {
    get_NUBasis_funcs_d (nub, x, b);
    fprintf (stderr, "%1.12f %1.12f %1.12f %1.12f %1.12f\n",
             x, b[0], b[1], b[2], b[3]);
    x += 0.001;
  }
}
// Fits a single-precision 1D non-uniform spline to one period of a sine
// sampled on a 20-point center grid over [-5, 7], with the first derivative
// fixed to 2*pi/12 at both ends, and prints x, spline value, exact sine and
// spline derivative for x scanned in steps of 0.001.
void
TestNUBspline()
{
  NUgrid* grid = create_center_grid (-5.0, 7.0, 10.0, 20);
  NUBasis* basis = create_NUBasis (grid, true);
  float samples[20];
  int i = 0;
  while (i < 20) {
    double x = grid->points[i];
    double angle = (x+5.0)/12.0 * 2.0*M_PI;
    samples[i] = sin(angle);
    i++;
  }
  // Alternatives tried here previously: PERIODIC on both sides, or
  // NATURAL on the left with FLAT on the right.
  BCtype_s bc;
  bc.lCode = DERIV1;
  bc.rCode = DERIV1;
  bc.lVal = 2.0*M_PI/12.0;
  bc.rVal = 2.0*M_PI/12.0;
  NUBspline_1d_s *spline = create_NUBspline_1d_s (grid, bc, samples);
  double x = -5.0;
  while (x <= 7.0) {
    float val, deriv;
    eval_NUBspline_1d_s_vg (spline, x, &val, &deriv);
    double angle = (x+5.0)/12.0 * 2.0*M_PI;
    fprintf (stderr, "%1.16e %1.16e %1.16e %1.16e\n", x, val,
             sin(angle), deriv);
    x += 0.001;
  }
}
// Double-precision counterpart of the sine-fit check: fits a 1D non-uniform
// spline to one period of a sine on a 20-point center grid over [-5, 7], with
// the first derivative fixed to 2*pi/12 at both ends, and prints x, spline
// value, exact sine and spline derivative for x scanned in steps of 0.001.
void
TestNUBspline_d()
{
  NUgrid* grid = create_center_grid (-5.0, 7.0, 10.0, 20);
  NUBasis* basis = create_NUBasis (grid, true);
  double samples[20];
  int i = 0;
  while (i < 20) {
    double x = grid->points[i];
    double angle = (x+5.0)/12.0 * 2.0*M_PI;
    samples[i] = sin(angle);
    i++;
  }
  // Alternatives tried here previously: PERIODIC on both sides, or
  // NATURAL on the left with FLAT on the right.
  BCtype_d bc;
  bc.lCode = DERIV1;
  bc.rCode = DERIV1;
  bc.lVal = 2.0*M_PI/12.0;
  bc.rVal = 2.0*M_PI/12.0;
  NUBspline_1d_d *spline = create_NUBspline_1d_d (grid, bc, samples);
  double x = -5.0;
  while (x <= 7.0) {
    double val, deriv;
    eval_NUBspline_1d_d_vg (spline, x, &val, &deriv);
    double angle = (x+5.0)/12.0 * 2.0*M_PI;
    fprintf (stderr, "%1.16e %1.16e %1.16e %1.16e\n", x, val,
             sin(angle), deriv);
    x += 0.001;
  }
}
void
TestNUB_2d_s()
{
int Mx=30, My=35;
NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
float data[Mx*My];
for (int ix=0; ix<Mx; ix++)
for (int iy=0; iy<My; iy++)
data[ix*My+iy] = -1.0+2.0*drand48();
BCtype_s xBC, yBC;
xBC.lCode = PERIODIC;
yBC.lCode = PERIODIC;
// xBC.lCode = FLAT; xBC.rCode = FLAT;
// yBC.lCode = FLAT; yBC.rCode = FLAT;
NUBspline_2d_s *spline = create_NUBspline_2d_s (x_grid, y_grid, xBC, yBC, data);
int xFine = 400;
int yFine = 400;
FILE *fout = fopen ("2d_s.dat", "w");
double xi = x_grid->start;
double xf = x_grid->end;// + x_grid->points[1] - x_grid->points[0];
double yi = y_grid->start;
double yf = y_grid->end;// + y_grid->points[1] - y_grid->points[0];
for (int ix=0; ix<xFine; ix++) {
double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
for (int iy=0; iy<yFine; iy++) {
double y = yi + (double)iy/(double)(yFine)*(yf-yi);
float val;
eval_NUBspline_2d_s (spline, x, y, &val);
fprintf (fout, "%1.16e ", val);
}
fprintf (fout, "\n");
}
fclose (fout);
}
void
TestNUB_2d_c()
{
int Mx=30, My=35;
NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
complex_float data[Mx*My];
for (int ix=0; ix<Mx; ix++)
for (int iy=0; iy<My; iy++)
data[ix*My+iy] = -1.0+2.0*drand48() + 1.0fi*(-1.0+2.0*drand48());
BCtype_c xBC, yBC;
xBC.lCode = PERIODIC;
yBC.lCode = PERIODIC;
// xBC.lCode = FLAT; xBC.rCode = FLAT;
// yBC.lCode = FLAT; yBC.rCode = FLAT;
NUBspline_2d_c *spline = create_NUBspline_2d_c (x_grid, y_grid, xBC, yBC, data);
int xFine = 400;
int yFine = 400;
FILE *rout = fopen ("2d_r.dat", "w");
FILE *iout = fopen ("2d_i.dat", "w");
double xi = x_grid->start;
double xf = x_grid->end;// + x_grid->points[1] - x_grid->points[0];
double yi = y_grid->start;
double yf = y_grid->end;// + y_grid->points[1] - y_grid->points[0];
for (int ix=0; ix<xFine; ix++) {
double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
for (int iy=0; iy<yFine; iy++) {
double y = yi + (double)iy/(double)(yFine)*(yf-yi);
complex_float val, grad[2], hess[4];
eval_NUBspline_2d_c_vgh (spline, x, y, &val, grad, hess);
fprintf (rout, "%1.16e ", crealf(val));
fprintf (iout, "%1.16e ", cimagf(val));
}
fprintf (rout, "\n");
fprintf (iout, "\n");
}
fclose (rout);
fclose (iout);
}
// Builds a single-precision 3D non-uniform spline (20x27x23 center grids,
// random data, periodic in all directions), writes its value on a
// 200x200x200 scan to "3d_s.dat" (one output line per x index), prints the
// spline's sp_code and destroys the spline.
void
TestNUB_3d_s()
{
  int Mx=20, My=27, Mz=23;
  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
  NUgrid *z_grid = create_center_grid (-1.8, 2.0, 2.8, Mz);

  // Row-major storage with z fastest, so a flat loop visits the elements in
  // the same order as nested ix/iy/iz loops.
  float data[Mx*My*Mz];
  for (int n=0; n<Mx*My*Mz; n++)
    data[n] = -1.0+2.0*drand48();

  BCtype_s xBC, yBC, zBC;
  xBC.lCode = PERIODIC;
  xBC.rCode = PERIODIC;
  yBC.lCode = PERIODIC;
  yBC.rCode = PERIODIC;
  zBC.lCode = PERIODIC;
  zBC.rCode = PERIODIC;
  NUBspline_3d_s *spline = create_NUBspline_3d_s (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);

  int xFine = 200, yFine = 200, zFine=200;
  FILE *fout = fopen ("3d_s.dat", "w");
  double x0 = x_grid->start, x1 = x_grid->end;
  double y0 = y_grid->start, y1 = y_grid->end;
  double z0 = z_grid->start, z1 = z_grid->end;
  for (int i=0; i<xFine; i++) {
    double x = x0+ (double)i/(double)(xFine)*(x1-x0);
    for (int j=0; j<yFine; j++) {
      double y = y0 + (double)j/(double)(yFine)*(y1-y0);
      for (int k=0; k<zFine; k++) {
        double z = z0 + (double)k/(double)(zFine)*(z1-z0);
        float val, grad[3], hess[9];
        eval_NUBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
        fprintf (fout, "%1.16e ", val);
      }
    }
    fprintf (fout, "\n");
  }
  fclose (fout);
  fprintf (stderr, "spline->sp_code = %d\n", spline->sp_code);
  destroy_Bspline (spline);
}
// Builds a double-precision 3D non-uniform spline (20x27x23 center grids,
// random data, periodic in all directions), writes its value on a
// 200x200x200 scan to "3d_d.dat" (one output line per x index), prints the
// spline's sp_code and destroys the spline.
void
TestNUB_3d_d()
{
  int Mx=20, My=27, Mz=23;
  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
  NUgrid *z_grid = create_center_grid (-1.8, 2.0, 2.8, Mz);

  // Row-major storage with z fastest, so a flat loop visits the elements in
  // the same order as nested ix/iy/iz loops.
  double data[Mx*My*Mz];
  for (int n=0; n<Mx*My*Mz; n++)
    data[n] = -1.0+2.0*drand48();

  BCtype_d xBC, yBC, zBC;
  xBC.lCode = PERIODIC;
  xBC.rCode = PERIODIC;
  yBC.lCode = PERIODIC;
  yBC.rCode = PERIODIC;
  zBC.lCode = PERIODIC;
  zBC.rCode = PERIODIC;
  NUBspline_3d_d *spline = create_NUBspline_3d_d (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);

  int xFine = 200, yFine = 200, zFine=200;
  FILE *fout = fopen ("3d_d.dat", "w");
  double x0 = x_grid->start, x1 = x_grid->end;
  double y0 = y_grid->start, y1 = y_grid->end;
  double z0 = z_grid->start, z1 = z_grid->end;
  for (int i=0; i<xFine; i++) {
    double x = x0+ (double)i/(double)(xFine)*(x1-x0);
    for (int j=0; j<yFine; j++) {
      double y = y0 + (double)j/(double)(yFine)*(y1-y0);
      for (int k=0; k<zFine; k++) {
        double z = z0 + (double)k/(double)(zFine)*(z1-z0);
        double val, grad[3], hess[9];
        eval_NUBspline_3d_d_vgh (spline, x, y, z, &val, grad, hess);
        fprintf (fout, "%1.16e ", val);
      }
    }
    fprintf (fout, "\n");
  }
  fclose (fout);
  fprintf (stderr, "spline->sp_code = %d\n", spline->sp_code);
  destroy_Bspline (spline);
}
// Builds a single-precision complex 3D non-uniform spline (20x27x23 center
// grids, random data, periodic in all directions) and writes the real and
// imaginary parts of its value on a 200x200x200 scan to "3d_r.dat" and
// "3d_i.dat", one output line per x index.
void
TestNUB_3d_c()
{
  int Mx=20, My=27, Mz=23;
  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
  NUgrid *z_grid = create_center_grid (-1.8, 2.0, 2.8, Mz);

  // Row-major storage with z fastest, so a flat loop visits the elements in
  // the same order as nested ix/iy/iz loops.
  complex_float data[Mx*My*Mz];
  for (int n=0; n<Mx*My*Mz; n++)
    data[n] = -1.0+2.0*drand48() + 1.0if*(-1.0+2.0*drand48());

  BCtype_c xBC, yBC, zBC;
  xBC.lCode = PERIODIC;
  xBC.rCode = PERIODIC;
  yBC.lCode = PERIODIC;
  yBC.rCode = PERIODIC;
  zBC.lCode = PERIODIC;
  zBC.rCode = PERIODIC;
  NUBspline_3d_c *spline = create_NUBspline_3d_c (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);

  int xFine = 200, yFine = 200, zFine=200;
  FILE *rout = fopen ("3d_r.dat", "w");
  FILE *iout = fopen ("3d_i.dat", "w");
  double x0 = x_grid->start, x1 = x_grid->end;
  double y0 = y_grid->start, y1 = y_grid->end;
  double z0 = z_grid->start, z1 = z_grid->end;
  for (int i=0; i<xFine; i++) {
    double x = x0+ (double)i/(double)(xFine)*(x1-x0);
    for (int j=0; j<yFine; j++) {
      double y = y0 + (double)j/(double)(yFine)*(y1-y0);
      for (int k=0; k<zFine; k++) {
        double z = z0 + (double)k/(double)(zFine)*(z1-z0);
        complex_float val, grad[3], hess[9];
        eval_NUBspline_3d_c_vgh (spline, x, y, z, &val, grad, hess);
        fprintf (rout, "%1.16e ", crealf(val));
        fprintf (iout, "%1.16e ", cimagf(val));
      }
    }
    fprintf (rout, "\n");
    fprintf (iout, "\n");
  }
  fclose (rout);
  fclose (iout);
}
// Builds a double-precision complex 3D non-uniform spline (20x27x23 center
// grids, random data, periodic in all directions) and writes the real and
// imaginary parts of its value on a 200x200x200 scan to "3d_r.dat" and
// "3d_i.dat", one output line per x index.
// NOTE(review): these are the same file names the single-precision complex
// test writes, so running both overwrites the earlier output -- confirm that
// this is intended.
void
TestNUB_3d_z()
{
  int Mx=20, My=27, Mz=23;
  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
  NUgrid *z_grid = create_center_grid (-1.8, 2.0, 2.8, Mz);
  complex_double data[Mx*My*Mz];
  for (int ix=0; ix<Mx; ix++)
    for (int iy=0; iy<My; iy++)
      for (int iz=0; iz<Mz; iz++)
        data[(ix*My+iy)*Mz+iz] = -1.0+2.0*drand48() + 1.0if*(-1.0+2.0*drand48());
  BCtype_z xBC, yBC, zBC;
  xBC.lCode = PERIODIC; xBC.rCode = PERIODIC;
  yBC.lCode = PERIODIC; yBC.rCode = PERIODIC;
  zBC.lCode = PERIODIC; zBC.rCode = PERIODIC;
  NUBspline_3d_z *spline = create_NUBspline_3d_z (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
  int xFine = 200, yFine = 200, zFine=200;
  FILE *rout = fopen ("3d_r.dat", "w");
  FILE *iout = fopen ("3d_i.dat", "w");
  double xi = x_grid->start; double xf = x_grid->end;
  double yi = y_grid->start; double yf = y_grid->end;
  double zi = z_grid->start; double zf = z_grid->end;
  for (int ix=0; ix<xFine; ix++) {
    double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
    for (int iy=0; iy<yFine; iy++) {
      double y = yi + (double)iy/(double)(yFine)*(yf-yi);
      for (int iz=0; iz<zFine; iz++) {
        double z = zi + (double)iz/(double)(zFine)*(zf-zi);
        complex_double val, grad[3], hess[9];
        eval_NUBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
        // creal/cimag keep double precision; the float variants would
        // truncate the value before it is printed with 17 digits.
        fprintf (rout, "%1.16e ", creal(val));
        fprintf (iout, "%1.16e ", cimag(val));
      }
    }
    fprintf (rout, "\n");
    fprintf (iout, "\n");
  }
  fclose (rout);
  fclose (iout);
}
// Times 10,000,000 value/gradient/Hessian evaluations of a 200x200x200
// periodic single-precision non-uniform spline at random points.  A loop that
// only generates the random coordinates is timed first and its duration is
// subtracted from the reported figure.
void
SpeedNUB_3d_s()
{
  int Mx=200, My=200, Mz=200;
  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 1.0001, Mx);
  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 1.0001, My);
  NUgrid *z_grid = create_center_grid (-1.8, 2.0, 1.0001, Mz);
  float *data;
  data = malloc (sizeof(float)*Mx*My*Mz);
  if (data == NULL) {
    fprintf (stderr, "SpeedNUB_3d_s: could not allocate input data.\n");
    return;
  }
  for (int ix=0; ix<Mx; ix++)
    for (int iy=0; iy<My; iy++)
      for (int iz=0; iz<Mz; iz++)
        data[(ix*My+iy)*Mz+iz] = -1.0+2.0*drand48();
  BCtype_s xBC, yBC, zBC;
  xBC.lCode = PERIODIC; xBC.rCode = PERIODIC;
  yBC.lCode = PERIODIC; yBC.rCode = PERIODIC;
  zBC.lCode = PERIODIC; zBC.rCode = PERIODIC;
  NUBspline_3d_s *spline = create_NUBspline_3d_s (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
  float val, grad[3], hess[9];
  clock_t start, end, rstart, rend;
  // Baseline: cost of generating the random points alone.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
  }
  rend = clock();
  // Measured loop: same point generation plus the spline evaluation.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
    eval_NUBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
  // The input array is not referenced again in this function.
  free (data);
}
// Times 10,000,000 value/gradient/Hessian evaluations of a 200x200x200
// periodic double-complex non-uniform spline at random points.  A loop that
// only generates the random coordinates is timed first and its duration is
// subtracted from the reported figure.
void
SpeedNUB_3d_z()
{
  int Mx=200, My=200, Mz=200;
  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
  NUgrid *z_grid = create_center_grid (-1.8, 2.0, 2.8, Mz);
  complex_double *data = malloc (sizeof(complex_double)*Mx*My*Mz);
  if (data == NULL) {
    fprintf (stderr, "SpeedNUB_3d_z: could not allocate input data.\n");
    return;
  }
  for (int ix=0; ix<Mx; ix++)
    for (int iy=0; iy<My; iy++)
      for (int iz=0; iz<Mz; iz++)
        data[(ix*My+iy)*Mz+iz] = -1.0+2.0*drand48() + 1.0if*(-1.0+2.0*drand48());
  BCtype_z xBC, yBC, zBC;
  xBC.lCode = PERIODIC; xBC.rCode = PERIODIC;
  yBC.lCode = PERIODIC; yBC.rCode = PERIODIC;
  zBC.lCode = PERIODIC; zBC.rCode = PERIODIC;
  NUBspline_3d_z *spline = create_NUBspline_3d_z (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
  complex_double val, grad[3], hess[9];
  clock_t start, end, rstart, rend;
  // Baseline: cost of generating the random points alone.
  rstart = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
  }
  rend = clock();
  // Measured loop: same point generation plus the spline evaluation.
  start = clock();
  for (int i=0; i<10000000; i++) {
    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
    eval_NUBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
  }
  end = clock();
  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
  // The input array is not referenced again in this function.
  free (data);
}
void
TestNUB_2d_d()
{
int Mx=30, My=35;
NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
double data[Mx*My];
for (int ix=0; ix<Mx; ix++)
for (int iy=0; iy<My; iy++)
data[ix*My+iy] = -1.0+2.0*drand48();
BCtype_d xBC, yBC;
xBC.lCode = PERIODIC;
yBC.lCode = PERIODIC;
// xBC.lCode = FLAT; xBC.rCode = FLAT;
// yBC.lCode = FLAT; yBC.rCode = FLAT;
NUBspline_2d_d *spline = create_NUBspline_2d_d (x_grid, y_grid, xBC, yBC, data);
int xFine = 400;
int yFine = 400;
FILE *fout = fopen ("2d_d.dat", "w");
double xi = x_grid->start;
double xf = x_grid->end;// + x_grid->points[1] - x_grid->points[0];
double yi = y_grid->start;
double yf = y_grid->end;// + y_grid->points[1] - y_grid->points[0];
for (int ix=0; ix<xFine; ix++) {
double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
for (int iy=0; iy<yFine; iy++) {
double y = yi + (double)iy/(double)(yFine)*(yf-yi);
double val;
eval_NUBspline_2d_d (spline, x, y, &val);
fprintf (fout, "%1.16e ", val);
}
fprintf (fout, "\n");
}
fclose (fout);
}
// Test driver for the non-uniform spline routines.  Only the double-precision
// 1D sine-fit test is enabled; the remaining tests and benchmarks are kept
// below as switches to uncomment when needed.
int main(void)
{
  /* Grid and basis checks */
  // TestCenterGrid();
  // TestGeneralGrid();
  // GridSpeedTest();
  // TestNUBasis();
  // TestNUBasis();

  TestNUBspline_d();

  /* Multi-dimensional tests and benchmarks */
  // TestNUB_2d_s();
  // TestNUB_2d_c();
  // TestNUB_3d_c();
  // SpeedNUB_3d_s();
  // TestNUB_2d_d();
  // TestNUB_3d_d();
  // TestNUB_3d_z();
  //SpeedNUB_3d_z();
  // bool passed = TestNUB_1d_s();

  return 0;
}

View File

@ -0,0 +1,49 @@
#ifndef ALIGNED_ALLOC_H
#define ALIGNED_ALLOC_H
#include <stdlib.h>
#include "config.h"
#ifdef HAVE_POSIX_MEMALIGN
// Allocates `size` bytes whose address is a multiple of `alignment`, using
// posix_memalign.  Returns NULL when the allocation fails or the alignment is
// rejected, instead of handing back an uninitialized pointer.
// NOTE(review): the parameter order here is (size, alignment), the reverse of
// the C11 standard-library aligned_alloc(alignment, size) -- confirm callers
// and toolchains never mix the two.
inline void *
aligned_alloc (size_t size, size_t alignment)
{
  void *ptr = NULL;
  if (posix_memalign (&ptr, alignment, size) != 0)
    return NULL;
  return ptr;
}
// Releases a pointer returned by the posix_memalign-based aligned_alloc
// above by passing it straight to free().
inline void
aligned_free (void *ptr)
{
free (ptr);
}
#else
// Portable fallback: allocates `size` bytes whose address is a multiple of
// `alignment` on top of plain malloc.  The block is over-allocated by
// (alignment-1) + sizeof(void*) bytes; the pointer malloc returned is stored
// in the slot immediately before the aligned address so aligned_free can
// recover it.  Returns NULL when malloc fails.
// NOTE(review): the parameter order is (size, alignment), the reverse of the
// C11 standard-library aligned_alloc(alignment, size).
inline void *
aligned_alloc (size_t size, size_t alignment)
{
  size_t total = size + (alignment-1) + sizeof(void*);
  char *raw = malloc (total);
  if (raw == NULL)
    return NULL;
  // Leave room for the back-pointer, then round up to the next multiple of
  // alignment.  When already aligned the offset must be 0 (not alignment),
  // otherwise the usable region would run one byte past the allocation.
  char *shifted = raw + sizeof(void*);
  size_t misalign = (size_t)shifted % alignment;
  size_t offset = (misalign == 0) ? 0 : alignment - misalign;
  char *aligned = shifted + offset;
  *((void**)aligned-1) = raw;
  return aligned;
}
inline void
aligned_free (void *aligned)
{
void *ptr = *((void**)aligned-1);
free (ptr);
}
#endif
#endif

176
src/einspline/blip_create.c Normal file
View File

@ -0,0 +1,176 @@
#include "blip_create.h"
#include <math.h>
#include <complex.h>
#include <fftw3.h>
#include "config.h"
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#define _XOPEN_SOURCE 600
#include <stdlib.h>
#include <math.h>
#include <aligned_alloc.h>
void init_sse_data();
// Rounds ptr up to the next 16-byte boundary; a pointer that is already
// 16-byte aligned is returned unchanged.  The masking is written out
// explicitly rather than relying on `-` binding tighter than `&`.
// Declared static so the file-local helper always has a definition under
// C99 inline rules.
static inline
void* FFTAlign (void* ptr)
{
  size_t addr = (size_t)ptr;
  size_t offset = (16 - (addr & 0x0f)) & 0x0f;
  return (void*) (addr + offset);
}
// Dot product of two 3-component vectors.  Inputs are only read.
// Declared static so the file-local helper always has a definition under
// C99 inline rules, even when the compiler chooses not to inline it.
static inline double dot (const double a[3], const double b[3])
{
  return (a[0]*b[0] + a[1]*b[1] + a[2]*b[2]);
}
// This function creates a single-precision real blip function from a
// set of plane-wave coefficients.  lattice is a 3x3 array specifying
// the lattice vectors.  The first lattice vector is given
// contiguously at lattice[0], the second at lattice[3], and the third
// at lattice[6].  The next is a list of 3D G-vectors in the format:
// G_x[0] G_y[0] G_z[0], G_x[1], G_y[1], G_z[1],...
// Next, complex plane-wave coefficients are given, one for each
// G-vector.  Next, the number of G-vectors is given, followed by
// a factor which increases the density of the real-space grid.  A
// factor of 1.0 uses the minimum density to avoid aliasing.  Finally,
// the last parameter specifies whether to take the real or imaginary part.
// The spline is constructed to have domain [0,1) for x, y, and z coordinates.
// Returns the newly allocated spline, or NULL if an allocation fails.
UBspline_3d_s*
create_blip_3d_s (double *lattice, double *Gvecs,
                  complex_float *coefs, int numG,
                  double factor, bool useReal)
{
  int max_ix=0, max_iy=0, max_iz=0;
  int Mx, My, Mz;
  double twoPiInv = 1.0/(2.0*M_PI);
  // Find the largest integer index magnitude along each lattice direction.
  for (int i=0; i<numG; i++) {
    double *G = Gvecs+3*i;
    int ix = round (twoPiInv * dot (lattice+0, G));
    int iy = round (twoPiInv * dot (lattice+3, G));
    int iz = round (twoPiInv * dot (lattice+6, G));
    // Record the magnitude, not the signed index: a large negative index
    // must enlarge the box just like a large positive one.
    if (abs(ix) > max_ix) max_ix = abs(ix);
    if (abs(iy) > max_iy) max_iy = abs(iy);
    if (abs(iz) > max_iz) max_iz = abs(iz);
  }
  Mx = 4*max_ix + 1;
  My = 4*max_iy + 1;
  Mz = 4*max_iz + 1;
  Mx = (int) ceil(factor*Mx);
  My = (int) ceil(factor*My);
  Mz = (int) ceil(factor*Mz);
  // FFTs are a little faster with even dimensions.
  if ((Mx%2)==1) Mx++;
  if ((My%2)==1) My++;
  if ((Mz%2)==1) Mz++;
  fprintf (stderr, "(Mx, My, Mz) = (%d, %d, %d)\n", Mx, My, Mz);

  // Now allocate space for FFT box
  complex_float *fft_box;
  fft_box = aligned_alloc (sizeof(complex_float)*Mx*My*Mz, 16);
  if (fft_box == NULL)
    return NULL;

  // Create FFTW plan (in-place transform, exponent sign +1)
  fftwf_plan plan =
    fftwf_plan_dft_3d (Mx, My, Mz, (fftwf_complex*)fft_box, (fftwf_complex*)fft_box, 1,
                       FFTW_ESTIMATE);

  // Zero-out fft-box
  for (int i=0; i<Mx*My*Mz; i++)
    fft_box[i] = (complex_float)0.0f;

  // Now fill in fft box with coefficients in the right places
  double MxInv = 1.0/(double)Mx;
  double MyInv = 1.0/(double)My;
  double MzInv = 1.0/(double)Mz;
  double scale = 1.0/3.375;
  for (int i=0; i<numG; i++) {
    double *g = Gvecs+3*i;
    double G[3];
    G[0] = MxInv*(lattice[0]*g[0] + lattice[3]*g[1] + lattice[6]*g[2]);
    G[1] = MyInv*(lattice[1]*g[0] + lattice[4]*g[1] + lattice[7]*g[2]);
    G[2] = MzInv*(lattice[2]*g[0] + lattice[5]*g[1] + lattice[8]*g[2]);
    int ix = round (twoPiInv * dot (lattice+0, g));
    int iy = round (twoPiInv * dot (lattice+3, g));
    int iz = round (twoPiInv * dot (lattice+6, g));
    // Wrap negative indices into [0, M).
    ix = (ix + Mx)%Mx;
    iy = (iy + My)%My;
    iz = (iz + Mz)%Mz;
    // Per-direction factor; the small-|G| branch uses the constant 1.5 to
    // avoid dividing by G^4 near zero.
    double gamma = 1.0;
    if (fabs(G[0]) > 1.0e-10)
      gamma *= (3.0/(G[0]*G[0]*G[0]*G[0])*(3.0 - 4.0*cos(G[0]) + cos(2.0*G[0])));
    else
      gamma *= 1.5;
    if (fabs(G[1]) > 1.0e-10)
      gamma *= (3.0/(G[1]*G[1]*G[1]*G[1])*(3.0 - 4.0*cos(G[1]) + cos(2.0*G[1])));
    else
      gamma *= 1.5;
    if (fabs(G[2]) > 1.0e-10)
      gamma *= (3.0/(G[2]*G[2]*G[2]*G[2])*(3.0 - 4.0*cos(G[2]) + cos(2.0*G[2])));
    else
      gamma *= 1.5;
    gamma *= scale;
    fft_box[(ix*My+iy)*Mz+iz] = coefs[i]/gamma;
  }

  // Execute the FFTW plan
  fftwf_execute (plan);
  // Destroy plan
  fftwf_destroy_plan (plan);

  // Now we have the coefficients in the FFT box.  We must allocate a
  // little bit larger box to hold the B-spline coefficients
  UBspline_3d_s* restrict spline = malloc (sizeof (UBspline_3d_s));
  if (spline == NULL) {
    aligned_free (fft_box);
    return NULL;
  }
  spline->spcode = U3D;
  spline->tcode  = SINGLE_REAL;
  Ugrid x_grid, y_grid, z_grid;
  int Nx = Mx + 3;
  int Ny = My + 3;
  int Nz = Mz + 3;
  x_grid.start = 0.0;  x_grid.end = 1.0;  x_grid.num = Mx;
  x_grid.delta = 1.0/(double)Mx; x_grid.delta_inv = 1.0/x_grid.delta;
  y_grid.start = 0.0;  y_grid.end = 1.0;  y_grid.num = My;
  y_grid.delta = 1.0/(double)My; y_grid.delta_inv = 1.0/y_grid.delta;
  z_grid.start = 0.0;  z_grid.end = 1.0;  z_grid.num = Mz;
  z_grid.delta = 1.0/(double)Mz; z_grid.delta_inv = 1.0/z_grid.delta;
  spline->x_grid = x_grid;
  spline->y_grid = y_grid;
  spline->z_grid = z_grid;
  spline->x_stride = Ny*Nz;
  spline->y_stride = Nz;
  spline->xBC.lCode = PERIODIC;  spline->xBC.rCode = PERIODIC;
  spline->yBC.lCode = PERIODIC;  spline->yBC.rCode = PERIODIC;
  spline->zBC.lCode = PERIODIC;  spline->zBC.rCode = PERIODIC;
#ifndef HAVE_SSE2
  spline->coefs = malloc (sizeof(float)*Nx*Ny*Nz);
#else
  // posix_memalign leaves the pointer unspecified on failure; normalize it.
  if (posix_memalign ((void**)&spline->coefs, 16, sizeof(float)*Nx*Ny*Nz) != 0)
    spline->coefs = NULL;
#endif
  if (spline->coefs == NULL) {
    free (spline);
    aligned_free (fft_box);
    return NULL;
  }

  // Now copy data into spline coefficients, observing periodic boundary conditions
  for (int ix=0; ix<Nx; ix++) {
    int jx = (ix-1 + Mx)%Mx;
    for (int iy=0; iy < Ny; iy++) {
      int jy = (iy-1 + My)%My;
      for (int iz=0; iz < Nz; iz++) {
        int jz = (iz-1 + Mz)%Mz;
        if (useReal)
          spline->coefs[(ix*Ny+iy)*Nz+iz] =
            crealf (fft_box[(jx*My+jy)*Mz+jz]);
        else
          spline->coefs[(ix*Ny+iy)*Nz+iz] =
            cimagf (fft_box[(jx*My+jy)*Mz+jz]);
      }
    }
  }
  aligned_free (fft_box);
  init_sse_data();
  return spline;
}

View File

@ -0,0 +1,56 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BLIP_CREATE_H
#define BLIP_CREATE_H
#include "bspline_base.h"
#include "bspline_structs.h"
#include <stdbool.h>
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
//// Blip creation functions ////
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
// Single-precision real blip.
//   lattice - 3x3 array of lattice vectors, stored contiguously
//             (vector 0 at [0], vector 1 at [3], vector 2 at [6])
//   Gvecs   - G-vectors as G_x[0] G_y[0] G_z[0], G_x[1], ...
//   coefs   - one complex plane-wave coefficient per G-vector
//   numG    - number of G-vectors
//   factor  - multiplier for the real-space grid density
//   useReal - true to take the real part, false for the imaginary part
UBspline_3d_s*
create_blip_3d_s (double *lattice, double *Gvecs,
complex_float *coefs, int numG,
double factor, bool useReal);
// Double-precision real blip.
// NOTE(review): presumably the double-precision analogue of
// create_blip_3d_s with the same argument meaning -- the definition is not
// visible here, confirm.
UBspline_3d_d*
create_blip_3d_d (double *lattice, double *Gvecs,
complex_double *coefs, int numG,
double factor, bool useReal);
// Single-precision complex blip (no useReal selector).
// NOTE(review): argument meaning presumably matches create_blip_3d_s --
// confirm against the definition.
UBspline_3d_c*
create_blip_3d_c (double *lattice, double *Gvecs,
complex_float *coefs, int numG,
double factor);
// Double-precision complex blip (no useReal selector).
// NOTE(review): argument meaning presumably matches create_blip_3d_s --
// confirm against the definition.
UBspline_3d_z*
create_blip_3d_z (double *lattice, double *Gvecs,
complex_double *coefs, int numG,
double factor);
#endif

58
src/einspline/bspline.h Normal file
View File

@ -0,0 +1,58 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_H
#define BSPLINE_H
#include "bspline_base.h"
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
//// Bspline structure definitions ////
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
#include "bspline_structs.h"
#include "multi_bspline_structs.h"
// Currently, some of the single-precision routines use SSE2 instructions
#ifdef HAVE_SSE2
#include "bspline_eval_sse_s.h"
#include "bspline_eval_sse_c.h"
#include "bspline_eval_sse_d.h"
#include "bspline_eval_sse_z.h"
#elif defined HAVE_SSE
#include "bspline_eval_sse_s.h"
#include "bspline_eval_sse_c.h"
#include "bspline_eval_std_d.h"
#include "bspline_eval_std_z.h"
#elif defined USE_ALTIVEC
#include "bspline_eval_altivec_s.h"
#include "bspline_eval_std_c.h"
#include "bspline_eval_std_d.h"
#include "bspline_eval_std_z.h"
#else
#include "bspline_eval_std_s.h"
#include "bspline_eval_std_c.h"
#include "bspline_eval_std_d.h"
#include "bspline_eval_std_z.h"
#endif
#include "bspline_create.h"
#include "multi_bspline_create.h"
#endif

View File

@ -0,0 +1,104 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_BASE_H
#define BSPLINE_BASE_H
#include "config.h"
// complex_float / complex_double name the complex element types used by the
// library's interfaces.  C++ translation units get std::complex<T>; C
// translation units get the C99 <complex.h> types.
#ifdef __cplusplus
#include <complex>
typedef std::complex<float> complex_float;
typedef std::complex<double> complex_double;
#else
#include <complex.h>
typedef complex float complex_float;
typedef complex double complex_double;
#endif
// Conventions:
// Postfixes:
// s: single precision real
// d: double precision real
// c: single precision complex
// z: double precision complex
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
//// Basic type declarations ////
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
// Boundary-condition selector stored in the lCode/rCode fields of the
// BCtype_* structs.
// NOTE(review): by name, DERIV1/DERIV2 presumably fix the first/second
// derivative at the boundary, FLAT a zero first derivative and NATURAL a zero
// second derivative -- confirm against the solver code.
typedef enum { PERIODIC, DERIV1, DERIV2, FLAT, NATURAL, ANTIPERIODIC } bc_code;
// Tag identifying the kind of spline object (stored in Bspline::sp_code).
// The digit is the dimensionality; the MULTI_ prefix marks the multi-spline
// variants.  NOTE(review): U/NU presumably mean uniform/nonuniform grid.
typedef enum { U1D , U2D , U3D ,
NU1D , NU2D , NU3D ,
MULTI_U1D , MULTI_U2D , MULTI_U3D,
MULTI_NU1D, MULTI_NU2D, MULTI_NU3D } spline_code;
// Tag identifying the coefficient element type (stored in Bspline::t_code);
// the four values correspond to the s, d, c and z postfixes listed above.
typedef enum { SINGLE_REAL, DOUBLE_REAL, SINGLE_COMPLEX, DOUBLE_COMPLEX }
type_code;
// Boundary-condition descriptors for one axis, one struct per element type.
// lCode/rCode select the condition applied at each end of the axis.
// NOTE(review): "l"/"r" presumably mean the left (start) and right (end) grid
// boundary, and the *Val fields the value a DERIV1/DERIV2 condition imposes
// -- confirm against the creation routines.

// Single-precision real.
typedef struct
{
bc_code lCode, rCode;
float lVal, rVal;
} BCtype_s;
// Double-precision real.
typedef struct
{
bc_code lCode, rCode;
double lVal, rVal;
} BCtype_d;
// Single-precision complex: each boundary value is split into separate
// _r / _i fields (presumably real and imaginary parts).
typedef struct
{
bc_code lCode, rCode;
float lVal_r, lVal_i, rVal_r, rVal_i;
} BCtype_c;
// Double-precision complex: each boundary value is split into separate
// _r / _i fields (presumably real and imaginary parts).
typedef struct
{
bc_code lCode, rCode;
double lVal_r, lVal_i, rVal_r, rVal_i;
} BCtype_z;
// Description of a uniform grid along one axis: the interval [start, end]
// and the number of grid points, num.
typedef struct
{
double start, end;
int num;
// private
// NOTE(review): delta / delta_inv are presumably the grid spacing and its
// reciprocal, filled in by the library rather than by the caller -- confirm.
double delta, delta_inv;
} Ugrid;
// Generic spline header: the kind of spline (sp_code), its element type
// (t_code) and an untyped pointer to the coefficient storage.
// NOTE(review): presumably every concrete spline struct starts with these
// same fields so that it can be handled through a Bspline* -- confirm against
// bspline_structs.h.
typedef struct
{
spline_code sp_code;
type_code t_code;
// `restrict` is a C99 keyword; C++ builds rely on the build flags mapping it
// to a compiler extension (e.g. -Drestrict=__restrict__).
void *restrict coefs;
} Bspline;
// Destroys a spline object.  The argument is an untyped pointer, and the
// declaration is given C linkage when compiled as C++.
// NOTE(review): presumably accepts any spline created by this library and
// releases both the coefficients and the struct itself -- confirm against the
// implementation.
#ifdef __cplusplus
extern "C"
#endif
void
destroy_Bspline (void *spline);
#endif

View File

@ -0,0 +1,38 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007-2010 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_BASE_CUDA_H
#define BSPLINE_BASE_CUDA_H
#include <cuda.h>
// Fallback definitions of the double3 / double4 vector types, compiled only
// when the CUDA toolkit reports a version older than 3.0.
// NOTE(review): presumably newer toolkits already provide these types, which
// is why they are not redefined there -- confirm.
#if CUDA_VERSION < 3000 /* 3.0 */
typedef struct
{
double x,y,z;
} double3;
typedef struct
{
double x,y,z,w;
} double4;
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,153 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_CREATE_H
#define BSPLINE_CREATE_H
#include "bspline_base.h"
#include "bspline_structs.h"
#ifdef __cplusplus
extern "C" {
#endif
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
//// Spline creation functions ////
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
/////////////////////////////////////
// Uniform, single precision, real //
/////////////////////////////////////
// Create 1D uniform single-precision, real Bspline
UBspline_1d_s *
create_UBspline_1d_s (Ugrid x_grid, BCtype_s xBC, float *data);
// Create 2D uniform single-precision, real Bspline
UBspline_2d_s *
create_UBspline_2d_s (Ugrid x_grid, Ugrid y_grid,
BCtype_s xBC, BCtype_s yBC,
float *data);
// Create 3D uniform single-precision, real Bspline
UBspline_3d_s *
create_UBspline_3d_s (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_s xBC, BCtype_s yBC, BCtype_s zBC,
float *data);
// Recompute routines take an existing spline and a data array.
// NOTE(review): presumably they refit the spline's coefficients to the new
// data while keeping its grids and boundary conditions -- confirm.
void
recompute_UBspline_1d_s (UBspline_1d_s* spline, float *data);
void
recompute_UBspline_2d_s (UBspline_2d_s* spline, float *data);
void
recompute_UBspline_3d_s (UBspline_3d_s* spline, float *data);
/////////////////////////////////////
// Uniform, double precision, real //
/////////////////////////////////////
// Create 1D uniform double-precision, real Bspline
UBspline_1d_d *
create_UBspline_1d_d (Ugrid x_grid, BCtype_d xBC, double *data);
// Create 2D uniform double-precision, real Bspline
UBspline_2d_d *
create_UBspline_2d_d (Ugrid x_grid, Ugrid y_grid,
BCtype_d xBC, BCtype_d yBC,
double *data);
// Create 3D uniform double-precision, real Bspline
UBspline_3d_d *
create_UBspline_3d_d (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_d xBC, BCtype_d yBC, BCtype_d zBC,
double *data);
// Recompute routines take an existing spline and a data array.
// NOTE(review): presumably they refit the spline's coefficients to the new
// data while keeping its grids and boundary conditions -- confirm.
void
recompute_UBspline_1d_d (UBspline_1d_d* spline, double *data);
void
recompute_UBspline_2d_d (UBspline_2d_d* spline, double *data);
void
recompute_UBspline_3d_d (UBspline_3d_d* spline, double *data);
///////////////////////////////////////
// Uniform, single precision, complex//
///////////////////////////////////////
// Create 1D uniform single-precision, complex Bspline
UBspline_1d_c *
create_UBspline_1d_c (Ugrid x_grid, BCtype_c xBC, complex_float *data);
// Create 2D uniform single-precision, complex Bspline
UBspline_2d_c *
create_UBspline_2d_c (Ugrid x_grid, Ugrid y_grid,
BCtype_c xBC, BCtype_c yBC,
complex_float *data);
// Create 3D uniform single-precision, complex Bspline
UBspline_3d_c *
create_UBspline_3d_c (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_c xBC, BCtype_c yBC, BCtype_c zBC,
complex_float *data);
// Recompute routines take an existing spline and a data array.
// NOTE(review): presumably they refit the spline's coefficients to the new
// data while keeping its grids and boundary conditions -- confirm.
void
recompute_UBspline_1d_c (UBspline_1d_c* spline, complex_float *data);
void
recompute_UBspline_2d_c (UBspline_2d_c* spline, complex_float *data);
void
recompute_UBspline_3d_c (UBspline_3d_c* spline, complex_float *data);
///////////////////////////////////////
// Uniform, double precision, complex//
///////////////////////////////////////
// Create 1D uniform double-precision, complex Bspline
UBspline_1d_z *
create_UBspline_1d_z (Ugrid x_grid, BCtype_z xBC, complex_double *data);
// Create 2D uniform double-precision, complex Bspline
UBspline_2d_z *
create_UBspline_2d_z (Ugrid x_grid, Ugrid y_grid,
BCtype_z xBC, BCtype_z yBC,
complex_double *data);
// Create 3D uniform double-precision, complex Bspline
UBspline_3d_z *
create_UBspline_3d_z (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_z xBC, BCtype_z yBC, BCtype_z zBC,
complex_double *data);
// Recompute routines take an existing spline and a data array.
// NOTE(review): presumably they refit the spline's coefficients to the new
// data while keeping its grids and boundary conditions -- confirm.
void
recompute_UBspline_1d_z (UBspline_1d_z* spline, complex_double *data);
void
recompute_UBspline_2d_z (UBspline_2d_z* spline, complex_double *data);
void
recompute_UBspline_3d_z (UBspline_3d_z* spline, complex_double *data);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,403 @@
#include <stdio.h>
#include "bspline_base.h"
#include "bspline_structs.h"
#include "bspline_structs_cuda.h"
// Device-side copies of the 12x4 basis-coefficient table, uploaded by the
// create_UBspline_3d_*_cuda functions in this file with cudaMemcpyToSymbol.
// Bcuda is the double-precision table and is declared __device__; Acuda is
// the single-precision table and is declared __constant__.
__device__ double Bcuda[48];
__constant__ float Acuda[48];
// #include "bspline_cuda_s_impl.h"
// #include "bspline_cuda_c_impl.h"
// #include "bspline_cuda_d_impl.h"
// #include "bspline_cuda_z_impl.h"
// Builds a GPU copy of a single-precision complex 3D uniform B-spline.
//
// The 12x4 basis table is uploaded to the Acuda symbol, a host-side
// UBspline_3d_c_cuda descriptor is malloc'ed, and the coefficients are
// repacked so that every z-row is padded with zeros up to a multiple of 32
// elements before being copied to freshly cudaMalloc'ed device memory.
// Returns the host-side descriptor.  No CUDA or malloc return codes are
// checked here.
extern "C" UBspline_3d_c_cuda*
create_UBspline_3d_c_cuda (UBspline_3d_c* spline)
{
  // Row r of this table is dotted with (t^3, t^2, t, 1) by the evaluation
  // kernels: rows 0-3 give the value weights, rows 4-7 the first-derivative
  // weights and rows 8-11 the second-derivative weights.
  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                         0.0,     -0.5,      1.0,    -0.5,
                         0.0,      1.5,     -2.0,     0.0,
                         0.0,     -1.5,      1.0,     0.5,
                         0.0,      0.5,      0.0,     0.0,
                         0.0,      0.0,     -1.0,     1.0,
                         0.0,      0.0,      3.0,    -2.0,
                         0.0,      0.0,     -3.0,     1.0,
                         0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  UBspline_3d_c_cuda *gpu_spline =
    (UBspline_3d_c_cuda*) malloc (sizeof (UBspline_3d_c_cuda));

  // Coefficient extents: three more than the grid point count per axis.
  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // z extent rounded up to the next multiple of 32.
  int N = ((Nz+31)/32)*32;

  // Strides in complex elements, used while packing the host buffer.
  gpu_spline->stride.x = Ny*N;
  gpu_spline->stride.y = N;

  gpu_spline->gridInv.x = spline->x_grid.delta_inv;
  gpu_spline->gridInv.y = spline->y_grid.delta_inv;
  gpu_spline->gridInv.z = spline->z_grid.delta_inv;

  gpu_spline->dim.x = spline->x_grid.num;
  gpu_spline->dim.y = spline->y_grid.num;
  gpu_spline->dim.z = spline->z_grid.num;

  size_t size = Nx*Ny*N*sizeof(std::complex<float>);
  cudaMalloc((void**)&(gpu_spline->coefs), size);

  std::complex<float> *host_buf = (std::complex<float>*)malloc(size);
  for (int ix = 0; ix < Nx; ++ix) {
    for (int iy = 0; iy < Ny; ++iy) {
      std::complex<float> *dst_row =
        host_buf + (ix*gpu_spline->stride.x + iy*gpu_spline->stride.y);
      // Copy the Nz real coefficients of this row ...
      for (int iz = 0; iz < Nz; ++iz)
        dst_row[iz] = spline->coefs[ix*spline->x_stride +
                                    iy*spline->y_stride + iz];
      // ... and zero the padding that follows them.
      for (int ipad = Nz; ipad < N; ++ipad)
        dst_row[ipad] = 0.0;
    }
  }

  cudaMemcpy(gpu_spline->coefs, host_buf, size, cudaMemcpyHostToDevice);
  free(host_buf);

  // The stored strides are twice the packing strides.
  // NOTE(review): presumably because the kernels address the complex data as
  // interleaved floats -- confirm against the complex evaluation kernels.
  gpu_spline->stride.x = 2*Ny*N;
  gpu_spline->stride.y = 2*N;
  return gpu_spline;
}
// Builds a single-precision complex GPU spline from a double-precision
// complex host spline, converting each coefficient to std::complex<float>.
//
// The 12x4 basis table is uploaded to the Acuda symbol, a host-side
// UBspline_3d_c_cuda descriptor is malloc'ed, and the coefficients are
// repacked so that every z-row is padded with zeros up to a multiple of 32
// elements before being copied to freshly cudaMalloc'ed device memory.
// Returns the host-side descriptor.  No CUDA or malloc return codes are
// checked here.
extern "C" UBspline_3d_c_cuda*
create_UBspline_3d_c_cuda_conv (UBspline_3d_z* spline)
{
  // Row r of this table is dotted with (t^3, t^2, t, 1) by the evaluation
  // kernels: rows 0-3 give the value weights, rows 4-7 the first-derivative
  // weights and rows 8-11 the second-derivative weights.
  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                         0.0,     -0.5,      1.0,    -0.5,
                         0.0,      1.5,     -2.0,     0.0,
                         0.0,     -1.5,      1.0,     0.5,
                         0.0,      0.5,      0.0,     0.0,
                         0.0,      0.0,     -1.0,     1.0,
                         0.0,      0.0,      3.0,    -2.0,
                         0.0,      0.0,     -3.0,     1.0,
                         0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  UBspline_3d_c_cuda *cuda_spline =
    (UBspline_3d_c_cuda*) malloc (sizeof (UBspline_3d_c_cuda));

  // Coefficient extents: three more than the grid point count per axis.
  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // z extent rounded up to the next multiple of 32.
  int N = ((Nz+31)/32) * 32;

  // Strides in complex elements, used while packing the host buffer.
  cuda_spline->stride.x = Ny*N;
  cuda_spline->stride.y = N;

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;

  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  size_t size = Nx*Ny*N*sizeof(std::complex<float>);
  cudaMalloc((void**)&(cuda_spline->coefs), size);

  std::complex<float> *spline_buff = (std::complex<float>*)malloc(size);
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++) {
      // Convert the Nz real coefficients of this row to single precision.
      for (int iz=0; iz<Nz; iz++) {
        std::complex<double> z = spline->coefs[ix*spline->x_stride +
                                               iy*spline->y_stride + iz];
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + iz] =
          std::complex<float>(z.real(), z.imag());
      }
      // Zero the row padding exactly once.  This loop used to sit inside the
      // conversion loop (shadowing iz), so the padding was rewritten for every
      // coefficient in the row.
      for (int isp=Nz; isp < N; isp++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + isp] = 0.0;
    }

  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  free(spline_buff);

  // The stored strides are twice the packing strides.
  // NOTE(review): presumably because the kernels address the complex data as
  // interleaved floats -- confirm against the complex evaluation kernels.
  cuda_spline->stride.x = 2*Ny*N;
  cuda_spline->stride.y = 2*N;
  return cuda_spline;
}
// Builds a GPU copy of a single-precision real 3D uniform B-spline.
//
// The 12x4 basis table is uploaded to the Acuda symbol, a host-side
// UBspline_3d_s_cuda descriptor is malloc'ed, and the coefficients are
// repacked so that every z-row is padded with zeros up to a multiple of 32
// elements before being copied to freshly cudaMalloc'ed device memory.
// Returns the host-side descriptor.  No CUDA or malloc return codes are
// checked here.
extern "C" UBspline_3d_s_cuda*
create_UBspline_3d_s_cuda (UBspline_3d_s* spline)
{
  // Row r of this table is dotted with (t^3, t^2, t, 1) by the evaluation
  // kernels: rows 0-3 give the value weights, rows 4-7 the first-derivative
  // weights and rows 8-11 the second-derivative weights.
  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                         0.0,     -0.5,      1.0,    -0.5,
                         0.0,      1.5,     -2.0,     0.0,
                         0.0,     -1.5,      1.0,     0.5,
                         0.0,      0.5,      0.0,     0.0,
                         0.0,      0.0,     -1.0,     1.0,
                         0.0,      0.0,      3.0,    -2.0,
                         0.0,      0.0,     -3.0,     1.0,
                         0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  UBspline_3d_s_cuda *cuda_spline =
    (UBspline_3d_s_cuda*) malloc (sizeof (UBspline_3d_s_cuda));

  // Coefficient extents: three more than the grid point count per axis.
  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // z extent rounded up to the next multiple of 32.
  int N = ((Nz+31)/32)*32;

  cuda_spline->stride.x = Ny*N;
  cuda_spline->stride.y = N;
  cuda_spline->stride.z = 1;

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;

  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  size_t size = Nx*Ny*N*sizeof(float);
  cudaMalloc((void**)&(cuda_spline->coefs), size);

  float *spline_buff = (float*)malloc(size);
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++) {
      for (int iz=0; iz<Nz; iz++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + iz] =
          spline->coefs[ix*spline->x_stride +
                        iy*spline->y_stride + iz];
      // Zero the row padding so that no uninitialised host memory is
      // uploaded (same as the complex single-precision variant).
      for (int isp=Nz; isp < N; isp++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + isp] = 0.0f;
    }

  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  free(spline_buff);
  return cuda_spline;
}
// Builds a single-precision real GPU spline from a double-precision real
// host spline; each coefficient is narrowed to float while being packed.
//
// The 12x4 basis table is uploaded to the Acuda symbol, a host-side
// UBspline_3d_s_cuda descriptor is malloc'ed, and the coefficients are
// repacked so that every z-row is padded with zeros up to a multiple of 32
// elements before being copied to freshly cudaMalloc'ed device memory.
// Aborts the process with a message on stderr if the device allocation or the
// upload reports a CUDA error.  Returns the host-side descriptor.
extern "C" UBspline_3d_s_cuda*
create_UBspline_3d_s_cuda_conv (UBspline_3d_d* spline)
{
  // Row r of this table is dotted with (t^3, t^2, t, 1) by the evaluation
  // kernels: rows 0-3 give the value weights, rows 4-7 the first-derivative
  // weights and rows 8-11 the second-derivative weights.
  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                         0.0,     -0.5,      1.0,    -0.5,
                         0.0,      1.5,     -2.0,     0.0,
                         0.0,     -1.5,      1.0,     0.5,
                         0.0,      0.5,      0.0,     0.0,
                         0.0,      0.0,     -1.0,     1.0,
                         0.0,      0.0,      3.0,    -2.0,
                         0.0,      0.0,     -3.0,     1.0,
                         0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  UBspline_3d_s_cuda *cuda_spline =
    (UBspline_3d_s_cuda*) malloc (sizeof (UBspline_3d_s_cuda));

  // Coefficient extents: three more than the grid point count per axis.
  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // z extent rounded up to the next multiple of 32.
  int N = ((Nz+31)/32)*32;

  cuda_spline->stride.x = Ny*N;
  cuda_spline->stride.y = N;
  cuda_spline->stride.z = 1;

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;

  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  size_t size = Nx*Ny*N*sizeof(float);
  cudaMalloc((void**)&(cuda_spline->coefs), size);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    // size is a size_t; cast it to match the %lu conversion (the previous
    // %ld conversion did not match the argument type).
    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients. Error %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  float *spline_buff = (float*)malloc(size);
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++) {
      for (int iz=0; iz<Nz; iz++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + iz] =
          spline->coefs[ix*spline->x_stride +
                        iy*spline->y_stride + iz];
      // Zero the row padding so that no uninitialised host memory is
      // uploaded (same as the complex single-precision variant).
      for (int isp=Nz; isp < N; isp++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + isp] = 0.0f;
    }

  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Builds a GPU copy of a double-precision real 3D uniform B-spline.
//
// The 12x4 basis table is uploaded to the Bcuda symbol, a host-side
// UBspline_3d_d_cuda descriptor is malloc'ed, and the coefficients are
// repacked so that every z-row is padded with zeros up to a multiple of 32
// elements before being copied to freshly cudaMalloc'ed device memory.
// Returns the host-side descriptor.  No CUDA or malloc return codes are
// checked here.
extern "C" UBspline_3d_d_cuda*
create_UBspline_3d_d_cuda (UBspline_3d_d* spline)
{
  // Same 12x4 table as the single-precision variants, kept in double.
  // NOTE(review): by analogy with the single-precision kernels, rows 0-3
  // presumably give the value weights, rows 4-7 the first-derivative weights
  // and rows 8-11 the second-derivative weights -- confirm against the
  // double-precision kernels.
  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                          0.0,     -0.5,      1.0,    -0.5,
                          0.0,      1.5,     -2.0,     0.0,
                          0.0,     -1.5,      1.0,     0.5,
                          0.0,      0.5,      0.0,     0.0,
                          0.0,      0.0,     -1.0,     1.0,
                          0.0,      0.0,      3.0,    -2.0,
                          0.0,      0.0,     -3.0,     1.0,
                          0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);

  UBspline_3d_d_cuda *cuda_spline =
    (UBspline_3d_d_cuda*) malloc (sizeof (UBspline_3d_d_cuda));

  // Coefficient extents: three more than the grid point count per axis.
  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // z extent rounded up to the next multiple of 32.
  int N = ((Nz+31)/32)*32;

  cuda_spline->stride.x = Ny*N;
  cuda_spline->stride.y = N;
  cuda_spline->stride.z = 1;

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;

  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  size_t size = Nx*Ny*N*sizeof(double);
  cudaMalloc((void**)&(cuda_spline->coefs), size);

  double *spline_buff = (double*)malloc(size);
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++) {
      for (int iz=0; iz<Nz; iz++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + iz] =
          spline->coefs[ix*spline->x_stride +
                        iy*spline->y_stride + iz];
      // Zero the row padding so that no uninitialised host memory is
      // uploaded (same as the complex single-precision variant).
      for (int isp=Nz; isp < N; isp++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + isp] = 0.0;
    }

  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  free(spline_buff);
  return cuda_spline;
}
// Builds a GPU copy of a double-precision complex 3D uniform B-spline.
//
// The 12x4 basis table is uploaded to the Bcuda symbol, a host-side
// UBspline_3d_z_cuda descriptor is malloc'ed, and the coefficients are
// repacked so that every z-row is padded with zeros up to a multiple of 32
// elements before being copied to freshly cudaMalloc'ed device memory.
// Returns the host-side descriptor.  No CUDA or malloc return codes are
// checked here.
extern "C" UBspline_3d_z_cuda*
create_UBspline_3d_z_cuda (UBspline_3d_z* spline)
{
  // Same 12x4 table as the single-precision variants, kept in double.
  // NOTE(review): by analogy with the single-precision kernels, rows 0-3
  // presumably give the value weights, rows 4-7 the first-derivative weights
  // and rows 8-11 the second-derivative weights -- confirm against the
  // double-precision kernels.
  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                          0.0,     -0.5,      1.0,    -0.5,
                          0.0,      1.5,     -2.0,     0.0,
                          0.0,     -1.5,      1.0,     0.5,
                          0.0,      0.5,      0.0,     0.0,
                          0.0,      0.0,     -1.0,     1.0,
                          0.0,      0.0,      3.0,    -2.0,
                          0.0,      0.0,     -3.0,     1.0,
                          0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);

  UBspline_3d_z_cuda *cuda_spline =
    (UBspline_3d_z_cuda*) malloc (sizeof (UBspline_3d_z_cuda));

  // Coefficient extents: three more than the grid point count per axis.
  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // z extent rounded up to the next multiple of 32.
  int N = ((Nz+31)/32)*32;

  // Strides in complex elements, used while packing the host buffer.
  cuda_spline->stride.x = Ny*N;
  cuda_spline->stride.y = N;
  cuda_spline->stride.z = 1;

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;

  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  size_t size = Nx*Ny*N*sizeof(std::complex<double>);
  cudaMalloc((void**)&(cuda_spline->coefs), size);

  std::complex<double> *spline_buff = (std::complex<double>*)malloc(size);
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++) {
      for (int iz=0; iz<Nz; iz++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + iz] =
          spline->coefs[ix*spline->x_stride +
                        iy*spline->y_stride + iz];
      // Zero the row padding so that no uninitialised host memory is
      // uploaded (same as the complex single-precision variant).
      for (int isp=Nz; isp < N; isp++)
        spline_buff[ix*cuda_spline->stride.x +
                    iy*cuda_spline->stride.y + isp] = 0.0;
    }

  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);

  // The stored strides are twice the packing strides.
  // NOTE(review): presumably because the kernels address the complex data as
  // interleaved doubles -- confirm against the complex evaluation kernels.
  cuda_spline->stride.x = 2*Ny*N;
  cuda_spline->stride.y = 2*N;
  cuda_spline->stride.z = 2;

  free(spline_buff);
  return cuda_spline;
}

View File

@ -0,0 +1,26 @@
#ifndef BSPLINE_CREATE_CUDA_H
#define BSPLINE_CREATE_CUDA_H
#include "bspline_structs_cuda.h"
// Constructors for GPU spline descriptors.  Each takes a host-side 3D uniform
// B-spline and returns a pointer to the matching *_cuda descriptor.  The
// _conv variants take a double-precision host spline and return a
// single-precision GPU spline.  All are declared with C linkage.

// Single-precision real host spline -> single-precision real GPU spline.
extern "C" UBspline_3d_s_cuda*
create_UBspline_3d_s_cuda (UBspline_3d_s* spline);
// Double-precision real host spline -> single-precision real GPU spline.
extern "C" UBspline_3d_s_cuda*
create_UBspline_3d_s_cuda_conv (UBspline_3d_d* spline);
// Single-precision complex host spline -> single-precision complex GPU spline.
extern "C" UBspline_3d_c_cuda*
create_UBspline_3d_c_cuda (UBspline_3d_c* spline);
// Double-precision complex host spline -> single-precision complex GPU spline.
extern "C" UBspline_3d_c_cuda*
create_UBspline_3d_c_cuda_conv (UBspline_3d_z* spline);
// Double-precision real host spline -> double-precision real GPU spline.
extern "C" UBspline_3d_d_cuda*
create_UBspline_3d_d_cuda (UBspline_3d_d* spline);
// Double-precision complex host spline -> double-precision complex GPU spline.
extern "C" UBspline_3d_z_cuda*
create_UBspline_3d_z_cuda (UBspline_3d_z* spline);
#endif

View File

@ -0,0 +1,742 @@
#ifndef BSPLINE_CUDA_S_IMPL_H
#define BSPLINE_CUDA_S_IMPL_H
//#include <stdio.h>
#include "bspline.h"
#include "bspline_create_cuda.h"
// Evaluates N single-precision splines at a batch of positions.
//
// Launch layout: blockIdx.y selects the position (pos[3*ir .. 3*ir+2]) and
// its output array vals[ir]; blockIdx.x selects a chunk of SPLINE_BLOCK_SIZE
// splines, and each thread handles spline index
// off = blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x.
//
//   pos     - positions, three floats per position
//   drInv   - per-axis factor that converts a position to grid units
//   coefs   - spline coefficients, addressed through strides
//   vals    - one output array per position; vals[ir][off] receives the value
//   dim     - per-axis upper limit used to clamp the grid cell index
//   strides - coefficient strides per axis.  This was declared uint2 although
//             the body reads strides.z; it is uint3 like the sibling kernels.
//   N       - number of splines; threads with off >= N write nothing
//
// The block must have at least 64 threads, because the 64 tensor-product
// weights in abc[] are each written by the thread with the same index.
__global__ static void
eval_multi_UBspline_3d_s_kernel
(float *pos, float3 drInv, float *coefs, float *vals[],
 uint3 dim, uint3 strides, int N)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+thr;

  __shared__ float *myval;
  __shared__ float abc[64];
  __shared__ float3 r;

  // Thread 0 fetches the position and output pointer for the whole block.
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval = vals[ir];
  }
  __syncthreads();

  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];

  // Per axis: scale to grid units, clamp the cell index to [0, dim-1] and
  // keep the offset from floor(s) as the local coordinate t.
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = min(max(0,(int)sf), dim.x-1);
  //index.x = (int)sf;
  t.x = s - sf;

  s = r.y * drInv.y;
  sf = floor(s);
  index.y = min(max(0,(int)sf), dim.y-1);
  //index.y = (int)sf;
  t.y = s - sf;

  s = r.z * drInv.z;
  sf = floor(s);
  index.z = min(max(0,(int)sf), dim.z-1);
  //index.z = (int)sf;
  t.z = s - sf;

  // Powers (t^3, t^2, t, 1) for each axis.
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // 1D basis weights: rows 0-3 of Acuda dotted with the powers of t.
  __shared__ float a[4], b[4], c[4];
  if (thr < 4) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Tensor-product weights: thread thr < 64 fills abc[16*i + 4*j + k].
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  if (thr < 64)
    abc[thr] = a[i]*b[j]*c[k];
  __syncthreads();

  // Accumulate the 4x4x4 stencil for this thread's spline.
  if (off < N) {
    float val = 0.0;
    for (int i=0; i<4; i++) {
      for (int j=0; j<4; j++) {
        float *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
        for (int k=0; k<4; k++)
          val += abc[16*i+4*j+k] * base[off+k*strides.z];
      }
    }
    myval[off] = val;
  }
}
// Evaluates N single-precision splines at a batch of positions and multiplies
// each result by a per-position factor sign[ir].
//
// Launch layout: blockIdx.y selects the position (pos[3*ir .. 3*ir+2]), its
// factor sign[ir] and its output array vals[ir]; blockIdx.x selects a chunk
// of SPLINE_BLOCK_SIZE splines, and each thread handles spline index
// off = blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x.  Threads with off >= N
// write nothing.
//
// The block must have at least 64 threads, because the 64 tensor-product
// weights in abc[] are each written by the thread with the same index.
__global__ static void
eval_multi_UBspline_3d_s_sign_kernel
(float *pos, float *sign, float3 drInv, float *coefs, float *vals[],
uint3 dim, uint3 strides, int N)
{
int block = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+thr;
__shared__ float *myval;
__shared__ float abc[64];
__shared__ float mysign;
__shared__ float3 r;
// Thread 0 fetches the position, output pointer and factor for the block.
if (thr == 0) {
r.x = pos[3*ir+0];
r.y = pos[3*ir+1];
r.z = pos[3*ir+2];
myval = vals[ir];
mysign = sign[ir];
}
__syncthreads();
int3 index;
float3 t;
float s, sf;
float4 tp[3];
// Per axis: scale to grid units, clamp the cell index to [0, dim-1] and keep
// the offset from floor(s) as the local coordinate t.
s = r.x * drInv.x;
sf = floor(s);
index.x = min(max(0,(int)sf), dim.x-1);
//index.x = (int)sf;
t.x = s - sf;
s = r.y * drInv.y;
sf = floor(s);
index.y = min(max(0,(int)sf), dim.y-1);
//index.y = (int)sf;
t.y = s - sf;
s = r.z * drInv.z;
sf = floor(s);
index.z = min(max(0,(int)sf), dim.z-1);
//index.z = (int)sf;
t.z = s - sf;
// Powers (t^3, t^2, t, 1) for each axis.
tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
// 1D basis weights: rows 0-3 of Acuda dotted with the powers of t.
__shared__ float a[4], b[4], c[4];
if (thr < 4) {
a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
}
__syncthreads();
// Tensor-product weights: thread thr < 64 fills abc[16*i + 4*j + k].
int i = (thr>>4)&3;
int j = (thr>>2)&3;
int k = (thr & 3);
if (thr < 64)
abc[thr] = a[i]*b[j]*c[k];
__syncthreads();
// Accumulate the 4x4x4 stencil for this thread's spline, then apply the
// per-position factor when storing.
if (off < N) {
float val = 0.0;
for (int i=0; i<4; i++) {
for (int j=0; j<4; j++) {
float *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
for (int k=0; k<4; k++)
val += abc[16*i+4*j+k] * base[off+k*strides.z];
}
}
myval[off] = mysign*val;
}
}
// Evaluates value, gradient and Hessian of N single-precision splines at a
// batch of positions.
//
// Launch layout: blockIdx.y selects the position (pos[3*ir .. 3*ir+2]) and
// its output arrays vals[ir], grads[ir], hess[ir]; blockIdx.x selects a chunk
// of SPLINE_BLOCK_SIZE splines, and each thread handles spline index
// off = blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x.
//
// Outputs for spline n (n < N):
//   vals[ir][n]              value
//   grads[ir][3*n + 0..2]    d/dx, d/dy, d/dz
//   hess[ir][6*n + 0..5]     xx, xy, xz, yy, yz, zz second derivatives
// Derivatives are scaled by the matching drInv components.
//
// abc[] first holds the 10x64 tensor-product weight tables and is then reused
// as a staging buffer for the gradient and Hessian stores.  That reuse needs
// 6*SPLINE_BLOCK_SIZE <= 640.
// NOTE(review): SPLINE_BLOCK_SIZE is defined outside this chunk; confirm it
// satisfies that bound.
__global__ static void
eval_multi_UBspline_3d_s_vgh_kernel
(float *pos, float3 drInv, float *coefs,
 float *vals[], float *grads[], float *hess[],
 uint3 dim, uint3 strides, int N)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  __shared__ float *myval, *mygrad, *myhess;
  __shared__ float3 r;

  // Thread 0 fetches the position and output pointers for the whole block.
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval  = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];

  // Per axis: scale to grid units, clamp the cell index to [0, dim-1] and
  // keep the offset from floor(s) as the local coordinate t.
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = min(max(0,(int)sf), dim.x-1);
  t.x = s - sf;

  s = r.y * drInv.y;
  sf = floor(s);
  index.y = min(max(0,(int)sf), dim.y-1);
  t.y = s - sf;

  s = r.z * drInv.z;
  sf = floor(s);
  index.z = min(max(0,(int)sf), dim.z-1);
  t.z = s - sf;

  // Powers (t^3, t^2, t, 1) for each axis.
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten 64-entry weight tables.  Every thread writes the entry selected by
  // the low six bits of its index, so threads 64 and above store the same
  // values again.
  __shared__ float abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);

  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;

  int n = 0;
  float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < N) {
    // Accumulate the 4x4x4 stencil; n walks the weight tables in the same
    // i, j, k order in which they were filled.
    for (int i=0; i<4; i++) {
      for (int j=0; j<4; j++) {
        float *base = b0 + i*strides.x + j*strides.y;
        for (int k=0; k<4; k++) {
          float c = base[k*strides.z];
          v   += abc[n+0]   * c;
          g0  += abc[n+64]  * c;
          g1  += abc[n+128] * c;
          g2  += abc[n+192] * c;
          h00 += abc[n+256] * c;
          h01 += abc[n+320] * c;
          h02 += abc[n+384] * c;
          h11 += abc[n+448] * c;
          h12 += abc[n+512] * c;
          h22 += abc[n+576] * c;
          n += 1;
        }
      }
    }
    g0 *= drInv.x;
    g1 *= drInv.y;
    g2 *= drInv.z;

    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;

    //  __shared__ float buff[6*SPLINE_BLOCK_SIZE];
    // Note, we can reuse abc, by replacing buff with abc.
    myval[off] = v;
  }

  // abc is about to be overwritten as a staging buffer.  Wait until every
  // thread has finished reading the weight tables; without this barrier a
  // thread that finishes early can clobber weights another thread still needs.
  __syncthreads();

  // Stage gradients as (spline, component), then write them out so that
  // consecutive threads store to consecutive global addresses.
  abc[3*thr+0] = g0;
  abc[3*thr+1] = g1;
  abc[3*thr+2] = g2;
  __syncthreads();
  for (int i=0; i<3; i++) {
    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 3*N)
      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
  __syncthreads();

  // Write Hessians
  abc[6*thr+0] = h00;
  abc[6*thr+1] = h01;
  abc[6*thr+2] = h02;
  abc[6*thr+3] = h11;
  abc[6*thr+4] = h12;
  abc[6*thr+5] = h22;
  __syncthreads();
  for (int i=0; i<6; i++) {
    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 6*N)
      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
}
// Host launcher for eval_multi_UBspline_3d_s_kernel.
//
//   spline - GPU spline descriptor (gridInv, coefs, dim, stride, num_splines)
//   pos_d  - positions, three floats each
//   vals_d - per-position output arrays
//   num    - number of positions (grid y extent)
// pos_d and vals_d are handed to the kernel unchanged, so they must be
// device-accessible.
//
// The grid x extent is num_splines divided by SPLINE_BLOCK_SIZE, rounded up.
// Blocks until the kernel finishes; on a CUDA error it prints a message to
// stderr and aborts.
extern "C" void
eval_multi_UBspline_3d_s_cuda (UBspline_3d_s_cuda *spline,
                               float *pos_d, float *vals_d[], int num)
{
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  // One extra block for a partial trailing chunk of splines.
  if (spline->num_splines % SPLINE_BLOCK_SIZE)
    blocks.x++;

  eval_multi_UBspline_3d_s_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, spline->coefs, vals_d,
     spline->dim, spline->stride, spline->num_splines);

  cudaThreadSynchronize();
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Host launcher for eval_multi_UBspline_3d_s_sign_kernel.
//
//   spline - GPU spline descriptor (gridInv, coefs, dim, stride, num_splines)
//   pos_d  - positions, three floats each
//   sign_d - one multiplicative factor per position
//   vals_d - per-position output arrays
//   num    - number of positions (grid y extent)
// pos_d, sign_d and vals_d are handed to the kernel unchanged, so they must
// be device-accessible.
//
// The grid x extent is num_splines divided by SPLINE_BLOCK_SIZE, rounded up.
// Blocks until the kernel finishes; on a CUDA error it prints a message to
// stderr and aborts.
extern "C" void
eval_multi_UBspline_3d_s_sign_cuda (UBspline_3d_s_cuda *spline,
                                    float *pos_d, float *sign_d,
                                    float *vals_d[], int num)
{
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  // One extra block for a partial trailing chunk of splines.
  if (spline->num_splines % SPLINE_BLOCK_SIZE)
    dimGrid.x++;

  eval_multi_UBspline_3d_s_sign_kernel<<<dimGrid,dimBlock>>>
    (pos_d, sign_d, spline->gridInv, spline->coefs,
     vals_d, spline->dim, spline->stride, spline->num_splines);

  cudaThreadSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    // The message previously named eval_multi_UBspline_3d_s_cuda (copy-paste),
    // which pointed at the wrong launcher.
    fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_sign_cuda:\n %s\n",
             cudaGetErrorString(err));
    abort();
  }
}
// Host launcher for eval_multi_UBspline_3d_s_vgh_kernel (value, gradient and
// Hessian).
//
//   spline  - GPU spline descriptor (gridInv, coefs, dim, stride, num_splines)
//   pos_d   - positions, three floats each
//   vals_d  - per-position value output arrays
//   grads_d - per-position gradient output arrays
//   hess_d  - per-position Hessian output arrays
//   num     - number of positions (grid y extent)
// The pointer arguments are handed to the kernel unchanged, so they must be
// device-accessible.
//
// The grid x extent is num_splines divided by SPLINE_BLOCK_SIZE, rounded up.
// Blocks until the kernel finishes; on a CUDA error it prints a message to
// stderr and aborts.
extern "C" void
eval_multi_UBspline_3d_s_vgh_cuda (UBspline_3d_s_cuda *spline,
                                   float *pos_d, float *vals_d[], float *grads_d[],
                                   float *hess_d[], int num)
{
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  // One extra block for a partial trailing chunk of splines.
  if (spline->num_splines % SPLINE_BLOCK_SIZE)
    blocks.x++;

  eval_multi_UBspline_3d_s_vgh_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, hess_d,
     spline->dim, spline->stride, spline->num_splines);

  cudaThreadSynchronize();
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_vgh_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Kernel: value, transformed gradient and Laplacian of many 3D splines at
// many positions.
//
// Work decomposition: blockIdx.y selects the position (ir); the spline index
// is off = blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x, one spline per thread.
//
// pos        3 floats per position, read as pos[3*ir+0..2]
// drInv      per-axis factor that converts a position into grid units; also
//            used to scale the first and second derivatives
// coefs      spline coefficients, addressed as
//            ix*strides.x + iy*strides.y + iz*strides.z + off
// Linv       9 floats loaded as the 3x3 matrix G (G[i0][i1] = Linv[3*i0+i1])
// vals       vals[ir][off] receives the spline value
// grad_lapl  grad_lapl[ir][off + m*row_stride]: m = 0..2 receive G*gradient,
//            m = 3 receives the Laplacian Trace(G*G^T*Hessian)
// dim        grid dimensions used to clamp the cell index
// N          number of splines; threads with off >= N write no output
__global__ static void
eval_multi_UBspline_3d_s_vgl_kernel
(float *pos, float3 drInv, float *coefs, float Linv[],
float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride)
{
int block = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ float *myval, *mygrad_lapl;
__shared__ float3 r;
// Thread 0 fetches the position and the per-position output pointers once
// for the whole block.
if (thr == 0) {
r.x = pos[3*ir+0];
r.y = pos[3*ir+1];
r.z = pos[3*ir+2];
myval = vals[ir];
mygrad_lapl = grad_lapl[ir];
}
__syncthreads();
int3 index;
float3 t;
float s, sf;
float4 tp[3];
// Per axis: s is the position in grid units, index the cell number clamped
// to [0, dim-1], and t the offset from floor(s).
s = r.x * drInv.x;
sf = floor(s);
index.x = min(max(0,(int)sf), dim.x-1);
t.x = s - sf;
s = r.y * drInv.y;
sf = floor(s);
index.y = min(max(0,(int)sf), dim.y-1);
t.y = s - sf;
s = r.z * drInv.z;
sf = floor(s);
index.z = min(max(0,(int)sf), dim.z-1);
t.z = s - sf;
// Powers (t^3, t^2, t, 1) for each axis.
tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
// First 4 of a are value, second 4 are derivative, last four are
// second derivative.
// Acuda is defined outside this kernel; it is read here as 12 rows of 4
// coefficients.
__shared__ float a[12], b[12], c[12];
if (thr < 12) {
a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
}
__syncthreads();
// abc holds 10 tables of 64 products a*b*c (value, 3 first derivatives,
// 6 second derivatives). Each thread fills entry 16*i+4*j+k of every table,
// with (i,j,k) taken from the low six bits of thr.
// NOTE(review): all 64 entries are only written if the block has at least
// 64 threads -- presumably SPLINE_BLOCK_SIZE >= 64; confirm.
__shared__ float abc[640];
int i = (thr>>4)&3;
int j = (thr>>2)&3;
int k = (thr & 3);
abc[(16*i+4*j+k)+0] = a[i+0]*b[j+0]*c[k+0]; // val
abc[(16*i+4*j+k)+64] = a[i+4]*b[j+0]*c[k+0]; // d/dx
abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
__syncthreads();
float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
int n = 0;
// b0 is only dereferenced inside the off < N guard below.
float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
if (off < N) {
// Accumulate over the 4x4x4 block of coefficients; n walks the abc tables
// in the same (i,j,k) order they were filled. The loop variables and the
// local c below shadow the outer i, j, k and the shared array c.
for (int i=0; i<4; i++) {
for (int j=0; j<4; j++) {
float *base = b0 + i*strides.x + j*strides.y;
for (int k=0; k<4; k++) {
float c = base[k*strides.z];
v += abc[n+ 0] * c;
g0 += abc[n+ 64] * c;
g1 += abc[n+128] * c;
g2 += abc[n+192] * c;
h00 += abc[n+256] * c;
h01 += abc[n+320] * c;
h02 += abc[n+384] * c;
h11 += abc[n+448] * c;
h12 += abc[n+512] * c;
h22 += abc[n+576] * c;
n += 1;
}
}
}
// Convert derivatives from grid units using the per-axis drInv factors.
g0 *= drInv.x;
g1 *= drInv.y;
g2 *= drInv.z;
h00 *= drInv.x * drInv.x;
h01 *= drInv.x * drInv.y;
h02 *= drInv.x * drInv.z;
h11 *= drInv.y * drInv.y;
h12 *= drInv.y * drInv.z;
h22 *= drInv.z * drInv.z;
// __shared__ float buff[6*SPLINE_BLOCK_SIZE];
// Note, we can reuse abc, by replacing buff with abc.
myval[off] = v;
}
// Load G from Linv and form GGt = G * G^T. This is done outside the
// off < N guard so that every thread reaches the __syncthreads() calls.
__shared__ float G[3][3], GGt[3][3];
int i0 = threadIdx.x/3;
int i1 = threadIdx.x - 3*i0;
if (threadIdx.x < 9)
G[i0][i1] = Linv[threadIdx.x];
__syncthreads();
if (threadIdx.x < 9)
GGt[i0][i1] = (G[i0][0]*G[i1][0] +
G[i0][1]*G[i1][1] +
G[i0][2]*G[i1][2]);
__syncthreads();
if (off < N) {
// Store gradients back to global memory
mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
mygrad_lapl[off+1*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
mygrad_lapl[off+2*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
// Store laplacians back to global memory
// Hessian = H00 H01 H02 H11 H12 H22
// Matrix = [0 1 2]
// [1 3 4]
// [2 4 5]
// laplacian = Trace(GGt*Hessian)
mygrad_lapl[off+3*row_stride] =
(GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
}
}
// Host entry point for eval_multi_UBspline_3d_s_vgl_kernel.
//
// Launches one thread per spline (grid x, rounded up to a whole number of
// SPLINE_BLOCK_SIZE blocks) and one grid row per position (grid y = num),
// waits for completion, and aborts the process if CUDA reports an error.
extern "C" void
eval_multi_UBspline_3d_s_vgl_cuda
(UBspline_3d_s_cuda *spline, float *pos_d, float *Linv_d,
 float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
{
  dim3 threadsPerBlock(SPLINE_BLOCK_SIZE);
  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  // Add one partially filled block when num_splines is not a multiple of
  // the block size.
  blocks.x += (spline->num_splines % SPLINE_BLOCK_SIZE) ? 1 : 0;

  eval_multi_UBspline_3d_s_vgl_kernel<<<blocks,threadsPerBlock>>>
    (pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);

  // Block until the kernel has finished so that any error is picked up by
  // the check below.
  cudaThreadSynchronize();

  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_vgl_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Kernel: same computation as eval_multi_UBspline_3d_s_vgl_kernel, except
// that every output for position ir is multiplied by sign[ir].
//
// Work decomposition: blockIdx.y selects the position (ir); the spline index
// is off = blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x, one spline per thread.
//
// pos        3 floats per position, read as pos[3*ir+0..2]
// sign       one multiplier per position, applied to value, gradient and
//            Laplacian
// drInv      per-axis factor that converts a position into grid units; also
//            used to scale the first and second derivatives
// coefs      spline coefficients, addressed as
//            ix*strides.x + iy*strides.y + iz*strides.z + off
// Linv       9 floats loaded as the 3x3 matrix G (G[i0][i1] = Linv[3*i0+i1])
// vals       vals[ir][off] receives sign * value
// grad_lapl  grad_lapl[ir][off + m*row_stride]: m = 0..2 receive
//            sign * G*gradient, m = 3 receives sign * Trace(G*G^T*Hessian)
// dim        grid dimensions used to clamp the cell index
// N          number of splines; threads with off >= N write no output
__global__ static void
eval_multi_UBspline_3d_s_vgl_sign_kernel
(float *pos, float sign[], float3 drInv, float *coefs, float Linv[],
float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride)
{
int block = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ float *myval, *mygrad_lapl, mysign;
__shared__ float3 r;
// Thread 0 fetches the position, the output pointers and the sign once for
// the whole block.
if (thr == 0) {
r.x = pos[3*ir+0];
r.y = pos[3*ir+1];
r.z = pos[3*ir+2];
myval = vals[ir];
mygrad_lapl = grad_lapl[ir];
mysign = sign[ir];
}
__syncthreads();
int3 index;
float3 t;
float s, sf;
float4 tp[3];
// Per axis: s is the position in grid units, index the cell number clamped
// to [0, dim-1], and t the offset from floor(s).
s = r.x * drInv.x;
sf = floor(s);
index.x = min(max(0,(int)sf), dim.x-1);
t.x = s - sf;
s = r.y * drInv.y;
sf = floor(s);
index.y = min(max(0,(int)sf), dim.y-1);
t.y = s - sf;
s = r.z * drInv.z;
sf = floor(s);
index.z = min(max(0,(int)sf), dim.z-1);
t.z = s - sf;
// Powers (t^3, t^2, t, 1) for each axis.
tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
// First 4 of a are value, second 4 are derivative, last four are
// second derivative.
// Acuda is defined outside this kernel; it is read here as 12 rows of 4
// coefficients.
__shared__ float a[12], b[12], c[12];
if (thr < 12) {
a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
}
__syncthreads();
// abc holds 10 tables of 64 products a*b*c (value, 3 first derivatives,
// 6 second derivatives). Each thread fills entry 16*i+4*j+k of every table,
// with (i,j,k) taken from the low six bits of thr.
// NOTE(review): all 64 entries are only written if the block has at least
// 64 threads -- presumably SPLINE_BLOCK_SIZE >= 64; confirm.
__shared__ float abc[640];
int i = (thr>>4)&3;
int j = (thr>>2)&3;
int k = (thr & 3);
abc[(16*i+4*j+k)+0] = a[i+0]*b[j+0]*c[k+0]; // val
abc[(16*i+4*j+k)+64] = a[i+4]*b[j+0]*c[k+0]; // d/dx
abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
__syncthreads();
float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
int n = 0;
// b0 is only dereferenced inside the off < N guard below.
float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
if (off < N) {
// Accumulate over the 4x4x4 block of coefficients; n walks the abc tables
// in the same (i,j,k) order they were filled. The loop variables and the
// local c below shadow the outer i, j, k and the shared array c.
for (int i=0; i<4; i++) {
for (int j=0; j<4; j++) {
float *base = b0 + i*strides.x + j*strides.y;
for (int k=0; k<4; k++) {
float c = base[k*strides.z];
v += abc[n+ 0] * c;
g0 += abc[n+ 64] * c;
g1 += abc[n+128] * c;
g2 += abc[n+192] * c;
h00 += abc[n+256] * c;
h01 += abc[n+320] * c;
h02 += abc[n+384] * c;
h11 += abc[n+448] * c;
h12 += abc[n+512] * c;
h22 += abc[n+576] * c;
n += 1;
}
}
}
// Convert derivatives from grid units using the per-axis drInv factors.
g0 *= drInv.x;
g1 *= drInv.y;
g2 *= drInv.z;
h00 *= drInv.x * drInv.x;
h01 *= drInv.x * drInv.y;
h02 *= drInv.x * drInv.z;
h11 *= drInv.y * drInv.y;
h12 *= drInv.y * drInv.z;
h22 *= drInv.z * drInv.z;
// __shared__ float buff[6*SPLINE_BLOCK_SIZE];
// Note, we can reuse abc, by replacing buff with abc.
myval[off] = mysign * v;
}
// Load G from Linv and form GGt = G * G^T. This is done outside the
// off < N guard so that every thread reaches the __syncthreads() calls.
__shared__ float G[3][3], GGt[3][3];
int i0 = threadIdx.x/3;
int i1 = threadIdx.x - 3*i0;
if (threadIdx.x < 9)
G[i0][i1] = Linv[threadIdx.x];
__syncthreads();
if (threadIdx.x < 9)
GGt[i0][i1] = (G[i0][0]*G[i1][0] +
G[i0][1]*G[i1][1] +
G[i0][2]*G[i1][2]);
__syncthreads();
if (off < N) {
// Store gradients back to global memory
mygrad_lapl[off+0*row_stride] = mysign*(G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2);
mygrad_lapl[off+1*row_stride] = mysign*(G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2);
mygrad_lapl[off+2*row_stride] = mysign*(G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2);
// Store laplacians back to global memory
// Hessian = H00 H01 H02 H11 H12 H22
// Matrix = [0 1 2]
// [1 3 4]
// [2 4 5]
// laplacian = Trace(GGt*Hessian)
mygrad_lapl[off+3*row_stride] = mysign *
(GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
}
}
// Host entry point for eval_multi_UBspline_3d_s_vgl_sign_kernel.
//
// spline       device-side spline description (grid inverse, coefficients,
//              dimensions, strides, number of splines)
// pos_d        positions, 3 floats each
// sign_d       one multiplier per position
// Linv_d       9 floats forming the 3x3 matrix applied to the gradient
// vals_d       per-position output pointers for the values
// grad_lapl_d  per-position output pointers for gradient rows 0..2 and the
//              Laplacian in row 3, rows separated by row_stride
// num          number of positions (grid y dimension)
//
// The call blocks until the kernel has finished and aborts the process if
// CUDA reports an error.
extern "C" void
eval_multi_UBspline_3d_s_vgl_sign_cuda
(UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, float *Linv_d,
 float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
{
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  // Round the x dimension up so that a trailing partial block of splines is
  // covered as well.
  if (spline->num_splines % SPLINE_BLOCK_SIZE)
    dimGrid.x++;

  eval_multi_UBspline_3d_s_vgl_sign_kernel<<<dimGrid,dimBlock>>>
    (pos_d, sign_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);

  // NOTE(review): cudaThreadSynchronize() is deprecated in newer CUDA
  // toolkits in favour of cudaDeviceSynchronize(); kept here to match the
  // other wrappers in this file.
  cudaThreadSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    // Report this function's own name; the message previously named
    // eval_multi_UBspline_3d_s_vgl_cuda, which pointed at the wrong wrapper.
    fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_vgl_sign_cuda:\n %s\n",
             cudaGetErrorString(err));
    abort();
  }
}
#endif

View File

@ -0,0 +1,207 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "config.h"
/*****************/
/* SSE Data */
/*****************/
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#define _XOPEN_SOURCE 600
#ifndef __USE_XOPEN2K
#define __USE_XOPEN2K
#endif
#include <stdlib.h>
#ifdef HAVE_SSE
#include <xmmintrin.h>
// Single-precision version of matrices
// A_s starts out null; init_sse_data() allocates 12 __m128 entries with
// posix_memalign and fills them in.
__m128 *restrict A_s = (__m128 *)0;
// There is a problem with alignment of global variables in shared
// libraries on 32-bit machines.
// __m128 A0, A1, A2, A3, dA0, dA1, dA2, dA3, d2A0, d2A1, d2A2, d2A3;
#endif
#ifdef HAVE_SSE2
// Double-precision version of matrices
// A_d starts out null; init_sse_data() allocates 32 __m128d entries with
// posix_memalign and fills them in.
#include <emmintrin.h>
__m128d *restrict A_d = (__m128d *)0;
// There is a problem with alignment of global variables in shared
// libraries on 32-bit machines.
//__m128d A0_01, A0_23, A1_01, A1_23, A2_01, A2_23, A3_01, A3_23,
// dA0_01, dA0_23, dA1_01, dA1_23, dA2_01, dA2_23, dA3_01, dA3_23,
// d2A0_01, d2A0_23, d2A1_01, d2A1_23, d2A2_01, d2A2_23, d2A3_01, d2A3_23;
#endif
// Allocates and fills the SSE coefficient tables on first use; later calls
// are no-ops because each table is only built while its pointer is null.
//
// A_s (HAVE_SSE): 12 vectors. A_s[0..3], A_s[4..7] and A_s[8..11] hold the
// rows of A44f, dA44f and d2A44f with the four elements in reversed order.
//
// A_d (HAVE_SSE2): 32 vectors, two per row. For row r, A_d[2r] holds
// elements 2,3 and A_d[2r+1] holds elements 0,1 of the same reversed row.
// Rows 0..3 come from A44d, 4..7 from dA44d, 8..11 from d2A44d and 12..15
// from d3A44d.
//
// The process is aborted if the aligned allocation fails.
void init_sse_data()
{
#ifdef HAVE_SSE
  if (A_s == 0) {
    // 16-byte alignment is required for __m128 loads and stores.
    if (posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12)) != 0)
      abort();
    A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0, 3.0/6.0, -1.0/6.0 );
    A_s[1]  = _mm_setr_ps ( 4.0/6.0, 0.0/6.0, -6.0/6.0, 3.0/6.0 );
    A_s[2]  = _mm_setr_ps ( 1.0/6.0, 3.0/6.0, 3.0/6.0, -3.0/6.0 );
    A_s[3]  = _mm_setr_ps ( 0.0/6.0, 0.0/6.0, 0.0/6.0, 1.0/6.0 );
    A_s[4]  = _mm_setr_ps ( -0.5, 1.0, -0.5, 0.0 );
    A_s[5]  = _mm_setr_ps ( 0.0, -2.0, 1.5, 0.0 );
    A_s[6]  = _mm_setr_ps ( 0.5, 1.0, -1.5, 0.0 );
    A_s[7]  = _mm_setr_ps ( 0.0, 0.0, 0.5, 0.0 );
    A_s[8]  = _mm_setr_ps ( 1.0, -1.0, 0.0, 0.0 );
    A_s[9]  = _mm_setr_ps ( -2.0, 3.0, 0.0, 0.0 );
    A_s[10] = _mm_setr_ps ( 1.0, -3.0, 0.0, 0.0 );
    A_s[11] = _mm_setr_ps ( 0.0, 1.0, 0.0, 0.0 );
  }
#endif
#ifdef HAVE_SSE2
  if (A_d == 0) {
    if (posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*32)) != 0)
      abort();
    // Basis functions
    A_d[ 0] = _mm_setr_pd ( 3.0/6.0, -1.0/6.0 );
    A_d[ 1] = _mm_setr_pd ( 1.0/6.0, -3.0/6.0 );
    A_d[ 2] = _mm_setr_pd ( -6.0/6.0, 3.0/6.0 );
    A_d[ 3] = _mm_setr_pd ( 4.0/6.0, 0.0/6.0 );
    A_d[ 4] = _mm_setr_pd ( 3.0/6.0, -3.0/6.0 );
    A_d[ 5] = _mm_setr_pd ( 1.0/6.0, 3.0/6.0 );
    A_d[ 6] = _mm_setr_pd ( 0.0/6.0, 1.0/6.0 );
    A_d[ 7] = _mm_setr_pd ( 0.0/6.0, 0.0/6.0 );
    // First derivative
    A_d[ 8] = _mm_setr_pd ( -0.5, 0.0 );
    A_d[ 9] = _mm_setr_pd ( -0.5, 1.0 );
    A_d[10] = _mm_setr_pd ( 1.5, 0.0 );
    A_d[11] = _mm_setr_pd ( 0.0, -2.0 );
    A_d[12] = _mm_setr_pd ( -1.5, 0.0 );
    A_d[13] = _mm_setr_pd ( 0.5, 1.0 );
    A_d[14] = _mm_setr_pd ( 0.5, 0.0 );
    A_d[15] = _mm_setr_pd ( 0.0, 0.0 );
    // Second derivative
    A_d[16] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[17] = _mm_setr_pd ( 1.0, -1.0 );
    A_d[18] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[19] = _mm_setr_pd ( -2.0, 3.0 );
    A_d[20] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[21] = _mm_setr_pd ( 1.0, -3.0 );
    A_d[22] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[23] = _mm_setr_pd ( 0.0, 1.0 );
    // Third derivative. A_d[24] was previously never assigned and therefore
    // held uninitialised memory; like the other even entries of this group
    // it is the zero pair.
    A_d[24] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[25] = _mm_setr_pd ( -1.0, 0.0 );
    A_d[26] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[27] = _mm_setr_pd ( 3.0, 0.0 );
    A_d[28] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[29] = _mm_setr_pd ( -3.0, 0.0 );
    A_d[30] = _mm_setr_pd ( 0.0, 0.0 );
    A_d[31] = _mm_setr_pd ( 1.0, 0.0 );
  }
#endif
}
#ifdef USE_ALTIVEC
// AltiVec copies of the coefficient matrices, one vector per row.
// A0..A3 carry the same values, in the same order, as the rows of A44f;
// dA0..dA3 match dA44f and d2A0..d2A3 match d2A44f.
vector float A0 = (vector float) ( -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0);
vector float A1 = (vector float) ( 3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0);
vector float A2 = (vector float) ( -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0);
vector float A3 = (vector float) ( 1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0);
// Alternative layouts kept for reference (disabled): the first set is the
// transpose of the active matrix, the second has each row reversed.
/* vector float A0 = (vector float) ( -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0); */
/* vector float A1 = (vector float) ( 3.0/6.0, -6.0/6.0, 3.0/6.0, 0.0/6.0); */
/* vector float A2 = (vector float) ( -3.0/6.0, 0.0/6.0, 3.0/6.0, 0.0/6.0); */
/* vector float A3 = (vector float) ( 1.0/6.0, 4.0/6.0, 1.0/6.0, 0.0/6.0); */
/* vector float A0 = (vector float) ( 1.0/6.0, -3.0/6.0, 3.0/6.0, -1.0/6.0); */
/* vector float A1 = (vector float) ( 4.0/6.0, 0.0/6.0, -6.0/6.0, 3.0/6.0); */
/* vector float A2 = (vector float) ( 1.0/6.0, 3.0/6.0, 3.0/6.0, -3.0/6.0); */
/* vector float A3 = (vector float) ( 0.0/6.0, 0.0/6.0, 0.0/6.0, 1.0/6.0); */
// First-derivative rows.
vector float dA0 = (vector float) ( 0.0, -0.5, 1.0, -0.5 );
vector float dA1 = (vector float) ( 0.0, 1.5, -2.0, 0.0 );
vector float dA2 = (vector float) ( 0.0, -1.5, 1.0, 0.5 );
vector float dA3 = (vector float) ( 0.0, 0.5, 0.0, 0.0 );
// Second-derivative rows.
vector float d2A0 = (vector float) ( 0.0, 0.0, -1.0, 1.0 );
vector float d2A1 = (vector float) ( 0.0, 0.0, 3.0, -2.0 );
vector float d2A2 = (vector float) ( 0.0, 0.0, -3.0, 1.0 );
vector float d2A3 = (vector float) ( 0.0, 0.0, 1.0, 0.0 );
#endif
/*****************/
/* Standard Data */
/*****************/
//////////////////////
// Single precision //
//////////////////////
// 4x4 coefficient tables stored row-major, with a pointer alias for each
// (Af, dAf, d2Af, d3Af). Each row of dA44f, d2A44f and d3A44f is the
// derivative of the matching row of the table before it when the four
// columns are read as the coefficients of t^3, t^2, t and 1.
// NOTE(review): presumably row r is the weight polynomial applied to
// coefficient i+r of the spline -- confirm against the evaluation routines.
const float A44f[16] =
{ -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
-3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0 };
const float* restrict Af = A44f;
// First derivative with respect to t.
const float dA44f[16] =
{ 0.0, -0.5, 1.0, -0.5,
0.0, 1.5, -2.0, 0.0,
0.0, -1.5, 1.0, 0.5,
0.0, 0.5, 0.0, 0.0 };
const float* restrict dAf = dA44f;
// Second derivative with respect to t.
const float d2A44f[16] =
{ 0.0, 0.0, -1.0, 1.0,
0.0, 0.0, 3.0, -2.0,
0.0, 0.0, -3.0, 1.0,
0.0, 0.0, 1.0, 0.0 };
const float* restrict d2Af = d2A44f;
// Third derivative with respect to t (constant in t).
const float d3A44f[16] =
{ 0.0, 0.0, 0.0, -1.0,
0.0, 0.0, 0.0, 3.0,
0.0, 0.0, 0.0, -3.0,
0.0, 0.0, 0.0, 1.0};
const float* restrict d3Af = d3A44f;
//////////////////////
// Double precision //
//////////////////////
// Double-precision 4x4 coefficient tables stored row-major, with a pointer
// alias for each (Ad, dAd, d2Ad, d3Ad). The entries are written with the
// same literal expressions as the single-precision tables above.
const double A44d[16] =
{ -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
-3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0 };
const double* restrict Ad = A44d;
// First derivative with respect to t.
const double dA44d[16] =
{ 0.0, -0.5, 1.0, -0.5,
0.0, 1.5, -2.0, 0.0,
0.0, -1.5, 1.0, 0.5,
0.0, 0.5, 0.0, 0.0 };
const double* restrict dAd = dA44d;
// Second derivative with respect to t.
const double d2A44d[16] =
{ 0.0, 0.0, -1.0, 1.0,
0.0, 0.0, 3.0, -2.0,
0.0, 0.0, -3.0, 1.0,
0.0, 0.0, 1.0, 0.0 };
const double* restrict d2Ad = d2A44d;
// Third derivative with respect to t (constant in t).
const double d3A44d[16] =
{ 0.0, 0.0, 0.0, -1.0,
0.0, 0.0, 0.0, 3.0,
0.0, 0.0, 0.0, -3.0,
0.0, 0.0, 0.0, 1.0};
const double* restrict d3Ad = d3A44d;

View File

@ -0,0 +1,498 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_EVAL_SSE_S_H
#define BSPLINE_EVAL_SSE_S_H
#include <stdio.h>
#include <math.h>
#include <ppc_intrinsics.h>
extern vector float A0, A1, A2, A3;
extern vector float dA0, dA1, dA2, dA3;
extern vector float d2A0, d2A1, d2A2, d2A3;
extern const float* restrict Af;
extern const float* restrict dAf;
extern const float* restrict d2Af;
inline vector float
MakeVec (double a, double b, double c, double d)
{
union
{
float scalars[vec_step(vector float)];
vector float v;
} buffer;
buffer.scalars[0] = a;
buffer.scalars[1] = b;
buffer.scalars[2] = c;
buffer.scalars[3] = d;
return buffer.v;
}
// Unpacks the four lanes of an unsigned-int vector into *i0..*i3, in lane
// order. A union is used to move the lanes into scalar storage.
// NOTE(review): unlike its neighbours this function is not declared inline
// although it is defined in a header; including the header from more than
// one translation unit would define it more than once -- confirm intent.
void
GetVec (vector unsigned int i, int *i0, int *i1, int *i2, int *i3)
{
  union
  {
    vector unsigned int v;
    unsigned int scalars[vec_step(vector float)];
  } lanes;

  lanes.v = i;

  *i0 = lanes.scalars[0];
  *i1 = lanes.scalars[1];
  *i2 = lanes.scalars[2];
  *i3 = lanes.scalars[3];
}
// Byte-selection masks for vec_perm, used by _TRANSPOSE4 below. Indices
// 0-15 select bytes of the first operand and 16-31 bytes of the second.
// perm0: 32-bit words 0 and 2 of each operand, interleaved (a0 b0 a2 b2).
vector unsigned char perm0 = (vector unsigned char)
( 0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27 );
// perm1: 32-bit words 1 and 3 of each operand, interleaved (a1 b1 a3 b3).
vector unsigned char perm1 = (vector unsigned char)
(4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31 );
// perm2: the first two words of each operand (a0 a1 b0 b1).
vector unsigned char perm2 = (vector unsigned char)
( 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 );
// perm3: the last two words of each operand (a2 a3 b2 b3).
vector unsigned char perm3 = (vector unsigned char)
( 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 );
// All-zero addend passed to vec_madd where a plain multiply is wanted.
vector float zero = (vector float) (0.0, 0.0, 0.0, 0.0);
// Loads four consecutive floats starting at target into a vector without
// requiring target to be 16-byte aligned: two vector loads at byte offsets
// 0 and 15 are merged with the permute mask that vec_lvsl derives from the
// address.
inline
vector float LoadUnaligned(float *target )
{
  const vector unsigned char shift = vec_lvsl(0, target); // permute mask
  const vector float head = vec_ld(0, target);            // most significant quadword
  const vector float tail = vec_ld(15, target);           // least significant quadword
  return vec_perm(head, tail, shift);                     // align the data
}
/// Helper macros built from AltiVec vec_perm / vec_madd / vec_add. Two of
/// them keep the _MM_ names of SSE-style helpers.
/// NOTE(review): every macro below ends in "while (0);" -- the trailing
/// semicolon means an invocation followed by ';' expands to two statements,
/// which would break an unbraced if/else around it.
// In-place transpose of the 4x4 matrix whose rows are _v0.._v3.
#define _TRANSPOSE4(_v0, _v1, _v2, _v3) \
do { \
vector float _t0 = vec_perm (_v0, _v1, perm0); \
vector float _t1 = vec_perm (_v0, _v1, perm1); \
vector float _t2 = vec_perm (_v2, _v3, perm0); \
vector float _t3 = vec_perm (_v2, _v3, perm1); \
_v0 = vec_perm (_t0, _t2, perm2); \
_v1 = vec_perm (_t1, _t3, perm2); \
_v2 = vec_perm (_t0, _t2, perm3); \
_v3 = vec_perm (_t1, _t3, perm3); \
} while (0);
// Matrix-vector product: lane i of r receives the dot product of row Mi
// with v. The temporaries r0..r3 carry no underscore prefix, so arguments
// named r0..r3 would collide with them.
#define _MM_MATVEC4_PS(M0, M1, M2, M3, v, r) \
do { \
vector float r0 = vec_madd (M0, v, zero); \
vector float r1 = vec_madd (M1, v, zero); \
vector float r2 = vec_madd (M2, v, zero); \
vector float r3 = vec_madd (M3, v, zero); \
_TRANSPOSE4 (r0, r1, r2, r3); \
r = vec_add (vec_add(r0, r1), vec_add(r2, r3)); \
} while (0);
// Dot product of A and B, stored as a single float through pointer p.
#define _MM_DOT4_PS(A, B, p) \
do { \
vector float _t = vec_madd (A, B, zero); \
vector float _alo = vec_mergel (_t, _t); \
vector float _ahi = vec_mergeh (_t, _t); \
vector float _a = vec_add (_alo, _ahi); \
vector float _rlo = vec_mergel (_a, _a); \
vector float _rhi = vec_mergeh (_a, _a); \
vector float _r = vec_add (_rlo, _rhi); \
vector float _r2 = vec_splat (_r, 0); \
vec_ste (_r2, 0, (p)); \
} while(0);
// Four dot products at once: lane i of result receives _ui . _vi.
#define _4DOTS(_u0, _v0, _u1, _v1, _u2, _v2, _u3, _v3, result) \
do { \
vector float _w0 = vec_madd (_u0, _v0, zero); \
vector float _w1 = vec_madd (_u1, _v1, zero); \
vector float _w2 = vec_madd (_u2, _v2, zero); \
vector float _w3 = vec_madd (_u3, _v3, zero); \
_TRANSPOSE4 (_w0, _w1, _w2, _w3); \
result = vec_add (vec_add(_w0, _w1), vec_add(_w2, _w3)); \
} while(0);
/************************************************************/
/* 1D single-precision, real evaluation functions */
/************************************************************/
/* Value only */
// Evaluates the 1D spline at x and stores the result in *val.
// The position is converted to grid units; the integer part selects the
// first of four consecutive coefficients and the fractional part is the
// local coordinate fed to the cubic weights from Af. No range check is
// made on x.
inline void
eval_UBspline_1d_s (UBspline_1d_s * restrict spline,
		    double x, float* restrict val)
{
  const double shifted = x - spline->x_grid.start;
  const float scaled = shifted * spline->x_grid.delta_inv;

  float cell;
  const float frac = modff (scaled, &cell);
  const float* restrict c = spline->coefs + (int) cell;

  // Powers of the local coordinate: t^3, t^2, t, 1.
  const float p2 = frac*frac;
  const float p3 = p2*frac;
  const float p1 = frac;
  const float p0 = 1.0;

  // Weight of each of the four contributing coefficients.
  const float w0 = Af[ 0]*p3 + Af[ 1]*p2 + Af[ 2]*p1 + Af[ 3]*p0;
  const float w1 = Af[ 4]*p3 + Af[ 5]*p2 + Af[ 6]*p1 + Af[ 7]*p0;
  const float w2 = Af[ 8]*p3 + Af[ 9]*p2 + Af[10]*p1 + Af[11]*p0;
  const float w3 = Af[12]*p3 + Af[13]*p2 + Af[14]*p1 + Af[15]*p0;

  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
}
/* Value and first derivative */
// Evaluates the 1D spline and its first derivative at x.
// *val receives the value; *grad receives the derivative with respect to x,
// i.e. the derivative in grid units multiplied by x_grid.delta_inv.
// No range check is made on x.
inline void
eval_UBspline_1d_s_vg (UBspline_1d_s * restrict spline, double x,
		       float* restrict val, float* restrict grad)
{
  const double shifted = x - spline->x_grid.start;
  const float scaled = shifted * spline->x_grid.delta_inv;

  float cell;
  const float frac = modff (scaled, &cell);
  const float* restrict c = spline->coefs + (int) cell;

  // Powers of the local coordinate: t^3, t^2, t, 1.
  const float p2 = frac*frac;
  const float p3 = p2*frac;
  const float p1 = frac;
  const float p0 = 1.0;

  // Value weights.
  const float w0 = Af[ 0]*p3 + Af[ 1]*p2 + Af[ 2]*p1 + Af[ 3]*p0;
  const float w1 = Af[ 4]*p3 + Af[ 5]*p2 + Af[ 6]*p1 + Af[ 7]*p0;
  const float w2 = Af[ 8]*p3 + Af[ 9]*p2 + Af[10]*p1 + Af[11]*p0;
  const float w3 = Af[12]*p3 + Af[13]*p2 + Af[14]*p1 + Af[15]*p0;

  // First-derivative weights; the t^3 column of dAf is not used.
  const float d0 = dAf[ 1]*p2 + dAf[ 2]*p1 + dAf[ 3]*p0;
  const float d1 = dAf[ 5]*p2 + dAf[ 6]*p1 + dAf[ 7]*p0;
  const float d2 = dAf[ 9]*p2 + dAf[10]*p1 + dAf[11]*p0;
  const float d3 = dAf[13]*p2 + dAf[14]*p1 + dAf[15]*p0;

  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
  *grad = spline->x_grid.delta_inv *
    (c[0]*d0 + c[1]*d1 + c[2]*d2 + c[3]*d3);
}
/* Value, first derivative, and second derivative */
// Evaluates the 1D spline and its first and second derivatives at x.
// *val receives the value, *grad the first derivative (grid-unit derivative
// times x_grid.delta_inv) and *lapl the second derivative (grid-unit second
// derivative times x_grid.delta_inv squared). No range check is made on x.
inline void
eval_UBspline_1d_s_vgl (UBspline_1d_s * restrict spline, double x,
			float* restrict val, float* restrict grad,
			float* restrict lapl)
{
  const double shifted = x - spline->x_grid.start;
  const float scaled = shifted * spline->x_grid.delta_inv;

  float cell;
  const float frac = modff (scaled, &cell);
  const float* restrict c = spline->coefs + (int) cell;

  // Powers of the local coordinate: t^3, t^2, t, 1.
  const float p2 = frac*frac;
  const float p3 = p2*frac;
  const float p1 = frac;
  const float p0 = 1.0;

  // Value weights.
  const float w0 = Af[ 0]*p3 + Af[ 1]*p2 + Af[ 2]*p1 + Af[ 3]*p0;
  const float w1 = Af[ 4]*p3 + Af[ 5]*p2 + Af[ 6]*p1 + Af[ 7]*p0;
  const float w2 = Af[ 8]*p3 + Af[ 9]*p2 + Af[10]*p1 + Af[11]*p0;
  const float w3 = Af[12]*p3 + Af[13]*p2 + Af[14]*p1 + Af[15]*p0;

  // First-derivative weights; the t^3 column of dAf is not used.
  const float d0 = dAf[ 1]*p2 + dAf[ 2]*p1 + dAf[ 3]*p0;
  const float d1 = dAf[ 5]*p2 + dAf[ 6]*p1 + dAf[ 7]*p0;
  const float d2 = dAf[ 9]*p2 + dAf[10]*p1 + dAf[11]*p0;
  const float d3 = dAf[13]*p2 + dAf[14]*p1 + dAf[15]*p0;

  // Second-derivative weights; only the t and constant columns are used.
  const float s0 = d2Af[ 2]*p1 + d2Af[ 3]*p0;
  const float s1 = d2Af[ 6]*p1 + d2Af[ 7]*p0;
  const float s2 = d2Af[10]*p1 + d2Af[11]*p0;
  const float s3 = d2Af[14]*p1 + d2Af[15]*p0;

  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
  *grad = spline->x_grid.delta_inv *
    (c[0]*d0 + c[1]*d1 + c[2]*d2 + c[3]*d3);
  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (c[0]*s0 + c[1]*s1 + c[2]*s2 + c[3]*s3);
}
/************************************************************/
/* 2D single-precision, real evaluation functions */
/************************************************************/
/* Value only */
// NOTE: not implemented in this header -- the body is empty, so *val is
// left unmodified.
inline void
eval_UBspline_2d_s (UBspline_2d_s * restrict spline,
double x, double y, float* restrict val)
{
}
/* Value and gradient */
// NOTE: not implemented in this header -- the body is empty, so *val and
// grad are left unmodified.
inline void
eval_UBspline_2d_s_vg (UBspline_2d_s * restrict spline,
double x, double y,
float* restrict val, float* restrict grad)
{
}
/* Value, gradient, and laplacian */
// NOTE: not implemented in this header -- the body is empty, so *val, grad
// and *lapl are left unmodified.
inline void
eval_UBspline_2d_s_vgl (UBspline_2d_s * restrict spline,
double x, double y, float* restrict val,
float* restrict grad, float* restrict lapl)
{
}
/* Value, gradient, and Hessian */
// NOTE: not implemented in this header -- the body is empty, so *val, grad
// and hess are left unmodified.
inline void
eval_UBspline_2d_s_vgh (UBspline_2d_s * restrict spline,
double x, double y, float* restrict val,
float* restrict grad, float* restrict hess)
{
}
/************************************************************/
/* 3D single-precision, real evaluation functions */
/************************************************************/
/* Value only */
// NOTE: not implemented in this header -- the body is empty, so *val is
// left unmodified.
inline void
eval_UBspline_3d_s (UBspline_3d_s * restrict spline,
double x, double y, double z,
float* restrict val)
{
}
/* Value and gradient */
// NOTE: not implemented in this header -- the body is empty, so *val and
// grad are left unmodified.
inline void
eval_UBspline_3d_s_vg (UBspline_3d_s * restrict spline,
double x, double y, double z,
float* restrict val, float* restrict grad)
{
}
/* Value, gradient, and laplacian */
// NOTE: not implemented in this header -- the body is empty, so *val, grad
// and *lapl are left unmodified.
inline void
eval_UBspline_3d_s_vgl (UBspline_3d_s * restrict spline,
double x, double y, double z,
float* restrict val, float* restrict grad, float* restrict lapl)
{
}
/* Value, gradient, and Hessian */
// AltiVec evaluation of a 3D single-precision spline at (x, y, z).
//
// val   receives the spline value
// grad  receives 3 floats: d/dx, d/dy, d/dz
// hess  receives 9 floats forming a symmetric 3x3 matrix:
//       [0]=xx [1]=xy [2]=xz / [3]=xy [4]=yy [5]=yz / [6]=xz [7]=yz [8]=zz
//
// Derivatives are computed in grid units and then scaled by the per-axis
// delta_inv factors. No range check is made on the position; the cell
// index is obtained with vec_ctu (conversion to unsigned int).
inline void
eval_UBspline_3d_s_vgh (UBspline_3d_s * restrict spline,
double x, double y, double z,
float* restrict val, float* restrict grad,
float* restrict hess)
{
// vec_dst: data-stream prefetch hint for the coefficient vectors starting
// at A0.
vec_dst (&A0, (12<<3) | (1<<8), 0);
/// Mesh point determination (vectorised: lanes are x, y, z, unused)
vector float xyz = MakeVec (x, y, z, 0.0);
vector float x0y0z0 = MakeVec ( spline->x_grid.start, spline->y_grid.start,
spline->z_grid.start, 0.0);
vector float delta_inv = MakeVec( spline->x_grid.delta_inv,
spline->y_grid.delta_inv,
spline->z_grid.delta_inv, 0.0 );
xyz = vec_sub (xyz, x0y0z0);
// ux = (x - x0)/delta_x and same for y and z
vector float uxuyuz = vec_madd (xyz, delta_inv, zero);
// fprintf (stderr, "uxuyuz = %vf\n", uxuyuz);
// intpart = floor (ux, uy, uz)
vector float intpart = vec_floor (uxuyuz);
// fprintf (stderr, "intpart = %vf\n", intpart);
vector unsigned int ixiyiz = vec_ctu (intpart, 0);
// Store to memory for use in C expressions; GetVec returns the lanes in
// order, the fourth lane is discarded into dummy.
int ix, iy, iz, dummy;
//fprintf (stderr, "ixiyiz = %vld\n", ixiyiz);
GetVec (ixiyiz, &ix, &iy, &iz, &dummy);
// fprintf (stderr, "ix = %d iy = %d iz = %d\n", ix, iy, iz);
int xs = spline->x_stride;
int ys = spline->y_stride;
// This macro is used to give the pointer to coefficient data.
// i and j should be in the range [0,3]. Coefficients are read four
// at a time, so no k value is needed.
#define P(i,j) ((float*)spline->coefs+(ix+(i))*xs+(iy+(j))*ys+(iz))
// Prefetch the data from main memory into cache so it's available
// when we need to use it.
// NOTE(review): control_word is only referenced by the commented-out
// vec_dstt calls below and ptr is never read -- both look like leftovers.
int control_word;
control_word = (2<<3) | (4<<8) | ((4*ys) << 16);
// fprintf (stderr, "control word = %x\n", control_word);
// fprintf (stderr, "ys = %d P(0,1)-P(0,0) = %d\n", ys,
// P(0,1)-P(0,0));
void *ptr = P(0,0);
// Cache-touch hints at byte offsets 0 and 12 of each of the 16 (i,j) rows.
__dcbt (P(0,0), 0); __dcbt (P(0,1), 0); __dcbt (P(0,2), 0); __dcbt (P(0,3), 0);
__dcbt (P(0,0), 12); __dcbt (P(0,1),12); __dcbt (P(0,2),12); __dcbt (P(0,3),12);
__dcbt (P(1,0), 0); __dcbt (P(1,1), 0); __dcbt (P(1,2), 0); __dcbt (P(1,3), 0);
__dcbt (P(1,0), 12); __dcbt (P(1,1),12); __dcbt (P(1,2),12); __dcbt (P(1,3),12);
__dcbt (P(2,0), 0); __dcbt (P(2,1), 0); __dcbt (P(2,2), 0); __dcbt (P(2,3), 0);
__dcbt (P(2,0), 12); __dcbt (P(2,1),12); __dcbt (P(2,2),12); __dcbt (P(2,3),12);
__dcbt (P(3,0), 0); __dcbt (P(3,1), 0); __dcbt (P(3,2), 0); __dcbt (P(3,3), 0);
__dcbt (P(3,0), 12); __dcbt (P(3,1),12); __dcbt (P(3,2),12); __dcbt (P(3,3),12);
// vec_dstt (P(0,0), control_word, 0);
// vec_dstt (P(1,0), control_word, 1);
// vec_dstt (P(2,0), control_word, 2);
// vec_dstt (P(3,0), control_word, 3);
// // Now compute the vectors:
// // tpx = [t_x^3 t_x^2 t_x 1]
// // tpy = [t_y^3 t_y^2 t_y 1]
// // tpz = [t_z^3 t_z^2 t_z 1]
vector float txtytz = vec_sub (uxuyuz, intpart);
vector float one = (vector float) ( 1.0, 1.0, 1.0, 0.0);
vector float t2 = vec_madd (txtytz, txtytz, zero);
vector float t3 = vec_madd (t2, txtytz, zero);
// vector float tpx = t3;
// vector float tpy = t2;
// vector float tpz = txtytz;
// vector float z2 = one;
// _TRANSPOSE4(z2, tpz, tpy, tpx);
// Before the transpose the rows are (t^3, t^2, t, 1) by power; after it
// tpx, tpy and tpz each hold the four powers of one coordinate.
vector float tpx = t3;
vector float tpy = t2;
vector float tpz = txtytz;
vector float z2 = one;
_TRANSPOSE4(tpx, tpy, tpz, z2);
// fprintf (stderr, "txtytz = %vf\n", txtytz);
// fprintf (stderr, "tpxyz %vf %vf %vf\n", tpx, tpy, tpz);
// fprintf (stderr, "ix,iy,iz = %d, %d, %d\n", ix, iy, iz);
// a = A * tpx, b = A * tpy, c = A * tpz
// da = dA * tpx, db = dA * tpy, dc = dA * tpz, etc.
// A is 4x4 matrix given by the rows A0, A1, A2, A3
// tmp4..tmp7 are declared but not used below.
vector float a, b, c, da, db, dc, d2a, d2b, d2c,
cP[4], dcP[4], d2cP[4], bcP, dbcP, bdcP, d2bcP, dbdcP, bd2cP,
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// x-dependent vectors
_MM_MATVEC4_PS ( A0, A1, A2, A3, tpx, a);
_MM_MATVEC4_PS ( dA0, dA1, dA2, dA3, tpx, da);
_MM_MATVEC4_PS (d2A0, d2A1, d2A2, d2A3, tpx, d2a);
// y-dependent vectors
_MM_MATVEC4_PS ( A0, A1, A2, A3, tpy, b);
_MM_MATVEC4_PS ( dA0, dA1, dA2, dA3, tpy, db);
_MM_MATVEC4_PS (d2A0, d2A1, d2A2, d2A3, tpy, d2b);
// z-dependent vectors
_MM_MATVEC4_PS ( A0, A1, A2, A3, tpz, c);
_MM_MATVEC4_PS ( dA0, dA1, dA2, dA3, tpz, dc);
_MM_MATVEC4_PS (d2A0, d2A1, d2A2, d2A3, tpz, d2c);
// fprintf (stderr, "a = %vf\n", a);
// fprintf (stderr, "b = %vf\n", b);
// fprintf (stderr, "c = %vf\n", c);
// Compute cP, dcP, and d2cP products 1/4 at a time to maximize
// register reuse and avoid rereading from memory or cache.
// 1st quarter
tmp0 = LoadUnaligned (P(0,0));
tmp1 = LoadUnaligned (P(0,1));
tmp2 = LoadUnaligned (P(0,2));
tmp3 = LoadUnaligned (P(0,3));
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, c, cP[0]);
// fprintf (stderr, "cP[0] = %vf\n", cP[0]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, dc, dcP[0]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[0]);
// 2nd quarter
tmp0 = LoadUnaligned (P(1,0));
tmp1 = LoadUnaligned (P(1,1));
tmp2 = LoadUnaligned (P(1,2));
tmp3 = LoadUnaligned (P(1,3));
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, c, cP[1]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, dc, dcP[1]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[1]);
// 3rd quarter
tmp0 = LoadUnaligned (P(2,0));
tmp1 = LoadUnaligned (P(2,1));
tmp2 = LoadUnaligned (P(2,2));
tmp3 = LoadUnaligned (P(2,3));
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, c, cP[2]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, dc, dcP[2]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[2]);
// 4th quarter
tmp0 = LoadUnaligned (P(3,0));
tmp1 = LoadUnaligned (P(3,1));
tmp2 = LoadUnaligned (P(3,2));
tmp3 = LoadUnaligned (P(3,3));
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, c, cP[3]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, dc, dcP[3]);
_MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[3]);
// Now compute bcP, dbcP, bdcP, d2bcP, bd2cP, and dbdc products
_MM_MATVEC4_PS ( cP[0], cP[1], cP[2], cP[3], b, bcP);
_MM_MATVEC4_PS ( cP[0], cP[1], cP[2], cP[3], db, dbcP);
_MM_MATVEC4_PS ( dcP[0], dcP[1], dcP[2], dcP[3], b, bdcP);
_MM_MATVEC4_PS ( cP[0], cP[1], cP[2], cP[3], d2b, d2bcP);
_MM_MATVEC4_PS (d2cP[0], d2cP[1], d2cP[2], d2cP[3], b, bd2cP);
_MM_MATVEC4_PS ( dcP[0], dcP[1], dcP[2], dcP[3], db, dbdcP);
vector float valgrad, hess4;
// fprintf (stderr, "a = %vf\n", a);
// fprintf (stderr, "bcP = %vf\n", bcP);
// valgrad lanes: value, d/dx, d/dy, d/dz. Each lane is splatted before
// vec_ste so that the single-element store writes the intended lane.
_4DOTS (a, bcP, da, bcP, a, dbcP, a, bdcP, valgrad);
// fprintf (stderr, "valgrad = %vf\n", valgrad);
tmp0 = vec_splat (valgrad, 0); vec_ste (tmp0, 0, val);
tmp0 = vec_splat (valgrad, 1); vec_ste (tmp0, 0, &(grad[0]));
tmp0 = vec_splat (valgrad, 2); vec_ste (tmp0, 0, &(grad[1]));
tmp0 = vec_splat (valgrad, 3); vec_ste (tmp0, 0, &(grad[2]));
// hess4 lanes: xx, yy, zz, xy.
_4DOTS (d2a, bcP, a, d2bcP, a, bd2cP, da, dbcP, hess4);
tmp0 = vec_splat (hess4, 0); vec_ste (tmp0, 0, &(hess[0]));
tmp0 = vec_splat (hess4, 1); vec_ste (tmp0, 0, &(hess[4]));
tmp0 = vec_splat (hess4, 2); vec_ste (tmp0, 0, &(hess[8]));
tmp0 = vec_splat (hess4, 3); vec_ste (tmp0, 0, &(hess[1]));
// hess4 lanes: xz, yz; the last two dot products (a.a) are padding and
// their results are not stored.
_4DOTS (da, bdcP, a, dbdcP, a, a, a, a, hess4);
tmp0 = vec_splat (hess4, 0); vec_ste (tmp0, 0, &(hess[2]));
tmp0 = vec_splat (hess4, 1); vec_ste (tmp0, 0, &(hess[5]));
// Compute value
// _MM_DOT4_PS (a, bcP, val);
// // Compute gradient
// _MM_DOT4_PS (da, bcP, &(grad[0]));
// _MM_DOT4_PS (a, dbcP, &(grad[1]));
// _MM_DOT4_PS (a, bdcP, &(grad[2]));
// // Compute hessian
// _MM_DOT4_PS (d2a, bcP, &(hess[0]));
// _MM_DOT4_PS (a, d2bcP, &(hess[4]));
// _MM_DOT4_PS (a, bd2cP, &(hess[8]));
// _MM_DOT4_PS (da, dbcP, &(hess[1]));
// _MM_DOT4_PS (da, bdcP, &(hess[2]));
// _MM_DOT4_PS (a, dbdcP, &(hess[5]));
// Multiply gradients and hessians by appropriate grid inverses
float dxInv = spline->x_grid.delta_inv;
float dyInv = spline->y_grid.delta_inv;
float dzInv = spline->z_grid.delta_inv;
grad[0] *= dxInv;
grad[1] *= dyInv;
grad[2] *= dzInv;
hess[0] *= dxInv*dxInv;
hess[4] *= dyInv*dyInv;
hess[8] *= dzInv*dzInv;
hess[1] *= dxInv*dyInv;
hess[2] *= dxInv*dzInv;
hess[5] *= dyInv*dzInv;
// Copy hessian elements into lower half of 3x3 matrix
hess[3] = hess[1];
hess[6] = hess[2];
hess[7] = hess[5];
#undef P
//fprintf (stderr, "%vf\n", xyz);
}
#undef _MM_MATVEC4_PS
#undef _MM_DOT4_PS
#endif

View File

@ -0,0 +1,32 @@
/* C-linkage entry points for batched evaluation of a single-precision 3D
 * spline (UBspline_3d_s_cuda).  Only the declarations live here; the
 * definitions are not part of this header.
 *
 * NOTE(review): the "_d" suffix on the pointer arguments presumably marks
 * device-memory pointers and "num" presumably counts the evaluation points
 * in the batch -- confirm both against the implementation. */
#ifndef BSPLINE_EVAL_CUDA_H
#define BSPLINE_EVAL_CUDA_H
#include "bspline_structs_cuda.h"
/* Values only. */
extern "C" void
eval_multi_UBspline_3d_s_cuda (UBspline_3d_s_cuda *spline,
float *pos_d, float *vals_d[], int num);
/* Values only, with an additional per-call sign_d array.
 * NOTE(review): how sign_d is applied is not visible here -- confirm. */
extern "C" void
eval_multi_UBspline_3d_s_sign_cuda (UBspline_3d_s_cuda *spline,
float *pos_d, float *sign_d,
float *vals_d[], int num);
/* Values, gradients and Hessians, each written through its own array of
 * output pointers. */
extern "C" void
eval_multi_UBspline_3d_s_vgh_cuda (UBspline_3d_s_cuda *spline,
float *pos_d, float *vals_d[], float *grads_d[],
float *hess_d[], int num);
/* Values plus a combined gradient/Laplacian output.
 * NOTE(review): Linv_d is presumably an inverse lattice matrix and
 * row_stride the spacing between rows of grad_lapl_d -- confirm. */
extern "C" void
eval_multi_UBspline_3d_s_vgl_cuda
(UBspline_3d_s_cuda *spline, float *pos_d, float *Linv_d,
float *vals_d[], float *grad_lapl_d[], int num, int row_stride);
/* Same outputs as the vgl variant, with an additional sign_d array.
 * NOTE(review): semantics of sign_d are not visible here -- confirm. */
extern "C" void
eval_multi_UBspline_3d_s_vgl_sign_cuda
(UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, float *Linv_d,
float *vals_d[], float *grad_lapl_d[], int num, int row_stride);
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,950 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_EVAL_STD_C_H
#define BSPLINE_EVAL_STD_C_H
#include <math.h>
#include <stdio.h>
/* Basis-coefficient tables used by every evaluator in this header.  They are
 * only declared here; the definitions must come from another translation
 * unit.  The evaluators read each table as four groups of four floats
 * (indices 4*k .. 4*k+3, one group per contributing spline coefficient):
 *   Af   - all four entries of a group multiply (t^3, t^2, t, 1) for values;
 *   dAf  - entries 1..3 of a group multiply (t^2, t, 1) for first
 *          derivatives (entry 0 is not read by the functions in this header);
 *   d2Af - entries 2..3 of a group multiply (t, 1) for second derivatives
 *          (entries 0 and 1 are not read by the functions in this header). */
extern const float* restrict Af;
extern const float* restrict dAf;
extern const float* restrict d2Af;
/************************************************************/
/* 1D single-precision, complex evaluation functions */
/************************************************************/
/* Evaluate a 1D complex, single-precision spline at x: value only.
 *
 *   spline - spline object; its x_grid (start, delta_inv) and coefs are read
 *   x      - evaluation point, measured on the same axis as x_grid.start
 *   val    - receives the interpolated complex value
 *
 * The cell index derived from x is used directly to index coefs; no range
 * check is performed here. */
inline void
eval_UBspline_1d_c (UBspline_1d_c * restrict spline,
                    double x, complex_float* restrict val)
{
  /* Position in grid units, split into an integer cell and a fraction. */
  x -= spline->x_grid.start;
  float u = x*spline->x_grid.delta_inv;
  float cell;
  const float t = modff (u, &cell);
  const int i0 = (int) cell;

  /* Cubic monomials of the in-cell offset, highest power first. */
  const float pw[4] = { t*t*t, t*t, t, 1.0f };

  /* One basis weight per contributing coefficient: group k of Af dotted
     with the monomials. */
  float w[4];
  for (int k = 0; k < 4; k++) {
    const float* row = Af + 4*k;
    w[k] = row[0]*pw[0] + row[1]*pw[1] + row[2]*pw[2] + row[3]*pw[3];
  }

  complex_float* restrict c = spline->coefs;
  *val = (c[i0+0]*w[0] + c[i0+1]*w[1] + c[i0+2]*w[2] + c[i0+3]*w[3]);
}
/* Evaluate a 1D complex, single-precision spline at x: value and first
 * derivative.
 *
 *   spline - spline object; its x_grid (start, delta_inv) and coefs are read
 *   x      - evaluation point, measured on the same axis as x_grid.start
 *   val    - receives the interpolated complex value
 *   grad   - receives d(val)/dx, already scaled by x_grid.delta_inv
 *
 * No range check is performed on the cell index derived from x. */
inline void
eval_UBspline_1d_c_vg (UBspline_1d_c * restrict spline, double x,
                       complex_float* restrict val, complex_float* restrict grad)
{
  /* Position in grid units, split into an integer cell and a fraction. */
  x -= spline->x_grid.start;
  float u = x*spline->x_grid.delta_inv;
  float cell;
  const float t = modff (u, &cell);
  const int i0 = (int) cell;

  /* Cubic monomials of the in-cell offset, highest power first. */
  const float pw[4] = { t*t*t, t*t, t, 1.0f };

  /* Value and derivative weights for each of the four coefficients. */
  float w[4], dw[4];
  for (int k = 0; k < 4; k++) {
    const float* v  = Af  + 4*k;
    const float* dv = dAf + 4*k;
    w[k]  = v[0]*pw[0] + v[1]*pw[1] + v[2]*pw[2] + v[3]*pw[3];
    dw[k] = dv[1]*pw[1] + dv[2]*pw[2] + dv[3]*pw[3];
  }

  complex_float* restrict c = spline->coefs;
  const float dxInv = spline->x_grid.delta_inv;

  *val = (c[i0+0]*w[0] + c[i0+1]*w[1] + c[i0+2]*w[2] + c[i0+3]*w[3]);
  /* The derivative is taken in grid units, so convert with 1/delta. */
  *grad = dxInv *
    (c[i0+0]*dw[0] + c[i0+1]*dw[1] + c[i0+2]*dw[2] + c[i0+3]*dw[3]);
}
/* Evaluate a 1D complex, single-precision spline at x: value, first
 * derivative and second derivative.
 *
 *   spline - spline object; its x_grid (start, delta_inv) and coefs are read
 *   x      - evaluation point, measured on the same axis as x_grid.start
 *   val    - receives the interpolated complex value
 *   grad   - receives the first derivative, scaled by delta_inv
 *   lapl   - receives the second derivative, scaled by delta_inv^2
 *
 * No range check is performed on the cell index derived from x. */
inline void
eval_UBspline_1d_c_vgl (UBspline_1d_c * restrict spline, double x,
                        complex_float* restrict val, complex_float* restrict grad,
                        complex_float* restrict lapl)
{
  /* Position in grid units, split into an integer cell and a fraction. */
  x -= spline->x_grid.start;
  float u = x*spline->x_grid.delta_inv;
  float cell;
  const float t = modff (u, &cell);
  const int i0 = (int) cell;

  /* Cubic monomials of the in-cell offset, highest power first. */
  const float pw[4] = { t*t*t, t*t, t, 1.0f };

  /* Value, first- and second-derivative weights per coefficient. */
  float w[4], dw[4], d2w[4];
  for (int k = 0; k < 4; k++) {
    const float* v   = Af   + 4*k;
    const float* dv  = dAf  + 4*k;
    const float* d2v = d2Af + 4*k;
    w[k]   = v[0]*pw[0] + v[1]*pw[1] + v[2]*pw[2] + v[3]*pw[3];
    dw[k]  = dv[1]*pw[1] + dv[2]*pw[2] + dv[3]*pw[3];
    d2w[k] = d2v[2]*pw[2] + d2v[3]*pw[3];
  }

  complex_float* restrict c = spline->coefs;
  const float dxInv = spline->x_grid.delta_inv;

  *val = (c[i0+0]*w[0] + c[i0+1]*w[1] + c[i0+2]*w[2] + c[i0+3]*w[3]);
  *grad = dxInv *
    (c[i0+0]*dw[0] + c[i0+1]*dw[1] + c[i0+2]*dw[2] + c[i0+3]*dw[3]);
  *lapl = dxInv * dxInv *
    (c[i0+0]*d2w[0] + c[i0+1]*d2w[1] + c[i0+2]*d2w[2] + c[i0+3]*d2w[3]);
}
/* Value, first derivative and "Hessian" of a 1D complex spline.
 * This forwards to eval_UBspline_1d_c_vgl, passing hess where that function
 * takes lapl, so *hess receives the same second derivative that the vgl
 * variant stores in *lapl. */
inline void
eval_UBspline_1d_c_vgh (UBspline_1d_c * restrict spline, double x,
complex_float* restrict val,
complex_float* restrict grad,
complex_float* restrict hess)
{
eval_UBspline_1d_c_vgl (spline, x, val, grad, hess);
}
/************************************************************/
/* 2D single-precision, complex evaluation functions */
/************************************************************/
/* Evaluate a 2D complex, single-precision spline at (x, y): value only.
 *
 *   spline - spline object; x_grid, y_grid, x_stride and coefs are read
 *   x, y   - evaluation point, measured on the axes of x_grid / y_grid
 *   val    - receives the interpolated complex value
 *
 * Coefficient (i, j) of the 4x4 stencil is read from
 * coefs[(ix+i)*x_stride + iy + j].  No range check is performed on ix, iy. */
inline void
eval_UBspline_2d_c (UBspline_2d_c * restrict spline,
                    double x, double y, complex_float* restrict val)
{
  /* Grid-unit coordinates, split into integer cell and fraction per axis. */
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  float ux = x*spline->x_grid.delta_inv;
  float uy = y*spline->y_grid.delta_inv;
  float cellx, celly;
  const float tx = modff (ux, &cellx);
  const float ty = modff (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Cubic monomials of the in-cell offsets, highest power first. */
  const float px[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
  const float py[4] = { ty*ty*ty, ty*ty, ty, 1.0f };

  /* Basis weights along x (a) and y (b). */
  float a[4], b[4];
  for (int k = 0; k < 4; k++) {
    const float* v = Af + 4*k;
    a[k] = v[0]*px[0] + v[1]*px[1] + v[2]*px[2] + v[3]*px[3];
    b[k] = v[0]*py[0] + v[1]*py[1] + v[2]*py[2] + v[3]*py[3];
  }

  complex_float* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;

  /* Contract each stencil row with the y weights, then combine the four
     rows with the x weights. */
  complex_float Cb[4];
  for (int i = 0; i < 4; i++) {
    const complex_float* r = coefs + ((ix+i)*xs + iy);
    Cb[i] = r[0]*b[0] + r[1]*b[1] + r[2]*b[2] + r[3]*b[3];
  }
  *val = (a[0]*Cb[0] + a[1]*Cb[1] + a[2]*Cb[2] + a[3]*Cb[3]);
}
/* Evaluate a 2D complex, single-precision spline at (x, y): value and
 * gradient.
 *
 *   spline - spline object; x_grid, y_grid, x_stride and coefs are read
 *   x, y   - evaluation point, measured on the axes of x_grid / y_grid
 *   val    - receives the interpolated complex value
 *   grad   - two entries are written: grad[0] = d/dx (scaled by the x
 *            delta_inv), grad[1] = d/dy (scaled by the y delta_inv)
 *
 * No range check is performed on the cell indices derived from x and y. */
inline void
eval_UBspline_2d_c_vg (UBspline_2d_c * restrict spline,
                       double x, double y,
                       complex_float* restrict val, complex_float* restrict grad)
{
  /* Grid-unit coordinates, split into integer cell and fraction per axis. */
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  float ux = x*spline->x_grid.delta_inv;
  float uy = y*spline->y_grid.delta_inv;
  float cellx, celly;
  const float tx = modff (ux, &cellx);
  const float ty = modff (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Cubic monomials of the in-cell offsets, highest power first. */
  const float px[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
  const float py[4] = { ty*ty*ty, ty*ty, ty, 1.0f };

  /* Value (a, b) and first-derivative (da, db) weights along x and y. */
  float a[4], b[4], da[4], db[4];
  for (int k = 0; k < 4; k++) {
    const float* v  = Af  + 4*k;
    const float* dv = dAf + 4*k;
    a[k]  = v[0]*px[0] + v[1]*px[1] + v[2]*px[2] + v[3]*px[3];
    da[k] = dv[1]*px[1] + dv[2]*px[2] + dv[3]*px[3];
    b[k]  = v[0]*py[0] + v[1]*py[1] + v[2]*py[2] + v[3]*py[3];
    db[k] = dv[1]*py[1] + dv[2]*py[2] + dv[3]*py[3];
  }

  complex_float* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;
  const float dxInv = spline->x_grid.delta_inv;
  const float dyInv = spline->y_grid.delta_inv;

  /* Per stencil row: contraction with the y value weights (Cb) and with
     the y derivative weights (Cdb). */
  complex_float Cb[4], Cdb[4];
  for (int i = 0; i < 4; i++) {
    const complex_float* r = coefs + ((ix+i)*xs + iy);
    Cb[i]  = r[0]*b[0]  + r[1]*b[1]  + r[2]*b[2]  + r[3]*b[3];
    Cdb[i] = r[0]*db[0] + r[1]*db[1] + r[2]*db[2] + r[3]*db[3];
  }

  *val =
    (a[0]*Cb[0] + a[1]*Cb[1] + a[2]*Cb[2] + a[3]*Cb[3]);
  grad[0] = dxInv *
    (da[0]*Cb[0] + da[1]*Cb[1] + da[2]*Cb[2] + da[3]*Cb[3]);
  grad[1] = dyInv *
    (a[0]*Cdb[0] + a[1]*Cdb[1] + a[2]*Cdb[2] + a[3]*Cdb[3]);
}
/* Evaluate a 2D complex, single-precision spline at (x, y): value, gradient
 * and Laplacian.
 *
 *   spline - spline object; x_grid, y_grid, x_stride and coefs are read
 *   x, y   - evaluation point, measured on the axes of x_grid / y_grid
 *   val    - receives the interpolated complex value
 *   grad   - two entries are written: grad[0] = d/dx, grad[1] = d/dy, each
 *            scaled by the matching delta_inv
 *   lapl   - receives d2/dy2 + d2/dx2, each term scaled by the square of
 *            the matching delta_inv
 *
 * No range check is performed on the cell indices derived from x and y. */
inline void
eval_UBspline_2d_c_vgl (UBspline_2d_c * restrict spline,
                        double x, double y, complex_float* restrict val,
                        complex_float* restrict grad, complex_float* restrict lapl)
{
  /* Grid-unit coordinates, split into integer cell and fraction per axis. */
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  float ux = x*spline->x_grid.delta_inv;
  float uy = y*spline->y_grid.delta_inv;
  float cellx, celly;
  const float tx = modff (ux, &cellx);
  const float ty = modff (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Cubic monomials of the in-cell offsets, highest power first. */
  const float px[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
  const float py[4] = { ty*ty*ty, ty*ty, ty, 1.0f };

  /* Value, first- and second-derivative weights along x (a*) and y (b*). */
  float a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int k = 0; k < 4; k++) {
    const float* v   = Af   + 4*k;
    const float* dv  = dAf  + 4*k;
    const float* d2v = d2Af + 4*k;
    a[k]   = v[0]*px[0] + v[1]*px[1] + v[2]*px[2] + v[3]*px[3];
    da[k]  = dv[1]*px[1] + dv[2]*px[2] + dv[3]*px[3];
    d2a[k] = d2v[2]*px[2] + d2v[3]*px[3];
    b[k]   = v[0]*py[0] + v[1]*py[1] + v[2]*py[2] + v[3]*py[3];
    db[k]  = dv[1]*py[1] + dv[2]*py[2] + dv[3]*py[3];
    d2b[k] = d2v[2]*py[2] + d2v[3]*py[3];
  }

  complex_float* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;
  const float dxInv = spline->x_grid.delta_inv;
  const float dyInv = spline->y_grid.delta_inv;

  /* Per stencil row: contraction with the y value, first-derivative and
     second-derivative weights. */
  complex_float Cb[4], Cdb[4], Cd2b[4];
  for (int i = 0; i < 4; i++) {
    const complex_float* r = coefs + ((ix+i)*xs + iy);
    Cb[i]   = r[0]*b[0]   + r[1]*b[1]   + r[2]*b[2]   + r[3]*b[3];
    Cdb[i]  = r[0]*db[0]  + r[1]*db[1]  + r[2]*db[2]  + r[3]*db[3];
    Cd2b[i] = r[0]*d2b[0] + r[1]*d2b[1] + r[2]*d2b[2] + r[3]*d2b[3];
  }

  *val =
    (a[0]*Cb[0] + a[1]*Cb[1] + a[2]*Cb[2] + a[3]*Cb[3]);
  grad[0] = dxInv *
    (da[0]*Cb[0] + da[1]*Cb[1] + da[2]*Cb[2] + da[3]*Cb[3]);
  grad[1] = dyInv *
    (a[0]*Cdb[0] + a[1]*Cdb[1] + a[2]*Cdb[2] + a[3]*Cdb[3]);
  /* y-curvature term first, then x-curvature term. */
  *lapl =
    dyInv * dyInv *
    (a[0]*Cd2b[0] + a[1]*Cd2b[1] + a[2]*Cd2b[2] + a[3]*Cd2b[3]) +
    dxInv * dxInv *
    (d2a[0]*Cb[0] + d2a[1]*Cb[1] + d2a[2]*Cb[2] + d2a[3]*Cb[3]);
}
/* Evaluate a 2D complex, single-precision spline at (x, y): value, gradient
 * and Hessian.
 *
 *   spline - spline object; x_grid, y_grid, x_stride and coefs are read
 *   x, y   - evaluation point, measured on the axes of x_grid / y_grid
 *   val    - receives the interpolated complex value
 *   grad   - two entries are written: grad[0] = d/dx, grad[1] = d/dy
 *   hess   - four entries are written: hess[0] = d2/dx2, hess[1] = d2/dxdy,
 *            hess[3] = d2/dy2, and hess[2] is a copy of hess[1]
 *
 * Derivatives are scaled by the matching delta_inv factors.  No range check
 * is performed on the cell indices derived from x and y. */
inline void
eval_UBspline_2d_c_vgh (UBspline_2d_c * restrict spline,
                        double x, double y, complex_float* restrict val,
                        complex_float* restrict grad, complex_float* restrict hess)
{
  /* Grid-unit coordinates, split into integer cell and fraction per axis. */
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  float ux = x*spline->x_grid.delta_inv;
  float uy = y*spline->y_grid.delta_inv;
  float cellx, celly;
  const float tx = modff (ux, &cellx);
  const float ty = modff (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Cubic monomials of the in-cell offsets, highest power first. */
  const float px[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
  const float py[4] = { ty*ty*ty, ty*ty, ty, 1.0f };

  /* Value, first- and second-derivative weights along x (a*) and y (b*). */
  float a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int k = 0; k < 4; k++) {
    const float* v   = Af   + 4*k;
    const float* dv  = dAf  + 4*k;
    const float* d2v = d2Af + 4*k;
    a[k]   = v[0]*px[0] + v[1]*px[1] + v[2]*px[2] + v[3]*px[3];
    da[k]  = dv[1]*px[1] + dv[2]*px[2] + dv[3]*px[3];
    d2a[k] = d2v[2]*px[2] + d2v[3]*px[3];
    b[k]   = v[0]*py[0] + v[1]*py[1] + v[2]*py[2] + v[3]*py[3];
    db[k]  = dv[1]*py[1] + dv[2]*py[2] + dv[3]*py[3];
    d2b[k] = d2v[2]*py[2] + d2v[3]*py[3];
  }

  complex_float* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;
  const float dxInv = spline->x_grid.delta_inv;
  const float dyInv = spline->y_grid.delta_inv;

  /* Per stencil row: contraction with the y value, first-derivative and
     second-derivative weights. */
  complex_float Cb[4], Cdb[4], Cd2b[4];
  for (int i = 0; i < 4; i++) {
    const complex_float* r = coefs + ((ix+i)*xs + iy);
    Cb[i]   = r[0]*b[0]   + r[1]*b[1]   + r[2]*b[2]   + r[3]*b[3];
    Cdb[i]  = r[0]*db[0]  + r[1]*db[1]  + r[2]*db[2]  + r[3]*db[3];
    Cd2b[i] = r[0]*d2b[0] + r[1]*d2b[1] + r[2]*d2b[2] + r[3]*d2b[3];
  }

  *val =
    (a[0]*Cb[0] + a[1]*Cb[1] + a[2]*Cb[2] + a[3]*Cb[3]);
  grad[0] = dxInv *
    (da[0]*Cb[0] + da[1]*Cb[1] + da[2]*Cb[2] + da[3]*Cb[3]);
  grad[1] = dyInv *
    (a[0]*Cdb[0] + a[1]*Cdb[1] + a[2]*Cdb[2] + a[3]*Cdb[3]);
  hess[0] = dxInv * dxInv *
    (d2a[0]*Cb[0] + d2a[1]*Cb[1] + d2a[2]*Cb[2] + d2a[3]*Cb[3]);
  hess[1] = dxInv * dyInv *
    (da[0]*Cdb[0] + da[1]*Cdb[1] + da[2]*Cdb[2] + da[3]*Cdb[3]);
  hess[3] = dyInv * dyInv *
    (a[0]*Cd2b[0] + a[1]*Cd2b[1] + a[2]*Cd2b[2] + a[3]*Cd2b[3]);
  /* Mixed derivative is mirrored into the other off-diagonal slot. */
  hess[2] = hess[1];
}
/************************************************************/
/* 3D single-precision, complex evaluation functions */
/************************************************************/
/* Evaluate a 3D complex, single-precision spline at (x, y, z): value only.
 *
 *   spline  - spline object; x_grid, y_grid, z_grid, x_stride, y_stride and
 *             coefs are read
 *   x, y, z - evaluation point, measured on the axes of the three grids
 *   val     - receives the interpolated complex value
 *
 * Coefficient (i, j, k) of the 4x4x4 stencil is read from
 * coefs[(ix+i)*x_stride + (iy+j)*y_stride + iz + k].  No range check is
 * performed on ix, iy, iz. */
inline void
eval_UBspline_3d_c (UBspline_3d_c * restrict spline,
                    double x, double y, double z,
                    complex_float* restrict val)
{
  /* Grid-unit coordinates, split into integer cell and fraction per axis. */
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  z -= spline->z_grid.start;
  float ux = x*spline->x_grid.delta_inv;
  float uy = y*spline->y_grid.delta_inv;
  float uz = z*spline->z_grid.delta_inv;
  float cellx, celly, cellz;
  const float tx = modff (ux, &cellx);
  const int ix = (int) cellx;
  const float ty = modff (uy, &celly);
  const int iy = (int) celly;
  const float tz = modff (uz, &cellz);
  const int iz = (int) cellz;

  /* Cubic monomials of the in-cell offsets, highest power first. */
  const float px[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
  const float py[4] = { ty*ty*ty, ty*ty, ty, 1.0f };
  const float pz[4] = { tz*tz*tz, tz*tz, tz, 1.0f };

  /* Basis weights along x (a), y (b) and z (c). */
  float a[4], b[4], c[4];
  for (int k = 0; k < 4; k++) {
    const float* v = Af + 4*k;
    a[k] = v[0]*px[0] + v[1]*px[1] + v[2]*px[2] + v[3]*px[3];
    b[k] = v[0]*py[0] + v[1]*py[1] + v[2]*py[2] + v[3]*py[3];
    c[k] = v[0]*pz[0] + v[1]*pz[1] + v[2]*pz[2] + v[3]*pz[3];
  }

  complex_float* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;
  const int ys = spline->y_stride;

  /* Contract z first (one line of four coefficients at a time), then y,
     then x. */
  complex_float plane[4];
  for (int i = 0; i < 4; i++) {
    complex_float line[4];
    for (int j = 0; j < 4; j++) {
      const complex_float* p = coefs + ((ix+i)*xs + (iy+j)*ys + iz);
      line[j] = p[0]*c[0] + p[1]*c[1] + p[2]*c[2] + p[3]*c[3];
    }
    plane[i] = b[0]*line[0] + b[1]*line[1] + b[2]*line[2] + b[3]*line[3];
  }
  *val = (a[0]*plane[0] + a[1]*plane[1] + a[2]*plane[2] + a[3]*plane[3]);
}
/* Evaluate a 3D complex, single-precision spline at (x, y, z): value and
 * gradient.
 *
 *   spline  - spline object; x_grid, y_grid, z_grid, x_stride, y_stride and
 *             coefs are read
 *   x, y, z - evaluation point, measured on the axes of the three grids
 *   val     - receives the interpolated complex value
 *   grad    - three entries are written: d/dx, d/dy, d/dz, each scaled by
 *             the matching delta_inv
 *
 * No range check is performed on the cell indices derived from x, y, z. */
inline void
eval_UBspline_3d_c_vg (UBspline_3d_c * restrict spline,
                       double x, double y, double z,
                       complex_float* restrict val, complex_float* restrict grad)
{
  /* Grid-unit coordinates, split into integer cell and fraction per axis. */
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  z -= spline->z_grid.start;
  float ux = x*spline->x_grid.delta_inv;
  float uy = y*spline->y_grid.delta_inv;
  float uz = z*spline->z_grid.delta_inv;
  float cellx, celly, cellz;
  const float tx = modff (ux, &cellx);
  const int ix = (int) cellx;
  const float ty = modff (uy, &celly);
  const int iy = (int) celly;
  const float tz = modff (uz, &cellz);
  const int iz = (int) cellz;

  /* Cubic monomials of the in-cell offsets, highest power first. */
  const float px[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
  const float py[4] = { ty*ty*ty, ty*ty, ty, 1.0f };
  const float pz[4] = { tz*tz*tz, tz*tz, tz, 1.0f };

  /* Value (a, b, c) and first-derivative (da, db, dc) weights per axis. */
  float a[4], b[4], c[4], da[4], db[4], dc[4];
  for (int k = 0; k < 4; k++) {
    const float* v  = Af  + 4*k;
    const float* dv = dAf + 4*k;
    a[k]  = v[0]*px[0] + v[1]*px[1] + v[2]*px[2] + v[3]*px[3];
    da[k] = dv[1]*px[1] + dv[2]*px[2] + dv[3]*px[3];
    b[k]  = v[0]*py[0] + v[1]*py[1] + v[2]*py[2] + v[3]*py[3];
    db[k] = dv[1]*py[1] + dv[2]*py[2] + dv[3]*py[3];
    c[k]  = v[0]*pz[0] + v[1]*pz[1] + v[2]*pz[2] + v[3]*pz[3];
    dc[k] = dv[1]*pz[1] + dv[2]*pz[2] + dv[3]*pz[3];
  }

  complex_float* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;
  const int ys = spline->y_stride;
  const float dxInv = spline->x_grid.delta_inv;
  const float dyInv = spline->y_grid.delta_inv;
  const float dzInv = spline->z_grid.delta_inv;

  /* For every x-slab i: contract z with c and dc, then contract y with
     b / db.  bcP feeds the value and d/dx, dbcP feeds d/dy, bdcP feeds
     d/dz. */
  complex_float bcP[4], dbcP[4], bdcP[4];
  for (int i = 0; i < 4; i++) {
    complex_float cP[4], dcP[4];
    for (int j = 0; j < 4; j++) {
      const complex_float* p = coefs + ((ix+i)*xs + (iy+j)*ys + iz);
      cP[j]  = p[0]*c[0]  + p[1]*c[1]  + p[2]*c[2]  + p[3]*c[3];
      dcP[j] = p[0]*dc[0] + p[1]*dc[1] + p[2]*dc[2] + p[3]*dc[3];
    }
    bcP[i]  = b[0]*cP[0]  + b[1]*cP[1]  + b[2]*cP[2]  + b[3]*cP[3];
    dbcP[i] = db[0]*cP[0] + db[1]*cP[1] + db[2]*cP[2] + db[3]*cP[3];
    bdcP[i] = b[0]*dcP[0] + b[1]*dcP[1] + b[2]*dcP[2] + b[3]*dcP[3];
  }

  *val = (a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3]);
  grad[0] = dxInv *
    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
  grad[1] = dyInv *
    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
  grad[2] = dzInv *
    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
}
/* Value, gradient, and laplacian */
inline void
eval_UBspline_3d_c_vgl (UBspline_3d_c * restrict spline,
double x, double y, double z,
complex_float* restrict val, complex_float* restrict grad,
complex_float* restrict lapl)
{
x -= spline->x_grid.start;
y -= spline->y_grid.start;
z -= spline->z_grid.start;
float ux = x*spline->x_grid.delta_inv;
float uy = y*spline->y_grid.delta_inv;
float uz = z*spline->z_grid.delta_inv;
float ipartx, iparty, ipartz, tx, ty, tz;
tx = modff (ux, &ipartx); int ix = (int) ipartx;
ty = modff (uy, &iparty); int iy = (int) iparty;
tz = modff (uz, &ipartz); int iz = (int) ipartz;
float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4],
d2a[4], d2b[4], d2c[4];
complex_float cP[16], dcP[16], bcP[4], dbcP[4], d2bcP[4], bdcP[4];
tpx[0] = tx*tx*tx; tpx[1] = tx*tx; tpx[2] = tx; tpx[3] = 1.0;
tpy[0] = ty*ty*ty; tpy[1] = ty*ty; tpy[2] = ty; tpy[3] = 1.0;
tpz[0] = tz*tz*tz; tpz[1] = tz*tz; tpz[2] = tz; tpz[3] = 1.0;
complex_float* restrict coefs = spline->coefs;
a[0] = ( Af[ 0]*tpx[0] + Af[ 1]*tpx[1] + Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
a[1] = ( Af[ 4]*tpx[0] + Af[ 5]*tpx[1] + Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
a[2] = ( Af[ 8]*tpx[0] + Af[ 9]*tpx[1] + Af[10]*tpx[2] + Af[11]*tpx[3]);
a[3] = ( Af[12]*tpx[0] + Af[13]*tpx[1] + Af[14]*tpx[2] + Af[15]*tpx[3]);
da[0] = ( dAf[ 1]*tpx[1] + dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);
da[1] = ( dAf[ 5]*tpx[1] + dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
da[2] = ( dAf[ 9]*tpx[1] + dAf[10]*tpx[2] + dAf[11]*tpx[3]);
da[3] = ( dAf[13]*tpx[1] + dAf[14]*tpx[2] + dAf[15]*tpx[3]);
d2a[0] = (d2Af[ 2]*tpx[2] + d2Af[ 3]*tpx[3]);
d2a[1] = (d2Af[ 6]*tpx[2] + d2Af[ 7]*tpx[3]);
d2a[2] = (d2Af[10]*tpx[2] + d2Af[11]*tpx[3]);
d2a[3] = (d2Af[14]*tpx[2] + d2Af[15]*tpx[3]);
b[0] = ( Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
b[1] = ( Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
b[2] = ( Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
b[3] = ( Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
db[0] = (dAf[ 1]*tpy[1] + dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
db[1] = (dAf[ 5]*tpy[1] + dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
db[2] = (dAf[ 9]*tpy[1] + dAf[10]*tpy[2] + dAf[11]*tpy[3]);
db[3] = (dAf[13]*tpy[1] + dAf[14]*tpy[2] + dAf[15]*tpy[3]);
d2b[0] = (d2Af[ 2]*tpy[2] + d2Af[ 3]*tpy[3]);
d2b[1] = (d2Af[ 6]*tpy[2] + d2Af[ 7]*tpy[3]);
d2b[2] = (d2Af[10]*tpy[2] + d2Af[11]*tpy[3]);
d2b[3] = (d2Af[14]*tpy[2] + d2Af[15]*tpy[3]);
c[0] = ( Af[ 0]*tpz[0] + Af[ 1]*tpz[1] + Af[ 2]*tpz[2] + Af[ 3]*tpz[3]);
c[1] = ( Af[ 4]*tpz[0] + Af[ 5]*tpz[1] + Af[ 6]*tpz[2] + Af[ 7]*tpz[3]);
c[2] = ( Af[ 8]*tpz[0] + Af[ 9]*tpz[1] + Af[10]*tpz[2] + Af[11]*tpz[3]);
c[3] = ( Af[12]*tpz[0] + Af[13]*tpz[1] + Af[14]*tpz[2] + Af[15]*tpz[3]);
dc[0] = (dAf[ 1]*tpz[1] + dAf[ 2]*tpz[2] + dAf[ 3]*tpz[3]);
dc[1] = (dAf[ 5]*tpz[1] + dAf[ 6]*tpz[2] + dAf[ 7]*tpz[3]);
dc[2] = (dAf[ 9]*tpz[1] + dAf[10]*tpz[2] + dAf[11]*tpz[3]);
dc[3] = (dAf[13]*tpz[1] + dAf[14]*tpz[2] + dAf[15]*tpz[3]);
d2c[0] = (d2Af[ 2]*tpz[2] + d2Af[ 3]*tpz[3]);
d2c[1] = (d2Af[ 6]*tpz[2] + d2Af[ 7]*tpz[3]);
d2c[2] = (d2Af[10]*tpz[2] + d2Af[11]*tpz[3]);
d2c[3] = (d2Af[14]*tpz[2] + d2Af[15]*tpz[3]);
int xs = spline->x_stride;
int ys = spline->y_stride;
float dxInv = spline->x_grid.delta_inv;
float dyInv = spline->y_grid.delta_inv;
float dzInv = spline->z_grid.delta_inv;
#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
*val =
( a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3]);
grad[0] = dxInv *
(da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
grad[1] = dyInv *
(a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
grad[2] = dzInv *
(a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
*lapl =
dxInv * dxInv *
(d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3])
+ dyInv * dyInv *
(a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]) +
+ dzInv * dzInv *
(a[0]*(b[0]*(P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3])+
b[1]*(P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3])+
b[2]*(P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3])+
b[3]*(P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]))+
a[1]*(b[0]*(P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3])+
b[1]*(P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3])+
b[2]*(P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3])+
b[3]*(P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]))+
a[2]*(b[0]*(P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3])+
b[1]*(P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3])+
b[2]*(P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3])+
b[3]*(P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]))+
a[3]*(b[0]*(P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3])+
b[1]*(P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3])+
b[2]*(P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3])+
b[3]*(P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3])));
#undef P
}
/* Value, gradient, and Hessian */
/*
 * Evaluates a 3D uniform B-spline with single-precision complex
 * coefficients at the point (x, y, z).
 *
 *   spline  spline object; this routine reads x_grid/y_grid/z_grid
 *           (start, delta_inv, num), x_stride, y_stride and coefs.
 *   x,y,z   evaluation point.
 *   val     out: spline value.
 *   grad    out: 3 entries, the x, y and z first derivatives.
 *   hess    out: 9 entries laid out as a row-major 3x3 matrix
 *           (xx xy xz / yx yy yz / zx zy zz).  Only the upper triangle is
 *           computed; the lower triangle is copied from it.
 *
 * Af, dAf and d2Af are tables defined outside this function.  They are
 * indexed here as four rows of four entries; row n is contracted with the
 * powers of the fractional coordinate to give the n-th basis weight and
 * its first and second derivative.
 */
inline void
eval_UBspline_3d_c_vgh (UBspline_3d_c * restrict spline,
                        double x, double y, double z,
                        complex_float* restrict val, complex_float* restrict grad,
                        complex_float* restrict hess)
{
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  z -= spline->z_grid.start;
  float ux = x*spline->x_grid.delta_inv;
  float uy = y*spline->y_grid.delta_inv;
  float uz = z*spline->z_grid.delta_inv;
  /* Cap the reduced coordinates at (grid.num - 1.0e-5).  Only the upper
     side is capped; there is no lower-bound check here. */
  ux = fmin (ux, (double)(spline->x_grid.num)-1.0e-5);
  uy = fmin (uy, (double)(spline->y_grid.num)-1.0e-5);
  uz = fmin (uz, (double)(spline->z_grid.num)-1.0e-5);
  /* Split each reduced coordinate into a cell index and a fraction. */
  float ipartx, iparty, ipartz, tx, ty, tz;
  tx = modff (ux, &ipartx);  int ix = (int) ipartx;
  ty = modff (uy, &iparty);  int iy = (int) iparty;
  tz = modff (uz, &ipartz);  int iz = (int) ipartz;

  float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4],
    d2a[4], d2b[4], d2c[4];
  complex_float cP[16], dcP[16], d2cP[16], bcP[4], dbcP[4],
    d2bcP[4], dbdcP[4], bd2cP[4], bdcP[4];
  /* Powers of the fractional coordinate: t^3, t^2, t, 1. */
  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
  complex_float* restrict coefs = spline->coefs;

  /* Basis weights (a, b, c) and their first/second derivatives along
     x, y and z.  The first-derivative rows skip the t^3 column and the
     second-derivative rows skip the t^3 and t^2 columns. */
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    a[n]   = (  Af[r+0]*tpx[0] +   Af[r+1]*tpx[1] +   Af[r+2]*tpx[2] +   Af[r+3]*tpx[3]);
    da[n]  = ( dAf[r+1]*tpx[1] +  dAf[r+2]*tpx[2] +  dAf[r+3]*tpx[3]);
    d2a[n] = (d2Af[r+2]*tpx[2] + d2Af[r+3]*tpx[3]);

    b[n]   = (  Af[r+0]*tpy[0] +   Af[r+1]*tpy[1] +   Af[r+2]*tpy[2] +   Af[r+3]*tpy[3]);
    db[n]  = ( dAf[r+1]*tpy[1] +  dAf[r+2]*tpy[2] +  dAf[r+3]*tpy[3]);
    d2b[n] = (d2Af[r+2]*tpy[2] + d2Af[r+3]*tpy[3]);

    c[n]   = (  Af[r+0]*tpz[0] +   Af[r+1]*tpz[1] +   Af[r+2]*tpz[2] +   Af[r+3]*tpz[3]);
    dc[n]  = ( dAf[r+1]*tpz[1] +  dAf[r+2]*tpz[2] +  dAf[r+3]*tpz[3]);
    d2c[n] = (d2Af[r+2]*tpz[2] + d2Af[r+3]*tpz[3]);
  }

  int xs = spline->x_stride;
  int ys = spline->y_stride;
  float dxInv = spline->x_grid.delta_inv;
  float dyInv = spline->y_grid.delta_inv;
  float dzInv = spline->z_grid.delta_inv;
#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
  /* Contract the 4x4x4 coefficient block along z.  Entry 4*i+j holds the
     result for x offset i and y offset j. */
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      const int m = 4*i + j;
      cP[m]   = (P(i,j,0)*c[0]+P(i,j,1)*c[1]+P(i,j,2)*c[2]+P(i,j,3)*c[3]);
      dcP[m]  = (P(i,j,0)*dc[0]+P(i,j,1)*dc[1]+P(i,j,2)*dc[2]+P(i,j,3)*dc[3]);
      d2cP[m] = (P(i,j,0)*d2c[0]+P(i,j,1)*d2c[1]+P(i,j,2)*d2c[2]+P(i,j,3)*d2c[3]);
    }
  }

  /* Contract along y, leaving one entry per x offset. */
  for (int i = 0; i < 4; i++) {
    const int r = 4*i;
    bcP[i]   = ( b[0]*cP[r+0] + b[1]*cP[r+1] + b[2]*cP[r+2] + b[3]*cP[r+3]);
    dbcP[i]  = ( db[0]*cP[r+0] + db[1]*cP[r+1] + db[2]*cP[r+2] + db[3]*cP[r+3]);
    bdcP[i]  = ( b[0]*dcP[r+0] + b[1]*dcP[r+1] + b[2]*dcP[r+2] + b[3]*dcP[r+3]);
    bd2cP[i] = ( b[0]*d2cP[r+0] + b[1]*d2cP[r+1] + b[2]*d2cP[r+2] + b[3]*d2cP[r+3]);
    d2bcP[i] = ( d2b[0]*cP[r+0] + d2b[1]*cP[r+1] + d2b[2]*cP[r+2] + d2b[3]*cP[r+3]);
    dbdcP[i] = ( db[0]*dcP[r+0] + db[1]*dcP[r+1] + db[2]*dcP[r+2] + db[3]*dcP[r+3]);
  }

  /* Final contraction along x.  Each derivative picks up one factor of
     the matching delta_inv per differentiation. */
  *val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
  grad[0] = dxInv *
    (da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
  grad[1] = dyInv *
    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
  grad[2] = dzInv *
    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
  // d2x
  hess[0] = dxInv * dxInv *
    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
  // dx dy
  hess[1] = dxInv * dyInv *
    (da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
  hess[3] = hess[1];
  // dx dz;
  hess[2] = dxInv * dzInv *
    (da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
  hess[6] = hess[2];
  // d2y
  hess[4] = dyInv * dyInv *
    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
  // dy dz
  hess[5] = dyInv * dzInv *
    (a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
  hess[7] = hess[5];
  // d2z
  hess[8] = dzInv * dzInv *
    (a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
#undef P
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,932 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_EVAL_STD_D_H
#define BSPLINE_EVAL_STD_D_H
#include <math.h>
#include <stdio.h>
extern const double* restrict Ad;
extern const double* restrict dAd;
extern const double* restrict d2Ad;
/************************************************************/
/* 1D double-precision, real evaluation functions */
/************************************************************/
/* Value only */
/*
 * Evaluates a 1D uniform B-spline with double-precision real coefficients.
 *
 *   spline  spline object; reads x_grid.start, x_grid.delta_inv and coefs.
 *   x       evaluation point.
 *   val     out: spline value.
 *
 * No range check is made on x here.
 */
inline void
eval_UBspline_1d_d (UBspline_1d_d * restrict spline,
                    double x, double* restrict val)
{
  /* Offset from the grid start, measured in grid spacings. */
  const double u = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
  double cell;
  const double frac = modf (u, &cell);
  const int i0 = (int) cell;
  /* Powers of the fractional part: t^3, t^2, t, 1. */
  const double pw[4] = { frac*frac*frac, frac*frac, frac, 1.0 };
  double* restrict coefs = spline->coefs;

  /* Weight of each of the four coefficients coefs[i0..i0+3]. */
  double w[4];
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    w[n] = (Ad[r+0]*pw[0] + Ad[r+1]*pw[1] + Ad[r+2]*pw[2] + Ad[r+3]*pw[3]);
  }

  *val = (coefs[i0+0]*w[0] + coefs[i0+1]*w[1] +
          coefs[i0+2]*w[2] + coefs[i0+3]*w[3]);
}
/* Value and first derivative */
/*
 * Evaluates a 1D uniform B-spline (double-precision real coefficients)
 * together with its first derivative.
 *
 *   spline  spline object; reads x_grid.start, x_grid.delta_inv and coefs.
 *   x       evaluation point.
 *   val     out: spline value.
 *   grad    out: first derivative, scaled by x_grid.delta_inv.
 */
inline void
eval_UBspline_1d_d_vg (UBspline_1d_d * restrict spline, double x,
                       double* restrict val, double* restrict grad)
{
  /* Offset from the grid start, measured in grid spacings. */
  const double u = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
  double cell;
  const double frac = modf (u, &cell);
  const int i0 = (int) cell;
  /* Powers of the fractional part: t^3, t^2, t, 1. */
  const double pw[4] = { frac*frac*frac, frac*frac, frac, 1.0 };
  double* restrict coefs = spline->coefs;

  /* Value weights and derivative weights for coefs[i0..i0+3]; the
     derivative rows do not use the t^3 column. */
  double w[4], dw[4];
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    w[n]  = ( Ad[r+0]*pw[0] +  Ad[r+1]*pw[1] +  Ad[r+2]*pw[2] +  Ad[r+3]*pw[3]);
    dw[n] = (dAd[r+1]*pw[1] + dAd[r+2]*pw[2] + dAd[r+3]*pw[3]);
  }

  *val = (coefs[i0+0]*w[0] + coefs[i0+1]*w[1] +
          coefs[i0+2]*w[2] + coefs[i0+3]*w[3]);
  *grad = spline->x_grid.delta_inv *
    (coefs[i0+0]*dw[0] + coefs[i0+1]*dw[1] +
     coefs[i0+2]*dw[2] + coefs[i0+3]*dw[3]);
}
/* Value, first derivative, and second derivative */
/*
 * Evaluates a 1D uniform B-spline (double-precision real coefficients)
 * together with its first and second derivatives.
 *
 *   spline  spline object; reads x_grid.start, x_grid.delta_inv and coefs.
 *   x       evaluation point.
 *   val     out: spline value.
 *   grad    out: first derivative, scaled by x_grid.delta_inv.
 *   lapl    out: second derivative, scaled by x_grid.delta_inv squared.
 */
inline void
eval_UBspline_1d_d_vgl (UBspline_1d_d * restrict spline, double x,
                        double* restrict val, double* restrict grad,
                        double* restrict lapl)
{
  /* Offset from the grid start, measured in grid spacings. */
  const double u = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
  double cell;
  const double frac = modf (u, &cell);
  const int i0 = (int) cell;
  /* Powers of the fractional part: t^3, t^2, t, 1. */
  const double pw[4] = { frac*frac*frac, frac*frac, frac, 1.0 };
  double* restrict coefs = spline->coefs;

  /* Value, first-derivative and second-derivative weights for
     coefs[i0..i0+3]. */
  double w[4], dw[4], d2w[4];
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    w[n]   = (  Ad[r+0]*pw[0] +   Ad[r+1]*pw[1] +   Ad[r+2]*pw[2] +   Ad[r+3]*pw[3]);
    dw[n]  = ( dAd[r+1]*pw[1] +  dAd[r+2]*pw[2] +  dAd[r+3]*pw[3]);
    d2w[n] = (d2Ad[r+2]*pw[2] + d2Ad[r+3]*pw[3]);
  }

  *val = (coefs[i0+0]*w[0] + coefs[i0+1]*w[1] +
          coefs[i0+2]*w[2] + coefs[i0+3]*w[3]);
  *grad = spline->x_grid.delta_inv *
    (coefs[i0+0]*dw[0] + coefs[i0+1]*dw[1] +
     coefs[i0+2]*dw[2] + coefs[i0+3]*dw[3]);
  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (coefs[i0+0]*d2w[0] + coefs[i0+1]*d2w[1] +
     coefs[i0+2]*d2w[2] + coefs[i0+3]*d2w[3]);
}
/* Value, first derivative, and Hessian.
 * This forwards directly to eval_UBspline_1d_d_vgl, passing hess in the
 * position of that routine's second-derivative output, so hess receives a
 * single value. */
inline void
eval_UBspline_1d_d_vgh (UBspline_1d_d * restrict spline, double x,
double* restrict val, double* restrict grad,
double* restrict hess)
{
eval_UBspline_1d_d_vgl (spline, x, val, grad, hess);
}
/************************************************************/
/* 2D double-precision, real evaluation functions */
/************************************************************/
/* Value only */
/*
 * Evaluates a 2D uniform B-spline with double-precision real coefficients.
 *
 *   spline  spline object; reads x_grid/y_grid (start, delta_inv),
 *           x_stride and coefs.
 *   x, y    evaluation point.
 *   val     out: spline value.
 *
 * No range check is made on x or y here.
 */
inline void
eval_UBspline_2d_d (UBspline_2d_d * restrict spline,
                    double x, double y, double* restrict val)
{
  /* Offsets from the grid start, measured in grid spacings. */
  const double ux = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
  const double uy = (y - spline->y_grid.start)*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;
  /* Powers of the fractional parts: t^3, t^2, t, 1. */
  const double px[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double py[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  double* restrict coefs = spline->coefs;

  /* Basis weights along x (a) and y (b). */
  double a[4], b[4];
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    a[n] = (Ad[r+0]*px[0] + Ad[r+1]*px[1] + Ad[r+2]*px[2] + Ad[r+3]*px[3]);
    b[n] = (Ad[r+0]*py[0] + Ad[r+1]*py[1] + Ad[r+2]*py[2] + Ad[r+3]*py[3]);
  }

  const int xs = spline->x_stride;
#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
  /* Contract each of the four coefficient rows with the y weights. */
  double rowb[4];
  for (int i = 0; i < 4; i++)
    rowb[i] = (C(i,0)*b[0]+C(i,1)*b[1]+C(i,2)*b[2]+C(i,3)*b[3]);

  *val = (a[0]*rowb[0] + a[1]*rowb[1] + a[2]*rowb[2] + a[3]*rowb[3]);
#undef C
}
/* Value and gradient */
/*
 * Evaluates a 2D uniform B-spline (double-precision real coefficients)
 * together with its gradient.
 *
 *   spline  spline object; reads x_grid/y_grid (start, delta_inv),
 *           x_stride and coefs.
 *   x, y    evaluation point.
 *   val     out: spline value.
 *   grad    out: 2 entries, the x and y first derivatives, each scaled by
 *           the matching delta_inv.
 */
inline void
eval_UBspline_2d_d_vg (UBspline_2d_d * restrict spline,
                       double x, double y,
                       double* restrict val, double* restrict grad)
{
  /* Offsets from the grid start, measured in grid spacings. */
  const double ux = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
  const double uy = (y - spline->y_grid.start)*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;
  /* Powers of the fractional parts: t^3, t^2, t, 1. */
  const double px[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double py[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  double* restrict coefs = spline->coefs;

  /* Basis weights and first-derivative weights along x and y. */
  double a[4], b[4], da[4], db[4];
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    a[n]  = ( Ad[r+0]*px[0] +  Ad[r+1]*px[1] +  Ad[r+2]*px[2] +  Ad[r+3]*px[3]);
    da[n] = (dAd[r+1]*px[1] + dAd[r+2]*px[2] + dAd[r+3]*px[3]);
    b[n]  = ( Ad[r+0]*py[0] +  Ad[r+1]*py[1] +  Ad[r+2]*py[2] +  Ad[r+3]*py[3]);
    db[n] = (dAd[r+1]*py[1] + dAd[r+2]*py[2] + dAd[r+3]*py[3]);
  }

  const int xs = spline->x_stride;
#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
  /* Contract each coefficient row with the y weights and with the y
     derivative weights. */
  double rowb[4], rowdb[4];
  for (int i = 0; i < 4; i++) {
    rowb[i]  = (C(i,0)*b[0]+C(i,1)*b[1]+C(i,2)*b[2]+C(i,3)*b[3]);
    rowdb[i] = (C(i,0)*db[0]+C(i,1)*db[1]+C(i,2)*db[2]+C(i,3)*db[3]);
  }

  *val =
    (a[0]*rowb[0] + a[1]*rowb[1] + a[2]*rowb[2] + a[3]*rowb[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowb[0] + da[1]*rowb[1] + da[2]*rowb[2] + da[3]*rowb[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowdb[0] + a[1]*rowdb[1] + a[2]*rowdb[2] + a[3]*rowdb[3]);
#undef C
}
/* Value, gradient, and laplacian */
/*
 * Evaluates a 2D uniform B-spline (double-precision real coefficients)
 * together with its gradient and Laplacian.
 *
 *   spline  spline object; reads x_grid/y_grid (start, delta_inv),
 *           x_stride and coefs.
 *   x, y    evaluation point.
 *   val     out: spline value.
 *   grad    out: 2 entries, the x and y first derivatives.
 *   lapl    out: sum of the y and x second derivatives, each scaled by
 *           the square of the matching delta_inv.
 */
inline void
eval_UBspline_2d_d_vgl (UBspline_2d_d * restrict spline,
                        double x, double y, double* restrict val,
                        double* restrict grad, double* restrict lapl)
{
  /* Offsets from the grid start, measured in grid spacings. */
  const double ux = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
  const double uy = (y - spline->y_grid.start)*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;
  /* Powers of the fractional parts: t^3, t^2, t, 1. */
  const double px[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double py[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  double* restrict coefs = spline->coefs;

  /* Basis weights and their first/second derivatives along x and y. */
  double a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    a[n]   = (  Ad[r+0]*px[0] +   Ad[r+1]*px[1] +   Ad[r+2]*px[2] +   Ad[r+3]*px[3]);
    da[n]  = ( dAd[r+1]*px[1] +  dAd[r+2]*px[2] +  dAd[r+3]*px[3]);
    d2a[n] = (d2Ad[r+2]*px[2] + d2Ad[r+3]*px[3]);
    b[n]   = (  Ad[r+0]*py[0] +   Ad[r+1]*py[1] +   Ad[r+2]*py[2] +   Ad[r+3]*py[3]);
    db[n]  = ( dAd[r+1]*py[1] +  dAd[r+2]*py[2] +  dAd[r+3]*py[3]);
    d2b[n] = (d2Ad[r+2]*py[2] + d2Ad[r+3]*py[3]);
  }

  const int xs = spline->x_stride;
#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
  /* Contract each coefficient row with the y weights and their first
     and second derivatives. */
  double rowb[4], rowdb[4], rowd2b[4];
  for (int i = 0; i < 4; i++) {
    rowb[i]   = (C(i,0)*b[0]+C(i,1)*b[1]+C(i,2)*b[2]+C(i,3)*b[3]);
    rowdb[i]  = (C(i,0)*db[0]+C(i,1)*db[1]+C(i,2)*db[2]+C(i,3)*db[3]);
    rowd2b[i] = (C(i,0)*d2b[0]+C(i,1)*d2b[1]+C(i,2)*d2b[2]+C(i,3)*d2b[3]);
  }

  *val =
    (a[0]*rowb[0] + a[1]*rowb[1] + a[2]*rowb[2] + a[3]*rowb[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowb[0] + da[1]*rowb[1] + da[2]*rowb[2] + da[3]*rowb[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowdb[0] + a[1]*rowdb[1] + a[2]*rowdb[2] + a[3]*rowdb[3]);
  *lapl =
    spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*rowd2b[0] + a[1]*rowd2b[1] + a[2]*rowd2b[2] + a[3]*rowd2b[3]) +
    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*rowb[0] + d2a[1]*rowb[1] + d2a[2]*rowb[2] + d2a[3]*rowb[3]);
#undef C
}
/* Value, gradient, and Hessian */
/*
 * Evaluates a 2D uniform B-spline (double-precision real coefficients)
 * together with its gradient and Hessian.
 *
 *   spline  spline object; reads x_grid/y_grid (start, delta_inv),
 *           x_stride and coefs.
 *   x, y    evaluation point.
 *   val     out: spline value.
 *   grad    out: 2 entries, the x and y first derivatives.
 *   hess    out: 4 entries laid out as a row-major 2x2 matrix
 *           (xx xy / yx yy); the yx entry is copied from xy.
 */
inline void
eval_UBspline_2d_d_vgh (UBspline_2d_d * restrict spline,
                        double x, double y, double* restrict val,
                        double* restrict grad, double* restrict hess)
{
  /* Offsets from the grid start, measured in grid spacings. */
  const double ux = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
  const double uy = (y - spline->y_grid.start)*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;
  /* Powers of the fractional parts: t^3, t^2, t, 1. */
  const double px[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double py[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  double* restrict coefs = spline->coefs;

  /* Basis weights and their first/second derivatives along x and y. */
  double a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int n = 0; n < 4; n++) {
    const int r = 4*n;
    a[n]   = (  Ad[r+0]*px[0] +   Ad[r+1]*px[1] +   Ad[r+2]*px[2] +   Ad[r+3]*px[3]);
    da[n]  = ( dAd[r+1]*px[1] +  dAd[r+2]*px[2] +  dAd[r+3]*px[3]);
    d2a[n] = (d2Ad[r+2]*px[2] + d2Ad[r+3]*px[3]);
    b[n]   = (  Ad[r+0]*py[0] +   Ad[r+1]*py[1] +   Ad[r+2]*py[2] +   Ad[r+3]*py[3]);
    db[n]  = ( dAd[r+1]*py[1] +  dAd[r+2]*py[2] +  dAd[r+3]*py[3]);
    d2b[n] = (d2Ad[r+2]*py[2] + d2Ad[r+3]*py[3]);
  }

  const int xs = spline->x_stride;
#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
  /* Contract each coefficient row with the y weights and their first
     and second derivatives. */
  double rowb[4], rowdb[4], rowd2b[4];
  for (int i = 0; i < 4; i++) {
    rowb[i]   = (C(i,0)*b[0]+C(i,1)*b[1]+C(i,2)*b[2]+C(i,3)*b[3]);
    rowdb[i]  = (C(i,0)*db[0]+C(i,1)*db[1]+C(i,2)*db[2]+C(i,3)*db[3]);
    rowd2b[i] = (C(i,0)*d2b[0]+C(i,1)*d2b[1]+C(i,2)*d2b[2]+C(i,3)*d2b[3]);
  }

  *val =
    (a[0]*rowb[0] + a[1]*rowb[1] + a[2]*rowb[2] + a[3]*rowb[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowb[0] + da[1]*rowb[1] + da[2]*rowb[2] + da[3]*rowb[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowdb[0] + a[1]*rowdb[1] + a[2]*rowdb[2] + a[3]*rowdb[3]);
  /* xx */
  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*rowb[0] + d2a[1]*rowb[1] + d2a[2]*rowb[2] + d2a[3]*rowb[3]);
  /* xy */
  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
    (da[0]*rowdb[0] + da[1]*rowdb[1] + da[2]*rowdb[2] + da[3]*rowdb[3]);
  /* yy */
  hess[3] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*rowd2b[0] + a[1]*rowd2b[1] + a[2]*rowd2b[2] + a[3]*rowd2b[3]);
  /* yx mirrors xy */
  hess[2] = hess[1];
#undef C
}
/************************************************************/
/* 3D double-precision, real evaluation functions */
/************************************************************/
/* Value only.
 *
 * Evaluates the 3D double-precision spline at (x, y, z) and writes the
 * result to *val.  The point is shifted by the grid start and scaled by
 * delta_inv in each direction; the integer part selects the first of four
 * consecutive coefficients per direction and the fractional part feeds
 * the cubic basis weights built from the 16-entry table Ad.
 * The cell indices are used as computed: no range check is done here. */
inline void
eval_UBspline_3d_d (UBspline_3d_d * restrict spline,
                    double x, double y, double z,
                    double* restrict val)
{
  /* Position in units of the grid spacing, measured from the grid start. */
  double ux = (x - spline->x_grid.start) * spline->x_grid.delta_inv;
  double uy = (y - spline->y_grid.start) * spline->y_grid.delta_inv;
  double uz = (z - spline->z_grid.start) * spline->z_grid.delta_inv;

  /* Split into cell index (integer part) and offset within the cell. */
  double cellx, celly, cellz;
  double tx = modf (ux, &cellx);
  double ty = modf (uy, &celly);
  double tz = modf (uz, &cellz);
  int ix = (int) cellx;
  int iy = (int) celly;
  int iz = (int) cellz;

  /* Powers of the offsets, highest power first: t^3, t^2, t, 1. */
  double px[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  double py[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  double pz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Basis weights: weight r uses Ad entries 4r .. 4r+3. */
  double wx[4], wy[4], wz[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    wx[r] = (Ad[q]*px[0] + Ad[q+1]*px[1] + Ad[q+2]*px[2] + Ad[q+3]*px[3]);
    wy[r] = (Ad[q]*py[0] + Ad[q+1]*py[1] + Ad[q+2]*py[2] + Ad[q+3]*py[3]);
    wz[r] = (Ad[q]*pz[0] + Ad[q+1]*pz[1] + Ad[q+2]*pz[2] + Ad[q+3]*pz[3]);
  }

  double* restrict coefs = spline->coefs;
  int xs = spline->x_stride;
  int ys = spline->y_stride;

  /* Contract z first, then y, then x.  The summation order matches the
   * fully written-out expression term for term. */
  double slab[4];
  for (int i = 0; i < 4; i++) {
    double line[4];
    for (int j = 0; j < 4; j++) {
      int base = (ix+i)*xs + (iy+j)*ys + iz;
      line[j] = (coefs[base  ]*wz[0] + coefs[base+1]*wz[1] +
                 coefs[base+2]*wz[2] + coefs[base+3]*wz[3]);
    }
    slab[i] = (wy[0]*line[0] + wy[1]*line[1] + wy[2]*line[2] + wy[3]*line[3]);
  }
  *val = (wx[0]*slab[0] + wx[1]*slab[1] + wx[2]*slab[2] + wx[3]*slab[3]);
}
/* Value and gradient.
 *
 * Evaluates the 3D double-precision spline at (x, y, z).
 *   val  - receives the spline value.
 *   grad - receives three entries: the x, y and z first derivatives, each
 *          multiplied by the corresponding grid delta_inv.
 * Value weights come from the table Ad, first-derivative weights from dAd
 * (entry 4r+0 of dAd is never read).  No range check is done on the cell
 * indices. */
inline void
eval_UBspline_3d_d_vg (UBspline_3d_d * restrict spline,
                       double x, double y, double z,
                       double* restrict val, double* restrict grad)
{
  /* Position in units of the grid spacing, measured from the grid start. */
  double ux = (x - spline->x_grid.start) * spline->x_grid.delta_inv;
  double uy = (y - spline->y_grid.start) * spline->y_grid.delta_inv;
  double uz = (z - spline->z_grid.start) * spline->z_grid.delta_inv;

  /* Split into cell index (integer part) and offset within the cell. */
  double cellx, celly, cellz;
  double tx = modf (ux, &cellx);
  double ty = modf (uy, &celly);
  double tz = modf (uz, &cellz);
  int ix = (int) cellx;
  int iy = (int) celly;
  int iz = (int) cellz;

  /* Powers of the offsets, highest power first: t^3, t^2, t, 1. */
  double px[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  double py[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  double pz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Value (w*) and first-derivative (dw*) basis weights per direction. */
  double wx[4], wy[4], wz[4], dwx[4], dwy[4], dwz[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    wx[r]  = ( Ad[q]*px[0] +  Ad[q+1]*px[1] +  Ad[q+2]*px[2] +  Ad[q+3]*px[3]);
    dwx[r] = (               dAd[q+1]*px[1] + dAd[q+2]*px[2] + dAd[q+3]*px[3]);
    wy[r]  = ( Ad[q]*py[0] +  Ad[q+1]*py[1] +  Ad[q+2]*py[2] +  Ad[q+3]*py[3]);
    dwy[r] = (               dAd[q+1]*py[1] + dAd[q+2]*py[2] + dAd[q+3]*py[3]);
    wz[r]  = ( Ad[q]*pz[0] +  Ad[q+1]*pz[1] +  Ad[q+2]*pz[2] +  Ad[q+3]*pz[3]);
    dwz[r] = (               dAd[q+1]*pz[1] + dAd[q+2]*pz[2] + dAd[q+3]*pz[3]);
  }

  double* restrict coefs = spline->coefs;
  int xs = spline->x_stride;
  int ys = spline->y_stride;

  /* For each x slab: contract z with the value and derivative weights,
   * then contract y.  bc = value in y and z, dbc = d/dy, bdc = d/dz. */
  double bc[4], dbc[4], bdc[4];
  for (int i = 0; i < 4; i++) {
    double zc[4], zdc[4];
    for (int j = 0; j < 4; j++) {
      int base = (ix+i)*xs + (iy+j)*ys + iz;
      double p0 = coefs[base], p1 = coefs[base+1],
             p2 = coefs[base+2], p3 = coefs[base+3];
      zc[j]  = (p0* wz[0] + p1* wz[1] + p2* wz[2] + p3* wz[3]);
      zdc[j] = (p0*dwz[0] + p1*dwz[1] + p2*dwz[2] + p3*dwz[3]);
    }
    bc[i]  = ( wy[0]*zc[0]  +  wy[1]*zc[1]  +  wy[2]*zc[2]  +  wy[3]*zc[3]);
    dbc[i] = (dwy[0]*zc[0]  + dwy[1]*zc[1]  + dwy[2]*zc[2]  + dwy[3]*zc[3]);
    bdc[i] = ( wy[0]*zdc[0] +  wy[1]*zdc[1] +  wy[2]*zdc[2] +  wy[3]*zdc[3]);
  }

  *val    = ( wx[0]*bc[0] + wx[1]*bc[1] + wx[2]*bc[2] + wx[3]*bc[3]);
  grad[0] = spline->x_grid.delta_inv *
    (dwx[0]*bc[0] + dwx[1]*bc[1] + dwx[2]*bc[2] + dwx[3]*bc[3]);
  grad[1] = spline->y_grid.delta_inv *
    (wx[0]*dbc[0] + wx[1]*dbc[1] + wx[2]*dbc[2] + wx[3]*dbc[3]);
  grad[2] = spline->z_grid.delta_inv *
    (wx[0]*bdc[0] + wx[1]*bdc[1] + wx[2]*bdc[2] + wx[3]*bdc[3]);
}
/* Value, gradient, and Laplacian.
 *
 * Evaluates the 3D double-precision spline at (x, y, z).
 *   val  - receives the spline value.
 *   grad - receives three entries: x, y and z first derivatives, each
 *          multiplied by the corresponding grid delta_inv.
 *   lapl - receives the sum of the three unmixed second derivatives, each
 *          multiplied by the square of the corresponding delta_inv.
 * Weights come from the tables Ad (value), dAd (first derivative; entry
 * 4r+0 unused) and d2Ad (second derivative; entries 4r+0 and 4r+1 unused).
 * No range check is done on the cell indices. */
inline void
eval_UBspline_3d_d_vgl (UBspline_3d_d * restrict spline,
                        double x, double y, double z,
                        double* restrict val, double* restrict grad, double* restrict lapl)
{
  /* Position in units of the grid spacing, measured from the grid start. */
  double ux = (x - spline->x_grid.start) * spline->x_grid.delta_inv;
  double uy = (y - spline->y_grid.start) * spline->y_grid.delta_inv;
  double uz = (z - spline->z_grid.start) * spline->z_grid.delta_inv;

  /* Split into cell index (integer part) and offset within the cell. */
  double cellx, celly, cellz;
  double tx = modf (ux, &cellx);
  double ty = modf (uy, &celly);
  double tz = modf (uz, &cellz);
  int ix = (int) cellx;
  int iy = (int) celly;
  int iz = (int) cellz;

  /* Powers of the offsets, highest power first: t^3, t^2, t, 1. */
  double px[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  double py[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  double pz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Value (w*), first- (dw*) and second-derivative (d2w*) basis weights. */
  double wx[4], wy[4], wz[4], dwx[4], dwy[4], dwz[4], d2wx[4], d2wy[4], d2wz[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    wx[r]   = (  Ad[q]*px[0] +   Ad[q+1]*px[1] +   Ad[q+2]*px[2] +   Ad[q+3]*px[3]);
    dwx[r]  = (                 dAd[q+1]*px[1] +  dAd[q+2]*px[2] +  dAd[q+3]*px[3]);
    d2wx[r] = (                                  d2Ad[q+2]*px[2] + d2Ad[q+3]*px[3]);
    wy[r]   = (  Ad[q]*py[0] +   Ad[q+1]*py[1] +   Ad[q+2]*py[2] +   Ad[q+3]*py[3]);
    dwy[r]  = (                 dAd[q+1]*py[1] +  dAd[q+2]*py[2] +  dAd[q+3]*py[3]);
    d2wy[r] = (                                  d2Ad[q+2]*py[2] + d2Ad[q+3]*py[3]);
    wz[r]   = (  Ad[q]*pz[0] +   Ad[q+1]*pz[1] +   Ad[q+2]*pz[2] +   Ad[q+3]*pz[3]);
    dwz[r]  = (                 dAd[q+1]*pz[1] +  dAd[q+2]*pz[2] +  dAd[q+3]*pz[3]);
    d2wz[r] = (                                  d2Ad[q+2]*pz[2] + d2Ad[q+3]*pz[3]);
  }

  double* restrict coefs = spline->coefs;
  int xs = spline->x_stride;
  int ys = spline->y_stride;

  /* Per x slab: contract z with value / d / d2 weights, then contract y.
   * bc: value, dbc: d/dy, bdc: d/dz, d2bc: d2/dy2, bd2c: d2/dz2. */
  double bc[4], dbc[4], bdc[4], d2bc[4], bd2c[4];
  for (int i = 0; i < 4; i++) {
    double zc[4], zdc[4], zd2c[4];
    for (int j = 0; j < 4; j++) {
      int base = (ix+i)*xs + (iy+j)*ys + iz;
      double p0 = coefs[base], p1 = coefs[base+1],
             p2 = coefs[base+2], p3 = coefs[base+3];
      zc[j]   = (p0*  wz[0] + p1*  wz[1] + p2*  wz[2] + p3*  wz[3]);
      zdc[j]  = (p0* dwz[0] + p1* dwz[1] + p2* dwz[2] + p3* dwz[3]);
      zd2c[j] = (p0*d2wz[0] + p1*d2wz[1] + p2*d2wz[2] + p3*d2wz[3]);
    }
    bc[i]   = (  wy[0]*zc[0]   +   wy[1]*zc[1]   +   wy[2]*zc[2]   +   wy[3]*zc[3]);
    dbc[i]  = ( dwy[0]*zc[0]   +  dwy[1]*zc[1]   +  dwy[2]*zc[2]   +  dwy[3]*zc[3]);
    bdc[i]  = (  wy[0]*zdc[0]  +   wy[1]*zdc[1]  +   wy[2]*zdc[2]  +   wy[3]*zdc[3]);
    d2bc[i] = (d2wy[0]*zc[0]   + d2wy[1]*zc[1]   + d2wy[2]*zc[2]   + d2wy[3]*zc[3]);
    bd2c[i] = (  wy[0]*zd2c[0] +   wy[1]*zd2c[1] +   wy[2]*zd2c[2] +   wy[3]*zd2c[3]);
  }

  *val =
    ( wx[0]*bc[0] + wx[1]*bc[1] + wx[2]*bc[2] + wx[3]*bc[3]);
  grad[0] = spline->x_grid.delta_inv *
    (dwx[0]*bc[0] + dwx[1]*bc[1] + dwx[2]*bc[2] + dwx[3]*bc[3]);
  grad[1] = spline->y_grid.delta_inv *
    (wx[0]*dbc[0] + wx[1]*dbc[1] + wx[2]*dbc[2] + wx[3]*dbc[3]);
  grad[2] = spline->z_grid.delta_inv *
    (wx[0]*bdc[0] + wx[1]*bdc[1] + wx[2]*bdc[2] + wx[3]*bdc[3]);
  /* Summed as (xx + yy) + zz. */
  *lapl =
    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2wx[0]*bc[0] + d2wx[1]*bc[1] + d2wx[2]*bc[2] + d2wx[3]*bc[3])
    + spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (wx[0]*d2bc[0] + wx[1]*d2bc[1] + wx[2]*d2bc[2] + wx[3]*d2bc[3])
    + spline->z_grid.delta_inv * spline->z_grid.delta_inv *
    (wx[0]*bd2c[0] + wx[1]*bd2c[1] + wx[2]*bd2c[2] + wx[3]*bd2c[3]);
}
/* Value, gradient, and Hessian.
 *
 * Evaluates the 3D double-precision spline at (x, y, z).
 *   val  - receives the spline value.
 *   grad - receives three entries: x, y and z first derivatives, each
 *          multiplied by the corresponding grid delta_inv.
 *   hess - receives nine entries laid out as a symmetric 3x3 matrix:
 *            [0]=xx [1]=xy [2]=xz
 *            [3]=xy [4]=yy [5]=yz
 *            [6]=xz [7]=yz [8]=zz
 *          each multiplied by the product of the two matching delta_inv.
 * Weights come from the tables Ad (value), dAd (first derivative; entry
 * 4r+0 unused) and d2Ad (second derivative; entries 4r+0 and 4r+1 unused).
 *
 * The scaled coordinates are capped from above at (grid.num - 1.0e-5)
 * before the cell index is taken; nothing guards the lower end, so a
 * point below the grid start is not handled here. */
inline void
eval_UBspline_3d_d_vgh (UBspline_3d_d * restrict spline,
                        double x, double y, double z,
                        double* restrict val, double* restrict grad, double* restrict hess)
{
  x -= spline->x_grid.start;
  y -= spline->y_grid.start;
  z -= spline->z_grid.start;
  double ux = x*spline->x_grid.delta_inv;
  double uy = y*spline->y_grid.delta_inv;
  double uz = z*spline->z_grid.delta_inv;
  /* Upper cap only: keeps the integer part strictly below grid.num. */
  ux = fmin (ux, (double)(spline->x_grid.num)-1.0e-5);
  uy = fmin (uy, (double)(spline->y_grid.num)-1.0e-5);
  uz = fmin (uz, (double)(spline->z_grid.num)-1.0e-5);

  /* Split into cell index (integer part) and offset within the cell. */
  double ipartx, iparty, ipartz, tx, ty, tz;
  tx = modf (ux, &ipartx);  int ix = (int) ipartx;
  ty = modf (uy, &iparty);  int iy = (int) iparty;
  tz = modf (uz, &ipartz);  int iz = (int) ipartz;

  /* Powers of the offsets, highest power first: t^3, t^2, t, 1. */
  double tpx[4], tpy[4], tpz[4];
  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;

  /* Value (a,b,c), first-derivative (da,db,dc) and second-derivative
   * (d2a,d2b,d2c) basis weights for x, y and z respectively. */
  double a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    a[r]   = (  Ad[q]*tpx[0] +   Ad[q+1]*tpx[1] +   Ad[q+2]*tpx[2] +   Ad[q+3]*tpx[3]);
    da[r]  = (                  dAd[q+1]*tpx[1] +  dAd[q+2]*tpx[2] +  dAd[q+3]*tpx[3]);
    d2a[r] = (                                    d2Ad[q+2]*tpx[2] + d2Ad[q+3]*tpx[3]);
    b[r]   = (  Ad[q]*tpy[0] +   Ad[q+1]*tpy[1] +   Ad[q+2]*tpy[2] +   Ad[q+3]*tpy[3]);
    db[r]  = (                  dAd[q+1]*tpy[1] +  dAd[q+2]*tpy[2] +  dAd[q+3]*tpy[3]);
    d2b[r] = (                                    d2Ad[q+2]*tpy[2] + d2Ad[q+3]*tpy[3]);
    c[r]   = (  Ad[q]*tpz[0] +   Ad[q+1]*tpz[1] +   Ad[q+2]*tpz[2] +   Ad[q+3]*tpz[3]);
    dc[r]  = (                  dAd[q+1]*tpz[1] +  dAd[q+2]*tpz[2] +  dAd[q+3]*tpz[3]);
    d2c[r] = (                                    d2Ad[q+2]*tpz[2] + d2Ad[q+3]*tpz[3]);
  }

  double* restrict coefs = spline->coefs;
  int xs = spline->x_stride;
  int ys = spline->y_stride;

  /* Per x slab: contract z with value / d / d2 weights, then contract y.
   * Names follow "<y-weight><z-weight>P": e.g. dbdcP is d/dy d/dz. */
  double bcP[4], dbcP[4], d2bcP[4], bdcP[4], dbdcP[4], bd2cP[4];
  for (int i = 0; i < 4; i++) {
    double cP[4], dcP[4], d2cP[4];
    for (int j = 0; j < 4; j++) {
      int base = (ix+i)*xs + (iy+j)*ys + iz;
      double p0 = coefs[base], p1 = coefs[base+1],
             p2 = coefs[base+2], p3 = coefs[base+3];
      cP[j]   = (p0*  c[0] + p1*  c[1] + p2*  c[2] + p3*  c[3]);
      dcP[j]  = (p0* dc[0] + p1* dc[1] + p2* dc[2] + p3* dc[3]);
      d2cP[j] = (p0*d2c[0] + p1*d2c[1] + p2*d2c[2] + p3*d2c[3]);
    }
    bcP[i]   = (  b[0]*cP[0]   +   b[1]*cP[1]   +   b[2]*cP[2]   +   b[3]*cP[3]);
    dbcP[i]  = ( db[0]*cP[0]   +  db[1]*cP[1]   +  db[2]*cP[2]   +  db[3]*cP[3]);
    bdcP[i]  = (  b[0]*dcP[0]  +   b[1]*dcP[1]  +   b[2]*dcP[2]  +   b[3]*dcP[3]);
    bd2cP[i] = (  b[0]*d2cP[0] +   b[1]*d2cP[1] +   b[2]*d2cP[2] +   b[3]*d2cP[3]);
    d2bcP[i] = (d2b[0]*cP[0]   + d2b[1]*cP[1]   + d2b[2]*cP[2]   + d2b[3]*cP[3]);
    dbdcP[i] = ( db[0]*dcP[0]  +  db[1]*dcP[1]  +  db[2]*dcP[2]  +  db[3]*dcP[3]);
  }

  *val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
  grad[0] = spline->x_grid.delta_inv *
    (da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
  grad[2] = spline->z_grid.delta_inv *
    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
  // d2x
  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
  // dx dy
  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
    (da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
  hess[3] = hess[1];
  // dx dz
  hess[2] = spline->x_grid.delta_inv * spline->z_grid.delta_inv *
    (da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
  hess[6] = hess[2];
  // d2y
  hess[4] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
  // dy dz
  hess[5] = spline->y_grid.delta_inv * spline->z_grid.delta_inv *
    (a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
  hess[7] = hess[5];
  // d2z
  hess[8] = spline->z_grid.delta_inv * spline->z_grid.delta_inv *
    (a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
}
#endif

View File

@ -0,0 +1,931 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_EVAL_STD_S_H
#define BSPLINE_EVAL_STD_S_H
#include <math.h>
#include <stdio.h>
extern const float* restrict Af;
extern const float* restrict dAf;
extern const float* restrict d2Af;
/************************************************************/
/* 1D single-precision, real evaluation functions */
/************************************************************/
/* Value only.
 *
 * Evaluates the 1D single-precision spline at x and writes the result to
 * *val.  x is shifted by the grid start and scaled by delta_inv; the
 * integer part picks the first of four consecutive coefficients and the
 * fractional part feeds the cubic basis weights built from the 16-entry
 * table Af.  No range check is done on the cell index. */
inline void
eval_UBspline_1d_s (UBspline_1d_s * restrict spline,
                    double x, float* restrict val)
{
  /* Position in units of the grid spacing, narrowed to float. */
  float u = (x - spline->x_grid.start) * spline->x_grid.delta_inv;

  /* Cell index and offset within the cell. */
  float cell;
  float t = modff (u, &cell);
  int i = (int) cell;

  /* Powers of the offset, highest first, then the four basis weights. */
  float pw[4] = { t*t*t, t*t, t, 1.0f };
  float w[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    w[r] = (Af[q]*pw[0] + Af[q+1]*pw[1] + Af[q+2]*pw[2] + Af[q+3]*pw[3]);
  }

  float* restrict coefs = spline->coefs;
  *val =
    (coefs[i+0]*w[0] + coefs[i+1]*w[1] + coefs[i+2]*w[2] + coefs[i+3]*w[3]);
}
/* Value and first derivative.
 *
 * Evaluates the 1D single-precision spline at x.
 *   val  - receives the spline value.
 *   grad - receives the first derivative, multiplied by the grid delta_inv.
 * Value weights come from Af, derivative weights from dAf (entry 4r+0 of
 * dAf is never read).  No range check is done on the cell index. */
inline void
eval_UBspline_1d_s_vg (UBspline_1d_s * restrict spline, double x,
                       float* restrict val, float* restrict grad)
{
  /* Position in units of the grid spacing, narrowed to float. */
  float u = (x - spline->x_grid.start) * spline->x_grid.delta_inv;

  /* Cell index and offset within the cell. */
  float cell;
  float t = modff (u, &cell);
  int i = (int) cell;

  /* Powers of the offset, highest first, then value/derivative weights. */
  float pw[4] = { t*t*t, t*t, t, 1.0f };
  float w[4], dw[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    w[r]  = ( Af[q]*pw[0] +  Af[q+1]*pw[1] +  Af[q+2]*pw[2] +  Af[q+3]*pw[3]);
    dw[r] = (               dAf[q+1]*pw[1] + dAf[q+2]*pw[2] + dAf[q+3]*pw[3]);
  }

  float* restrict coefs = spline->coefs;
  *val =
    (coefs[i+0]*w[0] + coefs[i+1]*w[1] + coefs[i+2]*w[2] + coefs[i+3]*w[3]);
  *grad = spline->x_grid.delta_inv *
    (coefs[i+0]*dw[0] + coefs[i+1]*dw[1] + coefs[i+2]*dw[2] + coefs[i+3]*dw[3]);
}
/* Value, first derivative, and second derivative.
 *
 * Evaluates the 1D single-precision spline at x.
 *   val  - receives the spline value.
 *   grad - receives the first derivative, multiplied by the grid delta_inv.
 *   lapl - receives the second derivative, multiplied by delta_inv squared.
 * Weights come from Af (value), dAf (first derivative; entry 4r+0 unused)
 * and d2Af (second derivative; entries 4r+0 and 4r+1 unused).  No range
 * check is done on the cell index. */
inline void
eval_UBspline_1d_s_vgl (UBspline_1d_s * restrict spline, double x,
                        float* restrict val, float* restrict grad,
                        float* restrict lapl)
{
  /* Position in units of the grid spacing, narrowed to float. */
  float u = (x - spline->x_grid.start) * spline->x_grid.delta_inv;

  /* Cell index and offset within the cell. */
  float cell;
  float t = modff (u, &cell);
  int i = (int) cell;

  /* Powers of the offset, highest first, then the three weight sets. */
  float pw[4] = { t*t*t, t*t, t, 1.0f };
  float w[4], dw[4], d2w[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    w[r]   = (  Af[q]*pw[0] +   Af[q+1]*pw[1] +   Af[q+2]*pw[2] +   Af[q+3]*pw[3]);
    dw[r]  = (                 dAf[q+1]*pw[1] +  dAf[q+2]*pw[2] +  dAf[q+3]*pw[3]);
    d2w[r] = (                                  d2Af[q+2]*pw[2] + d2Af[q+3]*pw[3]);
  }

  float* restrict coefs = spline->coefs;
  *val =
    (coefs[i+0]*w[0] + coefs[i+1]*w[1] + coefs[i+2]*w[2] + coefs[i+3]*w[3]);
  *grad = spline->x_grid.delta_inv *
    (coefs[i+0]*dw[0] + coefs[i+1]*dw[1] + coefs[i+2]*dw[2] + coefs[i+3]*dw[3]);
  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (coefs[i+0]*d2w[0] + coefs[i+1]*d2w[1] + coefs[i+2]*d2w[2] + coefs[i+3]*d2w[3]);
}
/* Value, first derivative, and second derivative ("Hessian" entry point).
 * Forwards every argument unchanged to eval_UBspline_1d_s_vgl, passing
 * hess where that routine takes lapl, so *hess receives whatever the vgl
 * routine stores through its last argument. */
inline void
eval_UBspline_1d_s_vgh (UBspline_1d_s * restrict spline, double x,
float* restrict val, float* restrict grad,
float* restrict hess)
{
eval_UBspline_1d_s_vgl (spline, x, val, grad, hess);
}
/************************************************************/
/* 2D single-precision, real evaluation functions */
/************************************************************/
/* Value only.
 *
 * Evaluates the 2D single-precision spline at (x, y) and writes the result
 * to *val.  Each coordinate is shifted by its grid start and scaled by its
 * delta_inv; the integer parts pick a 4x4 patch of coefficients (x index
 * multiplied by x_stride, y index contiguous) and the fractional parts
 * feed the cubic basis weights built from the 16-entry table Af.
 * No range check is done on the cell indices. */
inline void
eval_UBspline_2d_s (UBspline_2d_s * restrict spline,
                    double x, double y, float* restrict val)
{
  /* Position in units of the grid spacing, narrowed to float. */
  float ux = (x - spline->x_grid.start) * spline->x_grid.delta_inv;
  float uy = (y - spline->y_grid.start) * spline->y_grid.delta_inv;

  /* Cell indices and offsets within the cell. */
  float cellx, celly;
  float tx = modff (ux, &cellx);
  float ty = modff (uy, &celly);
  int ix = (int) cellx;
  int iy = (int) celly;

  /* Powers of the offsets, highest first, then the basis weights. */
  float px[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
  float py[4] = { ty*ty*ty, ty*ty, ty, 1.0f };
  float wx[4], wy[4];
  for (int r = 0; r < 4; r++) {
    int q = 4*r;
    wx[r] = (Af[q]*px[0] + Af[q+1]*px[1] + Af[q+2]*px[2] + Af[q+3]*px[3]);
    wy[r] = (Af[q]*py[0] + Af[q+1]*py[1] + Af[q+2]*py[2] + Af[q+3]*py[3]);
  }

  float* restrict coefs = spline->coefs;
  int xs = spline->x_stride;

  /* Contract y within each of the four x rows, then contract x. */
  float row[4];
  for (int i = 0; i < 4; i++) {
    int base = (ix+i)*xs + iy;
    row[i] = (coefs[base  ]*wy[0] + coefs[base+1]*wy[1] +
              coefs[base+2]*wy[2] + coefs[base+3]*wy[3]);
  }
  *val = (wx[0]*row[0] + wx[1]*row[1] + wx[2]*row[2] + wx[3]*row[3]);
}
/*
 * 2D single precision: value and gradient.
 *
 * Same cell lookup as the value-only routine: (ix, iy) come from the integer
 * parts of the grid-unit coordinates, and coefficient (i,j) of the 4x4 patch
 * is coefs[(ix+i)*x_stride + iy + j].
 *
 *   spline  spline object (grids, x_stride, coefficient array)
 *   x, y    evaluation point
 *   val     receives the spline value
 *   grad    two floats: grad[0] = d/dx (scaled by x_grid.delta_inv),
 *                       grad[1] = d/dy (scaled by y_grid.delta_inv)
 *
 * Af and dAf are defined outside this function. No range check is made on
 * ix/iy.
 */
inline void
eval_UBspline_2d_s_vg (UBspline_2d_s * restrict spline,
                       double x, double y,
                       float* restrict val, float* restrict grad)
{
  /* Grid-unit coordinates relative to the grid origin. */
  const double relx = x - spline->x_grid.start;
  const double rely = y - spline->y_grid.start;
  float ux = relx*spline->x_grid.delta_inv;
  float uy = rely*spline->y_grid.delta_inv;

  /* Cell indices and fractional offsets. */
  float wholex, wholey;
  float tx = modff (ux, &wholex);
  float ty = modff (uy, &wholey);
  int ix = (int) wholex;
  int iy = (int) wholey;

  /* Powers of the fractional offsets, highest first. */
  float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };

  /* Value and first-derivative weights along each axis. */
  float a[4], b[4], da[4], db[4];
  for (int n = 0; n < 4; n++) {
    int r = 4*n;
    a[n]  = ( Af[r+0]*tpx[0] +  Af[r+1]*tpx[1] +  Af[r+2]*tpx[2] +  Af[r+3]*tpx[3]);
    da[n] = (dAf[r+1]*tpx[1] + dAf[r+2]*tpx[2] + dAf[r+3]*tpx[3]);
    b[n]  = ( Af[r+0]*tpy[0] +  Af[r+1]*tpy[1] +  Af[r+2]*tpy[2] +  Af[r+3]*tpy[3]);
    db[n] = (dAf[r+1]*tpy[1] + dAf[r+2]*tpy[2] + dAf[r+3]*tpy[3]);
  }

  float* restrict coefs = spline->coefs;
  int xs = spline->x_stride;

  /* Contract each patch row with the y weights and their derivatives. */
  float rowb[4], rowdb[4];
  for (int i = 0; i < 4; i++) {
    int o = (ix+i)*xs + iy;
    rowb[i]  = (coefs[o+0]*b[0]  + coefs[o+1]*b[1]  + coefs[o+2]*b[2]  + coefs[o+3]*b[3]);
    rowdb[i] = (coefs[o+0]*db[0] + coefs[o+1]*db[1] + coefs[o+2]*db[2] + coefs[o+3]*db[3]);
  }

  *val =
    (a[0]*rowb[0] + a[1]*rowb[1] + a[2]*rowb[2] + a[3]*rowb[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowb[0] + da[1]*rowb[1] + da[2]*rowb[2] + da[3]*rowb[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowdb[0] + a[1]*rowdb[1] + a[2]*rowdb[2] + a[3]*rowdb[3]);
}
/*
 * 2D single precision: value, gradient, and Laplacian.
 *
 * (ix, iy) come from the integer parts of the grid-unit coordinates, and
 * coefficient (i,j) of the 4x4 patch is coefs[(ix+i)*x_stride + iy + j].
 *
 *   spline  spline object (grids, x_stride, coefficient array)
 *   x, y    evaluation point
 *   val     receives the spline value
 *   grad    two floats: d/dx and d/dy, scaled by the matching delta_inv
 *   lapl    receives d2/dy2 + d2/dx2, each term scaled by the square of the
 *           matching delta_inv
 *
 * Af, dAf and d2Af are defined outside this function. No range check is made
 * on ix/iy.
 */
inline void
eval_UBspline_2d_s_vgl (UBspline_2d_s * restrict spline,
                        double x, double y, float* restrict val,
                        float* restrict grad, float* restrict lapl)
{
  /* Grid-unit coordinates relative to the grid origin. */
  const double relx = x - spline->x_grid.start;
  const double rely = y - spline->y_grid.start;
  float ux = relx*spline->x_grid.delta_inv;
  float uy = rely*spline->y_grid.delta_inv;

  /* Cell indices and fractional offsets. */
  float wholex, wholey;
  float tx = modff (ux, &wholex);
  float ty = modff (uy, &wholey);
  int ix = (int) wholex;
  int iy = (int) wholey;

  /* Powers of the fractional offsets, highest first. */
  float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };

  /* Value, first- and second-derivative weights along each axis. */
  float a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int n = 0; n < 4; n++) {
    int r = 4*n;
    a[n]   = (  Af[r+0]*tpx[0] +   Af[r+1]*tpx[1] +   Af[r+2]*tpx[2] +   Af[r+3]*tpx[3]);
    da[n]  = ( dAf[r+1]*tpx[1] +  dAf[r+2]*tpx[2] +  dAf[r+3]*tpx[3]);
    d2a[n] = (d2Af[r+2]*tpx[2] + d2Af[r+3]*tpx[3]);
    b[n]   = (  Af[r+0]*tpy[0] +   Af[r+1]*tpy[1] +   Af[r+2]*tpy[2] +   Af[r+3]*tpy[3]);
    db[n]  = ( dAf[r+1]*tpy[1] +  dAf[r+2]*tpy[2] +  dAf[r+3]*tpy[3]);
    d2b[n] = (d2Af[r+2]*tpy[2] + d2Af[r+3]*tpy[3]);
  }

  float* restrict coefs = spline->coefs;
  int xs = spline->x_stride;

  /* Contract each patch row with the y weights and their derivatives. */
  float rowb[4], rowdb[4], rowd2b[4];
  for (int i = 0; i < 4; i++) {
    int o = (ix+i)*xs + iy;
    rowb[i]   = (coefs[o+0]*b[0]   + coefs[o+1]*b[1]   + coefs[o+2]*b[2]   + coefs[o+3]*b[3]);
    rowdb[i]  = (coefs[o+0]*db[0]  + coefs[o+1]*db[1]  + coefs[o+2]*db[2]  + coefs[o+3]*db[3]);
    rowd2b[i] = (coefs[o+0]*d2b[0] + coefs[o+1]*d2b[1] + coefs[o+2]*d2b[2] + coefs[o+3]*d2b[3]);
  }

  *val =
    (a[0]*rowb[0] + a[1]*rowb[1] + a[2]*rowb[2] + a[3]*rowb[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowb[0] + da[1]*rowb[1] + da[2]*rowb[2] + da[3]*rowb[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowdb[0] + a[1]*rowdb[1] + a[2]*rowdb[2] + a[3]*rowdb[3]);
  /* y contribution first, then x, matching the established summation order. */
  *lapl =
    spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*rowd2b[0] + a[1]*rowd2b[1] + a[2]*rowd2b[2] + a[3]*rowd2b[3]) +
    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*rowb[0] + d2a[1]*rowb[1] + d2a[2]*rowb[2] + d2a[3]*rowb[3]);
}
/*
 * 2D single precision: value, gradient, and Hessian.
 *
 * (ix, iy) come from the integer parts of the grid-unit coordinates, and
 * coefficient (i,j) of the 4x4 patch is coefs[(ix+i)*x_stride + iy + j].
 *
 *   spline  spline object (grids, x_stride, coefficient array)
 *   x, y    evaluation point
 *   val     receives the spline value
 *   grad    two floats: d/dx and d/dy, scaled by the matching delta_inv
 *   hess    four floats written as a 2x2 matrix:
 *             hess[0] = d2/dx2, hess[1] = hess[2] = d2/dxdy, hess[3] = d2/dy2
 *
 * Af, dAf and d2Af are defined outside this function. No range check is made
 * on ix/iy.
 */
inline void
eval_UBspline_2d_s_vgh (UBspline_2d_s * restrict spline,
                        double x, double y, float* restrict val,
                        float* restrict grad, float* restrict hess)
{
  /* Grid-unit coordinates relative to the grid origin. */
  const double relx = x - spline->x_grid.start;
  const double rely = y - spline->y_grid.start;
  float ux = relx*spline->x_grid.delta_inv;
  float uy = rely*spline->y_grid.delta_inv;

  /* Cell indices and fractional offsets. */
  float wholex, wholey;
  float tx = modff (ux, &wholex);
  float ty = modff (uy, &wholey);
  int ix = (int) wholex;
  int iy = (int) wholey;

  /* Powers of the fractional offsets, highest first. */
  float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };

  /* Value, first- and second-derivative weights along each axis. */
  float a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int n = 0; n < 4; n++) {
    int r = 4*n;
    a[n]   = (  Af[r+0]*tpx[0] +   Af[r+1]*tpx[1] +   Af[r+2]*tpx[2] +   Af[r+3]*tpx[3]);
    da[n]  = ( dAf[r+1]*tpx[1] +  dAf[r+2]*tpx[2] +  dAf[r+3]*tpx[3]);
    d2a[n] = (d2Af[r+2]*tpx[2] + d2Af[r+3]*tpx[3]);
    b[n]   = (  Af[r+0]*tpy[0] +   Af[r+1]*tpy[1] +   Af[r+2]*tpy[2] +   Af[r+3]*tpy[3]);
    db[n]  = ( dAf[r+1]*tpy[1] +  dAf[r+2]*tpy[2] +  dAf[r+3]*tpy[3]);
    d2b[n] = (d2Af[r+2]*tpy[2] + d2Af[r+3]*tpy[3]);
  }

  float* restrict coefs = spline->coefs;
  int xs = spline->x_stride;

  /* Contract each patch row with the y weights and their derivatives. */
  float rowb[4], rowdb[4], rowd2b[4];
  for (int i = 0; i < 4; i++) {
    int o = (ix+i)*xs + iy;
    rowb[i]   = (coefs[o+0]*b[0]   + coefs[o+1]*b[1]   + coefs[o+2]*b[2]   + coefs[o+3]*b[3]);
    rowdb[i]  = (coefs[o+0]*db[0]  + coefs[o+1]*db[1]  + coefs[o+2]*db[2]  + coefs[o+3]*db[3]);
    rowd2b[i] = (coefs[o+0]*d2b[0] + coefs[o+1]*d2b[1] + coefs[o+2]*d2b[2] + coefs[o+3]*d2b[3]);
  }

  *val =
    (a[0]*rowb[0] + a[1]*rowb[1] + a[2]*rowb[2] + a[3]*rowb[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowb[0] + da[1]*rowb[1] + da[2]*rowb[2] + da[3]*rowb[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowdb[0] + a[1]*rowdb[1] + a[2]*rowdb[2] + a[3]*rowdb[3]);
  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*rowb[0] + d2a[1]*rowb[1] + d2a[2]*rowb[2] + d2a[3]*rowb[3]);
  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
    (da[0]*rowdb[0] + da[1]*rowdb[1] + da[2]*rowdb[2] + da[3]*rowdb[3]);
  hess[3] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*rowd2b[0] + a[1]*rowd2b[1] + a[2]*rowd2b[2] + a[3]*rowd2b[3]);
  /* The mixed derivative is stored in both off-diagonal slots. */
  hess[2] = hess[1];
}
/************************************************************/
/* 3D single-precision, real evaluation functions */
/************************************************************/
/*
 * 3D single precision: value only.
 *
 * Each coordinate is shifted by its grid start and scaled by its delta_inv;
 * the integer parts (ix, iy, iz) locate a 4x4x4 patch of coefficients and
 * the fractional parts drive the per-axis weights a[] (x), b[] (y), c[] (z).
 * Coefficient (i,j,k) of the patch is
 *   coefs[(ix+i)*x_stride + (iy+j)*y_stride + (iz+k)].
 *
 *   spline   spline object (grids, strides, coefficient array)
 *   x, y, z  evaluation point
 *   val      receives sum_ijk a[i]*b[j]*c[k]*coef(i,j,k)
 *
 * Af is defined outside this function. No range check is made on ix/iy/iz.
 */
inline void
eval_UBspline_3d_s (UBspline_3d_s * restrict spline,
                    double x, double y, double z,
                    float* restrict val)
{
  /* Grid-unit coordinates relative to the grid origin. */
  const double relx = x - spline->x_grid.start;
  const double rely = y - spline->y_grid.start;
  const double relz = z - spline->z_grid.start;
  float ux = relx*spline->x_grid.delta_inv;
  float uy = rely*spline->y_grid.delta_inv;
  float uz = relz*spline->z_grid.delta_inv;

  /* Cell indices and fractional offsets. */
  float wholex, wholey, wholez;
  float tx = modff (ux, &wholex);
  float ty = modff (uy, &wholey);
  float tz = modff (uz, &wholez);
  int ix = (int) wholex;
  int iy = (int) wholey;
  int iz = (int) wholez;

  /* Powers of the fractional offsets, highest first. */
  float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  float tpz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Per-axis weights. */
  float a[4], b[4], c[4];
  for (int n = 0; n < 4; n++) {
    int r = 4*n;
    a[n] = (Af[r+0]*tpx[0] + Af[r+1]*tpx[1] + Af[r+2]*tpx[2] + Af[r+3]*tpx[3]);
    b[n] = (Af[r+0]*tpy[0] + Af[r+1]*tpy[1] + Af[r+2]*tpy[2] + Af[r+3]*tpy[3]);
    c[n] = (Af[r+0]*tpz[0] + Af[r+1]*tpz[1] + Af[r+2]*tpz[2] + Af[r+3]*tpz[3]);
  }

  float* restrict coefs = spline->coefs;
  int xs = spline->x_stride;
  int ys = spline->y_stride;

  /* Contract z first (cP), then y (bcP); x is applied in the final sum. */
  float cP[4], bcP[4];
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      int o = (ix+i)*xs + (iy+j)*ys + iz;
      cP[j] = (coefs[o+0]*c[0] + coefs[o+1]*c[1] + coefs[o+2]*c[2] + coefs[o+3]*c[3]);
    }
    bcP[i] = (b[0]*cP[0] + b[1]*cP[1] + b[2]*cP[2] + b[3]*cP[3]);
  }

  *val = (a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3]);
}
/*
 * 3D single precision: value and gradient.
 *
 * (ix, iy, iz) come from the integer parts of the grid-unit coordinates, and
 * coefficient (i,j,k) of the 4x4x4 patch is
 *   coefs[(ix+i)*x_stride + (iy+j)*y_stride + (iz+k)].
 *
 *   spline   spline object (grids, strides, coefficient array)
 *   x, y, z  evaluation point
 *   val      receives the spline value
 *   grad     three floats: d/dx, d/dy, d/dz, each scaled by the matching
 *            grid delta_inv
 *
 * Af and dAf are defined outside this function. No range check is made on
 * ix/iy/iz.
 */
inline void
eval_UBspline_3d_s_vg (UBspline_3d_s * restrict spline,
                       double x, double y, double z,
                       float* restrict val, float* restrict grad)
{
  /* Grid-unit coordinates relative to the grid origin. */
  const double relx = x - spline->x_grid.start;
  const double rely = y - spline->y_grid.start;
  const double relz = z - spline->z_grid.start;
  float ux = relx*spline->x_grid.delta_inv;
  float uy = rely*spline->y_grid.delta_inv;
  float uz = relz*spline->z_grid.delta_inv;

  /* Cell indices and fractional offsets. */
  float wholex, wholey, wholez;
  float tx = modff (ux, &wholex);
  float ty = modff (uy, &wholey);
  float tz = modff (uz, &wholez);
  int ix = (int) wholex;
  int iy = (int) wholey;
  int iz = (int) wholez;

  /* Powers of the fractional offsets, highest first. */
  float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  float tpz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Value and first-derivative weights along each axis. */
  float a[4], b[4], c[4], da[4], db[4], dc[4];
  for (int n = 0; n < 4; n++) {
    int r = 4*n;
    a[n]  = ( Af[r+0]*tpx[0] +  Af[r+1]*tpx[1] +  Af[r+2]*tpx[2] +  Af[r+3]*tpx[3]);
    da[n] = (dAf[r+1]*tpx[1] + dAf[r+2]*tpx[2] + dAf[r+3]*tpx[3]);
    b[n]  = ( Af[r+0]*tpy[0] +  Af[r+1]*tpy[1] +  Af[r+2]*tpy[2] +  Af[r+3]*tpy[3]);
    db[n] = (dAf[r+1]*tpy[1] + dAf[r+2]*tpy[2] + dAf[r+3]*tpy[3]);
    c[n]  = ( Af[r+0]*tpz[0] +  Af[r+1]*tpz[1] +  Af[r+2]*tpz[2] +  Af[r+3]*tpz[3]);
    dc[n] = (dAf[r+1]*tpz[1] + dAf[r+2]*tpz[2] + dAf[r+3]*tpz[3]);
  }

  float* restrict coefs = spline->coefs;
  int xs = spline->x_stride;
  int ys = spline->y_stride;

  /*
   * Contract z first (cP with c, dcP with dc), then y:
   *   bcP  = b .cP    dbcP = db.cP    bdcP = b .dcP
   */
  float cP[16], dcP[16], bcP[4], dbcP[4], bdcP[4];
  for (int i = 0; i < 4; i++) {
    int q = 4*i;
    for (int j = 0; j < 4; j++) {
      int o = (ix+i)*xs + (iy+j)*ys + iz;
      cP[q+j]  = (coefs[o+0]*c[0]  + coefs[o+1]*c[1]  + coefs[o+2]*c[2]  + coefs[o+3]*c[3]);
      dcP[q+j] = (coefs[o+0]*dc[0] + coefs[o+1]*dc[1] + coefs[o+2]*dc[2] + coefs[o+3]*dc[3]);
    }
    bcP[i]  = ( b[0]*cP[q+0]  +  b[1]*cP[q+1]  +  b[2]*cP[q+2]  +  b[3]*cP[q+3]);
    dbcP[i] = (db[0]*cP[q+0]  + db[1]*cP[q+1]  + db[2]*cP[q+2]  + db[3]*cP[q+3]);
    bdcP[i] = ( b[0]*dcP[q+0] +  b[1]*dcP[q+1] +  b[2]*dcP[q+2] +  b[3]*dcP[q+3]);
  }

  *val = (a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
  grad[2] = spline->z_grid.delta_inv *
    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
}
/*
 * 3D single precision: value, gradient, and Laplacian.
 *
 * (ix, iy, iz) come from the integer parts of the grid-unit coordinates, and
 * coefficient (i,j,k) of the 4x4x4 patch is
 *   coefs[(ix+i)*x_stride + (iy+j)*y_stride + (iz+k)].
 *
 *   spline   spline object (grids, strides, coefficient array)
 *   x, y, z  evaluation point
 *   val      receives the spline value
 *   grad     three floats: d/dx, d/dy, d/dz, each scaled by the matching
 *            grid delta_inv
 *   lapl     receives d2/dx2 + d2/dy2 + d2/dz2, each term scaled by the
 *            square of the matching grid delta_inv
 *
 * Af, dAf and d2Af are defined outside this function. No range check is made
 * on ix/iy/iz.
 */
inline void
eval_UBspline_3d_s_vgl (UBspline_3d_s * restrict spline,
                        double x, double y, double z,
                        float* restrict val, float* restrict grad, float* restrict lapl)
{
  /* Grid-unit coordinates relative to the grid origin. */
  const double relx = x - spline->x_grid.start;
  const double rely = y - spline->y_grid.start;
  const double relz = z - spline->z_grid.start;
  float ux = relx*spline->x_grid.delta_inv;
  float uy = rely*spline->y_grid.delta_inv;
  float uz = relz*spline->z_grid.delta_inv;

  /* Cell indices and fractional offsets. */
  float wholex, wholey, wholez;
  float tx = modff (ux, &wholex);
  float ty = modff (uy, &wholey);
  float tz = modff (uz, &wholez);
  int ix = (int) wholex;
  int iy = (int) wholey;
  int iz = (int) wholez;

  /* Powers of the fractional offsets, highest first. */
  float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  float tpz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Value, first- and second-derivative weights along each axis. */
  float a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
  for (int n = 0; n < 4; n++) {
    int r = 4*n;
    a[n]   = (  Af[r+0]*tpx[0] +   Af[r+1]*tpx[1] +   Af[r+2]*tpx[2] +   Af[r+3]*tpx[3]);
    da[n]  = ( dAf[r+1]*tpx[1] +  dAf[r+2]*tpx[2] +  dAf[r+3]*tpx[3]);
    d2a[n] = (d2Af[r+2]*tpx[2] + d2Af[r+3]*tpx[3]);
    b[n]   = (  Af[r+0]*tpy[0] +   Af[r+1]*tpy[1] +   Af[r+2]*tpy[2] +   Af[r+3]*tpy[3]);
    db[n]  = ( dAf[r+1]*tpy[1] +  dAf[r+2]*tpy[2] +  dAf[r+3]*tpy[3]);
    d2b[n] = (d2Af[r+2]*tpy[2] + d2Af[r+3]*tpy[3]);
    c[n]   = (  Af[r+0]*tpz[0] +   Af[r+1]*tpz[1] +   Af[r+2]*tpz[2] +   Af[r+3]*tpz[3]);
    dc[n]  = ( dAf[r+1]*tpz[1] +  dAf[r+2]*tpz[2] +  dAf[r+3]*tpz[3]);
    d2c[n] = (d2Af[r+2]*tpz[2] + d2Af[r+3]*tpz[3]);
  }

  float* restrict coefs = spline->coefs;
  int xs = spline->x_stride;
  int ys = spline->y_stride;

  /*
   * Contract z first (cP with c, dcP with dc, d2cP with d2c), then y:
   *   bcP   = b  .cP     dbcP  = db.cP     d2bcP = d2b.cP
   *   bdcP  = b  .dcP    bd2cP = b .d2cP
   * Every output below is then a 4-term contraction with a, da or d2a.
   */
  float cP[16], dcP[16], d2cP[16];
  float bcP[4], dbcP[4], d2bcP[4], bdcP[4], bd2cP[4];
  for (int i = 0; i < 4; i++) {
    int q = 4*i;
    for (int j = 0; j < 4; j++) {
      int o = (ix+i)*xs + (iy+j)*ys + iz;
      cP[q+j]   = (coefs[o+0]*c[0]   + coefs[o+1]*c[1]   + coefs[o+2]*c[2]   + coefs[o+3]*c[3]);
      dcP[q+j]  = (coefs[o+0]*dc[0]  + coefs[o+1]*dc[1]  + coefs[o+2]*dc[2]  + coefs[o+3]*dc[3]);
      d2cP[q+j] = (coefs[o+0]*d2c[0] + coefs[o+1]*d2c[1] + coefs[o+2]*d2c[2] + coefs[o+3]*d2c[3]);
    }
    bcP[i]   = (  b[0]*cP[q+0]   +   b[1]*cP[q+1]   +   b[2]*cP[q+2]   +   b[3]*cP[q+3]);
    dbcP[i]  = ( db[0]*cP[q+0]   +  db[1]*cP[q+1]   +  db[2]*cP[q+2]   +  db[3]*cP[q+3]);
    d2bcP[i] = (d2b[0]*cP[q+0]   + d2b[1]*cP[q+1]   + d2b[2]*cP[q+2]   + d2b[3]*cP[q+3]);
    bdcP[i]  = (  b[0]*dcP[q+0]  +   b[1]*dcP[q+1]  +   b[2]*dcP[q+2]  +   b[3]*dcP[q+3]);
    bd2cP[i] = (  b[0]*d2cP[q+0] +   b[1]*d2cP[q+1] +   b[2]*d2cP[q+2] +   b[3]*d2cP[q+3]);
  }

  *val =
    (a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
  grad[2] = spline->z_grid.delta_inv *
    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
  /* Sum of the three pure second derivatives, in x, y, z order. */
  *lapl =
    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3])
    + spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3])
    + spline->z_grid.delta_inv * spline->z_grid.delta_inv *
    (a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
}
/* Value, gradient, and Hessian */
inline void
eval_UBspline_3d_s_vgh (UBspline_3d_s * restrict spline,
double x, double y, double z,
float* restrict val, float* restrict grad, float* restrict hess)
{
x -= spline->x_grid.start;
y -= spline->y_grid.start;
z -= spline->z_grid.start;
float ux = x*spline->x_grid.delta_inv;
float uy = y*spline->y_grid.delta_inv;
float uz = z*spline->z_grid.delta_inv;
ux = fmin (ux, (double)(spline->x_grid.num)-1.0e-5);
uy = fmin (uy, (double)(spline->y_grid.num)-1.0e-5);
uz = fmin (uz, (double)(spline->z_grid.num)-1.0e-5);
float ipartx, iparty, ipartz, tx, ty, tz;
tx = modff (ux, &ipartx); int ix = (int) ipartx;
ty = modff (uy, &iparty); int iy = (int) iparty;
tz = modff (uz, &ipartz); int iz = (int) ipartz;
// if ((ix >= spline->x_grid.num)) x = spline->x_grid.num;
// if ((ix < 0)) x = 0;
// if ((iy >= spline->y_grid.num)) y = spline->y_grid.num;
// if ((iy < 0)) y = 0;
// if ((iz >= spline->z_grid.num)) z = spline->z_grid.num;
// if ((iz < 0)) z = 0;
float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4],
d2a[4], d2b[4], d2c[4], cP[16], dcP[16], d2cP[16], bcP[4], dbcP[4],
d2bcP[4], dbdcP[4], bd2cP[4], bdcP[4];
tpx[0] = tx*tx*tx; tpx[1] = tx*tx; tpx[2] = tx; tpx[3] = 1.0;
tpy[0] = ty*ty*ty; tpy[1] = ty*ty; tpy[2] = ty; tpy[3] = 1.0;
tpz[0] = tz*tz*tz; tpz[1] = tz*tz; tpz[2] = tz; tpz[3] = 1.0;
float* restrict coefs = spline->coefs;
a[0] = ( Af[ 0]*tpx[0] + Af[ 1]*tpx[1] + Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
a[1] = ( Af[ 4]*tpx[0] + Af[ 5]*tpx[1] + Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
a[2] = ( Af[ 8]*tpx[0] + Af[ 9]*tpx[1] + Af[10]*tpx[2] + Af[11]*tpx[3]);
a[3] = ( Af[12]*tpx[0] + Af[13]*tpx[1] + Af[14]*tpx[2] + Af[15]*tpx[3]);
da[0] = ( dAf[ 1]*tpx[1] + dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);
da[1] = ( dAf[ 5]*tpx[1] + dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
da[2] = ( dAf[ 9]*tpx[1] + dAf[10]*tpx[2] + dAf[11]*tpx[3]);
da[3] = ( dAf[13]*tpx[1] + dAf[14]*tpx[2] + dAf[15]*tpx[3]);
d2a[0] = (d2Af[ 2]*tpx[2] + d2Af[ 3]*tpx[3]);
d2a[1] = (d2Af[ 6]*tpx[2] + d2Af[ 7]*tpx[3]);
d2a[2] = (d2Af[10]*tpx[2] + d2Af[11]*tpx[3]);
d2a[3] = (d2Af[14]*tpx[2] + d2Af[15]*tpx[3]);
b[0] = ( Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
b[1] = ( Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
b[2] = ( Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
b[3] = ( Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
db[0] = (dAf[ 1]*tpy[1] + dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
db[1] = (dAf[ 5]*tpy[1] + dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
db[2] = (dAf[ 9]*tpy[1] + dAf[10]*tpy[2] + dAf[11]*tpy[3]);
db[3] = (dAf[13]*tpy[1] + dAf[14]*tpy[2] + dAf[15]*tpy[3]);
d2b[0] = (d2Af[ 2]*tpy[2] + d2Af[ 3]*tpy[3]);
d2b[1] = (d2Af[ 6]*tpy[2] + d2Af[ 7]*tpy[3]);
d2b[2] = (d2Af[10]*tpy[2] + d2Af[11]*tpy[3]);
d2b[3] = (d2Af[14]*tpy[2] + d2Af[15]*tpy[3]);
c[0] = ( Af[ 0]*tpz[0] + Af[ 1]*tpz[1] + Af[ 2]*tpz[2] + Af[ 3]*tpz[3]);
c[1] = ( Af[ 4]*tpz[0] + Af[ 5]*tpz[1] + Af[ 6]*tpz[2] + Af[ 7]*tpz[3]);
c[2] = ( Af[ 8]*tpz[0] + Af[ 9]*tpz[1] + Af[10]*tpz[2] + Af[11]*tpz[3]);
c[3] = ( Af[12]*tpz[0] + Af[13]*tpz[1] + Af[14]*tpz[2] + Af[15]*tpz[3]);
dc[0] = (dAf[ 1]*tpz[1] + dAf[ 2]*tpz[2] + dAf[ 3]*tpz[3]);
dc[1] = (dAf[ 5]*tpz[1] + dAf[ 6]*tpz[2] + dAf[ 7]*tpz[3]);
dc[2] = (dAf[ 9]*tpz[1] + dAf[10]*tpz[2] + dAf[11]*tpz[3]);
dc[3] = (dAf[13]*tpz[1] + dAf[14]*tpz[2] + dAf[15]*tpz[3]);
d2c[0] = (d2Af[ 2]*tpz[2] + d2Af[ 3]*tpz[3]);
d2c[1] = (d2Af[ 6]*tpz[2] + d2Af[ 7]*tpz[3]);
d2c[2] = (d2Af[10]*tpz[2] + d2Af[11]*tpz[3]);
d2c[3] = (d2Af[14]*tpz[2] + d2Af[15]*tpz[3]);
int xs = spline->x_stride;
int ys = spline->y_stride;
int offmax = (ix+3)*xs + (iy+3)*ys + iz+3;
// if (offmax > spline->coef_size) {
// fprintf (stderr, "Outside bounds in spline evalutation.\n"
// "offmax = %d csize = %d\n", offmax, spline->csize);
// fprintf (stderr, "ix=%d iy=%d iz=%d\n", ix,iy,iz);
// }
#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
d2cP[ 0] = (P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3]);
d2cP[ 1] = (P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3]);
d2cP[ 2] = (P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3]);
d2cP[ 3] = (P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]);
d2cP[ 4] = (P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3]);
d2cP[ 5] = (P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3]);
d2cP[ 6] = (P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3]);
d2cP[ 7] = (P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]);
d2cP[ 8] = (P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3]);
d2cP[ 9] = (P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3]);
d2cP[10] = (P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3]);
d2cP[11] = (P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]);
d2cP[12] = (P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3]);
d2cP[13] = (P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3]);
d2cP[14] = (P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3]);
d2cP[15] = (P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3]);
bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
bd2cP[0] = ( b[0]*d2cP[ 0] + b[1]*d2cP[ 1] + b[2]*d2cP[ 2] + b[3]*d2cP[ 3]);
bd2cP[1] = ( b[0]*d2cP[ 4] + b[1]*d2cP[ 5] + b[2]*d2cP[ 6] + b[3]*d2cP[ 7]);
bd2cP[2] = ( b[0]*d2cP[ 8] + b[1]*d2cP[ 9] + b[2]*d2cP[10] + b[3]*d2cP[11]);
bd2cP[3] = ( b[0]*d2cP[12] + b[1]*d2cP[13] + b[2]*d2cP[14] + b[3]*d2cP[15]);
d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
dbdcP[0] = ( db[0]*dcP[ 0] + db[1]*dcP[ 1] + db[2]*dcP[ 2] + db[3]*dcP[ 3]);
dbdcP[1] = ( db[0]*dcP[ 4] + db[1]*dcP[ 5] + db[2]*dcP[ 6] + db[3]*dcP[ 7]);
dbdcP[2] = ( db[0]*dcP[ 8] + db[1]*dcP[ 9] + db[2]*dcP[10] + db[3]*dcP[11]);
dbdcP[3] = ( db[0]*dcP[12] + db[1]*dcP[13] + db[2]*dcP[14] + db[3]*dcP[15]);
*val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
grad[0] = spline->x_grid.delta_inv *
(da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
grad[1] = spline->y_grid.delta_inv *
(a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
grad[2] = spline->z_grid.delta_inv *
(a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
// d2x
hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
(d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
// dx dy
hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
(da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
hess[3] = hess[1];
// dx dz;
hess[2] = spline->x_grid.delta_inv * spline->z_grid.delta_inv *
(da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
hess[6] = hess[2];
// d2y
hess[4] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
(a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
// dy dz
hess[5] = spline->y_grid.delta_inv * spline->z_grid.delta_inv *
(a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
hess[7] = hess[5];
// d2z
hess[8] = spline->z_grid.delta_inv * spline->z_grid.delta_inv *
(a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
#undef P
}
#endif

View File

@ -0,0 +1,939 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_EVAL_STD_Z_H
#define BSPLINE_EVAL_STD_Z_H
#include <math.h>
#include <stdio.h>
extern const double* restrict Ad;
extern const double* restrict dAd;
extern const double* restrict d2Ad;
/************************************************************/
/* 1D double-precision, complex evaluation functions */
/************************************************************/
/* Value only.
 *
 * Evaluates the 1D complex spline at x and writes the result to *val.
 * x is shifted by x_grid.start and scaled by x_grid.delta_inv; the integer
 * part of that quantity selects the first of four consecutive coefficients
 * and the fractional part t drives the cubic weights taken from the Ad
 * table.  No range check is performed on the resulting index. */
inline void
eval_UBspline_1d_z (UBspline_1d_z * restrict spline,
                    double x, complex_double* restrict val)
{
  const double xrel = x - spline->x_grid.start;
  const double u = xrel*spline->x_grid.delta_inv;
  double cell;
  const double t = modf (u, &cell);
  const int i = (int) cell;

  /* Powers of the fractional offset, highest first: t^3, t^2, t, 1. */
  const double tp[4] = { t*t*t, t*t, t, 1.0 };

  /* w[k] is row k of the 4x4 Ad table applied to the power vector. */
  double w[4];
  for (int k = 0; k < 4; k++)
    w[k] = Ad[4*k+0]*tp[0] + Ad[4*k+1]*tp[1] + Ad[4*k+2]*tp[2] + Ad[4*k+3]*tp[3];

  complex_double* restrict coefs = spline->coefs;
  *val = (coefs[i+0]*w[0] + coefs[i+1]*w[1] + coefs[i+2]*w[2] + coefs[i+3]*w[3]);
}
/* Value and first derivative.
 *
 * Writes the spline value at x to *val and its derivative with respect to
 * x to *grad.  The derivative weights come from the dAd table and use only
 * the powers t^2, t and 1; the result is scaled by x_grid.delta_inv to
 * convert from grid units back to units of x. */
inline void
eval_UBspline_1d_z_vg (UBspline_1d_z * restrict spline, double x,
                       complex_double* restrict val,
                       complex_double* restrict grad)
{
  const double xrel = x - spline->x_grid.start;
  const double u = xrel*spline->x_grid.delta_inv;
  double cell;
  const double t = modf (u, &cell);
  const int i = (int) cell;

  /* Powers of the fractional offset, highest first: t^3, t^2, t, 1. */
  const double tp[4] = { t*t*t, t*t, t, 1.0 };

  /* Value weights (w) and first-derivative weights (dw), one per coefficient. */
  double w[4], dw[4];
  for (int k = 0; k < 4; k++) {
    w[k]  = Ad[4*k+0]*tp[0] + Ad[4*k+1]*tp[1] + Ad[4*k+2]*tp[2] + Ad[4*k+3]*tp[3];
    dw[k] = dAd[4*k+1]*tp[1] + dAd[4*k+2]*tp[2] + dAd[4*k+3]*tp[3];
  }

  complex_double* restrict coefs = spline->coefs;
  *val = (coefs[i+0]*w[0] + coefs[i+1]*w[1] + coefs[i+2]*w[2] + coefs[i+3]*w[3]);
  *grad = spline->x_grid.delta_inv *
    (coefs[i+0]*dw[0] + coefs[i+1]*dw[1] + coefs[i+2]*dw[2] + coefs[i+3]*dw[3]);
}
/* Value, first derivative, and second derivative.
 *
 * Writes the spline value at x to *val, the first derivative to *grad and
 * the second derivative to *lapl.  First- and second-derivative weights
 * come from the dAd and d2Ad tables; each derivative order picks up one
 * factor of x_grid.delta_inv. */
inline void
eval_UBspline_1d_z_vgl (UBspline_1d_z * restrict spline, double x,
                        complex_double* restrict val, complex_double* restrict grad,
                        complex_double* restrict lapl)
{
  const double xrel = x - spline->x_grid.start;
  const double u = xrel*spline->x_grid.delta_inv;
  double cell;
  const double t = modf (u, &cell);
  const int i = (int) cell;

  /* Powers of the fractional offset, highest first: t^3, t^2, t, 1. */
  const double tp[4] = { t*t*t, t*t, t, 1.0 };

  /* Weights for the value (w), first (dw) and second (d2w) derivative. */
  double w[4], dw[4], d2w[4];
  for (int k = 0; k < 4; k++) {
    w[k]   = Ad[4*k+0]*tp[0] + Ad[4*k+1]*tp[1] + Ad[4*k+2]*tp[2] + Ad[4*k+3]*tp[3];
    dw[k]  = dAd[4*k+1]*tp[1] + dAd[4*k+2]*tp[2] + dAd[4*k+3]*tp[3];
    d2w[k] = d2Ad[4*k+2]*tp[2] + d2Ad[4*k+3]*tp[3];
  }

  complex_double* restrict coefs = spline->coefs;
  *val = (coefs[i+0]*w[0] + coefs[i+1]*w[1] + coefs[i+2]*w[2] + coefs[i+3]*w[3]);
  *grad = spline->x_grid.delta_inv *
    (coefs[i+0]*dw[0] + coefs[i+1]*dw[1] + coefs[i+2]*dw[2] + coefs[i+3]*dw[3]);
  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (coefs[i+0]*d2w[0] + coefs[i+1]*d2w[1] + coefs[i+2]*d2w[2] + coefs[i+3]*d2w[3]);
}
/* Value, first derivative, and Hessian.
 *
 * In one dimension the Hessian has a single entry, the second derivative,
 * which is what eval_UBspline_1d_z_vgl writes through its last argument.
 * This routine therefore forwards to it, storing the value in *val, the
 * first derivative in *grad and the second derivative in *hess.
 *
 * The previous body called eval_UBspline_1d_z_vgh itself with the same
 * arguments, which recursed without end and never produced a result. */
inline void
eval_UBspline_1d_z_vgh (UBspline_1d_z * restrict spline, double x,
                        complex_double* restrict val,
                        complex_double* restrict grad,
                        complex_double* restrict hess)
{
  eval_UBspline_1d_z_vgl (spline, x, val, grad, hess);
}
/************************************************************/
/* 2D double-precision, complex evaluation functions */
/************************************************************/
/* Value only.
 *
 * Evaluates the 2D complex spline at (x, y) and writes the result to *val.
 * Each coordinate is shifted by its grid start and scaled by delta_inv;
 * the integer parts (ix, iy) locate a 4x4 patch of coefficients and the
 * fractional parts drive the per-axis weights a[] and b[] built from the
 * Ad table.  Coefficient (i, j) of the patch is read from
 * coefs[(ix+i)*x_stride + iy + j].  No range check is performed. */
inline void
eval_UBspline_2d_z (UBspline_2d_z * restrict spline,
                    double x, double y, complex_double* restrict val)
{
  const double xrel = x - spline->x_grid.start;
  const double yrel = y - spline->y_grid.start;
  const double ux = xrel*spline->x_grid.delta_inv;
  const double uy = yrel*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Powers of the fractional offsets, highest first. */
  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };

  /* Per-axis value weights. */
  double a[4], b[4];
  for (int k = 0; k < 4; k++) {
    a[k] = Ad[4*k+0]*tpx[0] + Ad[4*k+1]*tpx[1] + Ad[4*k+2]*tpx[2] + Ad[4*k+3]*tpx[3];
    b[k] = Ad[4*k+0]*tpy[0] + Ad[4*k+1]*tpy[1] + Ad[4*k+2]*tpy[2] + Ad[4*k+3]*tpy[3];
  }

  complex_double* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;

  /* Contract the y direction first: one partial sum per x row of the patch. */
  complex_double rowsum[4];
  for (int i = 0; i < 4; i++) {
    const complex_double* row = &coefs[(ix+i)*xs + iy];
    rowsum[i] = row[0]*b[0] + row[1]*b[1] + row[2]*b[2] + row[3]*b[3];
  }

  *val = (a[0]*rowsum[0] + a[1]*rowsum[1] + a[2]*rowsum[2] + a[3]*rowsum[3]);
}
/* Value and gradient.
 *
 * Writes the spline value at (x, y) to *val and the two first partial
 * derivatives to grad[0] (with respect to x) and grad[1] (with respect to
 * y).  Derivative weights come from the dAd table; each gradient component
 * is scaled by the delta_inv of its own axis. */
inline void
eval_UBspline_2d_z_vg (UBspline_2d_z * restrict spline,
                       double x, double y,
                       complex_double* restrict val,
                       complex_double* restrict grad)
{
  const double xrel = x - spline->x_grid.start;
  const double yrel = y - spline->y_grid.start;
  const double ux = xrel*spline->x_grid.delta_inv;
  const double uy = yrel*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Powers of the fractional offsets, highest first. */
  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };

  /* Per-axis value weights (a, b) and first-derivative weights (da, db). */
  double a[4], b[4], da[4], db[4];
  for (int k = 0; k < 4; k++) {
    a[k]  = Ad[4*k+0]*tpx[0] + Ad[4*k+1]*tpx[1] + Ad[4*k+2]*tpx[2] + Ad[4*k+3]*tpx[3];
    da[k] = dAd[4*k+1]*tpx[1] + dAd[4*k+2]*tpx[2] + dAd[4*k+3]*tpx[3];
    b[k]  = Ad[4*k+0]*tpy[0] + Ad[4*k+1]*tpy[1] + Ad[4*k+2]*tpy[2] + Ad[4*k+3]*tpy[3];
    db[k] = dAd[4*k+1]*tpy[1] + dAd[4*k+2]*tpy[2] + dAd[4*k+3]*tpy[3];
  }

  complex_double* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;

  /* Contract the y direction once per x row, with b and with db. */
  complex_double rowB[4], rowDB[4];
  for (int i = 0; i < 4; i++) {
    const complex_double* row = &coefs[(ix+i)*xs + iy];
    rowB[i]  = row[0]*b[0]+row[1]*b[1]+row[2]*b[2]+row[3]*b[3];
    rowDB[i] = row[0]*db[0]+row[1]*db[1]+row[2]*db[2]+row[3]*db[3];
  }

  *val =
    (a[0]*rowB[0] + a[1]*rowB[1] + a[2]*rowB[2] + a[3]*rowB[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowB[0] + da[1]*rowB[1] + da[2]*rowB[2] + da[3]*rowB[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowDB[0] + a[1]*rowDB[1] + a[2]*rowDB[2] + a[3]*rowDB[3]);
}
/* Value, gradient, and laplacian.
 *
 * Writes the spline value at (x, y) to *val, the first partial derivatives
 * to grad[0] (x) and grad[1] (y), and the sum of the two unmixed second
 * partial derivatives to *lapl.  First- and second-derivative weights come
 * from the dAd and d2Ad tables; every derivative along an axis contributes
 * one factor of that axis's delta_inv. */
inline void
eval_UBspline_2d_z_vgl (UBspline_2d_z * restrict spline,
                        double x, double y, complex_double* restrict val,
                        complex_double* restrict grad,
                        complex_double* restrict lapl)
{
  const double xrel = x - spline->x_grid.start;
  const double yrel = y - spline->y_grid.start;
  const double ux = xrel*spline->x_grid.delta_inv;
  const double uy = yrel*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Powers of the fractional offsets, highest first. */
  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };

  /* Per-axis weights for the value and the first and second derivatives. */
  double a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int k = 0; k < 4; k++) {
    a[k]   = Ad[4*k+0]*tpx[0] + Ad[4*k+1]*tpx[1] + Ad[4*k+2]*tpx[2] + Ad[4*k+3]*tpx[3];
    da[k]  = dAd[4*k+1]*tpx[1] + dAd[4*k+2]*tpx[2] + dAd[4*k+3]*tpx[3];
    d2a[k] = d2Ad[4*k+2]*tpx[2] + d2Ad[4*k+3]*tpx[3];
    b[k]   = Ad[4*k+0]*tpy[0] + Ad[4*k+1]*tpy[1] + Ad[4*k+2]*tpy[2] + Ad[4*k+3]*tpy[3];
    db[k]  = dAd[4*k+1]*tpy[1] + dAd[4*k+2]*tpy[2] + dAd[4*k+3]*tpy[3];
    d2b[k] = d2Ad[4*k+2]*tpy[2] + d2Ad[4*k+3]*tpy[3];
  }

  complex_double* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;

  /* Contract the y direction once per x row, with b, db and d2b. */
  complex_double rowB[4], rowDB[4], rowD2B[4];
  for (int i = 0; i < 4; i++) {
    const complex_double* row = &coefs[(ix+i)*xs + iy];
    rowB[i]   = row[0]*b[0]+row[1]*b[1]+row[2]*b[2]+row[3]*b[3];
    rowDB[i]  = row[0]*db[0]+row[1]*db[1]+row[2]*db[2]+row[3]*db[3];
    rowD2B[i] = row[0]*d2b[0]+row[1]*d2b[1]+row[2]*d2b[2]+row[3]*d2b[3];
  }

  *val =
    (a[0]*rowB[0] + a[1]*rowB[1] + a[2]*rowB[2] + a[3]*rowB[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowB[0] + da[1]*rowB[1] + da[2]*rowB[2] + da[3]*rowB[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowDB[0] + a[1]*rowDB[1] + a[2]*rowDB[2] + a[3]*rowDB[3]);
  /* Second derivative in y first, then second derivative in x. */
  *lapl =
    spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*rowD2B[0] + a[1]*rowD2B[1] + a[2]*rowD2B[2] + a[3]*rowD2B[3]) +
    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*rowB[0] + d2a[1]*rowB[1] + d2a[2]*rowB[2] + d2a[3]*rowB[3]);
}
/* Value, gradient, and Hessian.
 *
 * Writes the spline value at (x, y) to *val, the first partial derivatives
 * to grad[0] (x) and grad[1] (y), and the four second partial derivatives
 * to hess[0..3]:
 *   hess[0] = d2/dx2, hess[1] = hess[2] = d2/dxdy, hess[3] = d2/dy2.
 * Every derivative along an axis contributes one factor of that axis's
 * delta_inv. */
inline void
eval_UBspline_2d_z_vgh (UBspline_2d_z * restrict spline,
                        double x, double y, complex_double* restrict val,
                        complex_double* restrict grad,
                        complex_double* restrict hess)
{
  const double xrel = x - spline->x_grid.start;
  const double yrel = y - spline->y_grid.start;
  const double ux = xrel*spline->x_grid.delta_inv;
  const double uy = yrel*spline->y_grid.delta_inv;
  double cellx, celly;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const int ix = (int) cellx;
  const int iy = (int) celly;

  /* Powers of the fractional offsets, highest first. */
  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };

  /* Per-axis weights for the value and the first and second derivatives. */
  double a[4], b[4], da[4], db[4], d2a[4], d2b[4];
  for (int k = 0; k < 4; k++) {
    a[k]   = Ad[4*k+0]*tpx[0] + Ad[4*k+1]*tpx[1] + Ad[4*k+2]*tpx[2] + Ad[4*k+3]*tpx[3];
    da[k]  = dAd[4*k+1]*tpx[1] + dAd[4*k+2]*tpx[2] + dAd[4*k+3]*tpx[3];
    d2a[k] = d2Ad[4*k+2]*tpx[2] + d2Ad[4*k+3]*tpx[3];
    b[k]   = Ad[4*k+0]*tpy[0] + Ad[4*k+1]*tpy[1] + Ad[4*k+2]*tpy[2] + Ad[4*k+3]*tpy[3];
    db[k]  = dAd[4*k+1]*tpy[1] + dAd[4*k+2]*tpy[2] + dAd[4*k+3]*tpy[3];
    d2b[k] = d2Ad[4*k+2]*tpy[2] + d2Ad[4*k+3]*tpy[3];
  }

  complex_double* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;

  /* Contract the y direction once per x row, with b, db and d2b. */
  complex_double rowB[4], rowDB[4], rowD2B[4];
  for (int i = 0; i < 4; i++) {
    const complex_double* row = &coefs[(ix+i)*xs + iy];
    rowB[i]   = row[0]*b[0]+row[1]*b[1]+row[2]*b[2]+row[3]*b[3];
    rowDB[i]  = row[0]*db[0]+row[1]*db[1]+row[2]*db[2]+row[3]*db[3];
    rowD2B[i] = row[0]*d2b[0]+row[1]*d2b[1]+row[2]*d2b[2]+row[3]*d2b[3];
  }

  *val =
    (a[0]*rowB[0] + a[1]*rowB[1] + a[2]*rowB[2] + a[3]*rowB[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*rowB[0] + da[1]*rowB[1] + da[2]*rowB[2] + da[3]*rowB[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*rowDB[0] + a[1]*rowDB[1] + a[2]*rowDB[2] + a[3]*rowDB[3]);
  /* d2/dx2 */
  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
    (d2a[0]*rowB[0] + d2a[1]*rowB[1] + d2a[2]*rowB[2] + d2a[3]*rowB[3]);
  /* d2/dxdy */
  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
    (da[0]*rowDB[0] + da[1]*rowDB[1] + da[2]*rowDB[2] + da[3]*rowDB[3]);
  /* d2/dy2 */
  hess[3] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
    (a[0]*rowD2B[0] + a[1]*rowD2B[1] + a[2]*rowD2B[2] + a[3]*rowD2B[3]);
  /* The mixed derivative is stored in both off-diagonal slots. */
  hess[2] = hess[1];
}
/************************************************************/
/* 3D double-precision, complex evaluation functions */
/************************************************************/
/* Value only.
 *
 * Evaluates the 3D complex spline at (x, y, z) and writes the result to
 * *val.  Each coordinate is shifted by its grid start and scaled by
 * delta_inv; the integer parts (ix, iy, iz) locate a 4x4x4 patch of
 * coefficients and the fractional parts drive the per-axis weights a[],
 * b[] and c[] built from the Ad table.  Coefficient (i, j, k) of the patch
 * is read from coefs[(ix+i)*x_stride + (iy+j)*y_stride + iz + k].
 * No range check is performed. */
inline void
eval_UBspline_3d_z (UBspline_3d_z * restrict spline,
                    double x, double y, double z,
                    complex_double* restrict val)
{
  const double xrel = x - spline->x_grid.start;
  const double yrel = y - spline->y_grid.start;
  const double zrel = z - spline->z_grid.start;
  const double ux = xrel*spline->x_grid.delta_inv;
  const double uy = yrel*spline->y_grid.delta_inv;
  const double uz = zrel*spline->z_grid.delta_inv;
  double cellx, celly, cellz;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const double tz = modf (uz, &cellz);
  const int ix = (int) cellx;
  const int iy = (int) celly;
  const int iz = (int) cellz;

  /* Powers of the fractional offsets, highest first. */
  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  const double tpz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Per-axis value weights. */
  double a[4], b[4], c[4];
  for (int k = 0; k < 4; k++) {
    a[k] = Ad[4*k+0]*tpx[0] + Ad[4*k+1]*tpx[1] + Ad[4*k+2]*tpx[2] + Ad[4*k+3]*tpx[3];
    b[k] = Ad[4*k+0]*tpy[0] + Ad[4*k+1]*tpy[1] + Ad[4*k+2]*tpy[2] + Ad[4*k+3]*tpy[3];
    c[k] = Ad[4*k+0]*tpz[0] + Ad[4*k+1]*tpz[1] + Ad[4*k+2]*tpz[2] + Ad[4*k+3]*tpz[3];
  }

  complex_double* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;
  const int ys = spline->y_stride;

  /* Contract z, then y, for each of the four x slabs of the patch. */
  complex_double slab[4];
  for (int i = 0; i < 4; i++) {
    complex_double line[4];
    for (int j = 0; j < 4; j++) {
      const complex_double* p = &coefs[(ix+i)*xs + (iy+j)*ys + iz];
      line[j] = p[0]*c[0]+p[1]*c[1]+p[2]*c[2]+p[3]*c[3];
    }
    slab[i] = b[0]*line[0] + b[1]*line[1] + b[2]*line[2] + b[3]*line[3];
  }

  *val = (a[0]*slab[0] + a[1]*slab[1] + a[2]*slab[2] + a[3]*slab[3]);
}
/* Value and gradient.
 *
 * Writes the spline value at (x, y, z) to *val and the three first partial
 * derivatives to grad[0] (x), grad[1] (y) and grad[2] (z).  Derivative
 * weights come from the dAd table; each gradient component is scaled by
 * the delta_inv of its own axis.  Coefficient (i, j, k) of the 4x4x4 patch
 * is read from coefs[(ix+i)*x_stride + (iy+j)*y_stride + iz + k]. */
inline void
eval_UBspline_3d_z_vg (UBspline_3d_z * restrict spline,
                       double x, double y, double z,
                       complex_double* restrict val,
                       complex_double* restrict grad)
{
  const double xrel = x - spline->x_grid.start;
  const double yrel = y - spline->y_grid.start;
  const double zrel = z - spline->z_grid.start;
  const double ux = xrel*spline->x_grid.delta_inv;
  const double uy = yrel*spline->y_grid.delta_inv;
  const double uz = zrel*spline->z_grid.delta_inv;
  double cellx, celly, cellz;
  const double tx = modf (ux, &cellx);
  const double ty = modf (uy, &celly);
  const double tz = modf (uz, &cellz);
  const int ix = (int) cellx;
  const int iy = (int) celly;
  const int iz = (int) cellz;

  /* Powers of the fractional offsets, highest first. */
  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
  const double tpz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };

  /* Per-axis value weights (a, b, c) and first-derivative weights. */
  double a[4], b[4], c[4], da[4], db[4], dc[4];
  for (int k = 0; k < 4; k++) {
    a[k]  = Ad[4*k+0]*tpx[0] + Ad[4*k+1]*tpx[1] + Ad[4*k+2]*tpx[2] + Ad[4*k+3]*tpx[3];
    da[k] = dAd[4*k+1]*tpx[1] + dAd[4*k+2]*tpx[2] + dAd[4*k+3]*tpx[3];
    b[k]  = Ad[4*k+0]*tpy[0] + Ad[4*k+1]*tpy[1] + Ad[4*k+2]*tpy[2] + Ad[4*k+3]*tpy[3];
    db[k] = dAd[4*k+1]*tpy[1] + dAd[4*k+2]*tpy[2] + dAd[4*k+3]*tpy[3];
    c[k]  = Ad[4*k+0]*tpz[0] + Ad[4*k+1]*tpz[1] + Ad[4*k+2]*tpz[2] + Ad[4*k+3]*tpz[3];
    dc[k] = dAd[4*k+1]*tpz[1] + dAd[4*k+2]*tpz[2] + dAd[4*k+3]*tpz[3];
  }

  complex_double* restrict coefs = spline->coefs;
  const int xs = spline->x_stride;
  const int ys = spline->y_stride;

  /* For each x slab: contract z with c and with dc, then contract y.
   *   bcP[i]  uses b  and c   (value and d/dx)
   *   dbcP[i] uses db and c   (d/dy)
   *   bdcP[i] uses b  and dc  (d/dz) */
  complex_double bcP[4], dbcP[4], bdcP[4];
  for (int i = 0; i < 4; i++) {
    complex_double cP[4], dcP[4];
    for (int j = 0; j < 4; j++) {
      const complex_double* p = &coefs[(ix+i)*xs + (iy+j)*ys + iz];
      cP[j]  = (p[0]*c[0]+p[1]*c[1]+p[2]*c[2]+p[3]*c[3]);
      dcP[j] = p[0]*dc[0]+p[1]*dc[1]+p[2]*dc[2]+p[3]*dc[3];
    }
    bcP[i]  = ( b[0]*cP[0] + b[1]*cP[1] + b[2]*cP[2] + b[3]*cP[3]);
    dbcP[i] = ( db[0]*cP[0] + db[1]*cP[1] + db[2]*cP[2] + db[3]*cP[3]);
    bdcP[i] = b[0]*dcP[0] + b[1]*dcP[1] + b[2]*dcP[2] + b[3]*dcP[3];
  }

  *val = ( a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3]);
  grad[0] = spline->x_grid.delta_inv *
    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
  grad[1] = spline->y_grid.delta_inv *
    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
  grad[2] = spline->z_grid.delta_inv *
    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
}
/* Value, gradient, and laplacian */
/* Evaluates a 3D uniform B-spline with complex double coefficients at the
 * point (x,y,z).
 *
 *   spline : spline object; its grids, strides and coefs array are read.
 *   x,y,z  : evaluation point in the same units as the grids' `start`.
 *   val    : receives the spline value (one element).
 *   grad   : receives three elements: d/dx, d/dy, d/dz.
 *   lapl   : receives d2/dx2 + d2/dy2 + d2/dz2 (one element).
 *
 * NOTE(review): unlike eval_UBspline_3d_z_vgh in this file, no fmin() clamp is
 * applied to ux/uy/uz here, and no range check is done on ix/iy/iz before the
 * coefficient array is indexed -- presumably the caller guarantees the point
 * lies inside the grid; confirm. */
inline void
eval_UBspline_3d_z_vgl (UBspline_3d_z * restrict spline,
double x, double y, double z,
complex_double* restrict val,
complex_double* restrict grad,
complex_double* restrict lapl)
{
// Shift to the grid origin and rescale so that one grid cell has unit length.
x -= spline->x_grid.start;
y -= spline->y_grid.start;
z -= spline->z_grid.start;
double ux = x*spline->x_grid.delta_inv;
double uy = y*spline->y_grid.delta_inv;
double uz = z*spline->z_grid.delta_inv;
// Split each coordinate into an integer cell index (ix,iy,iz) and the
// fractional position inside that cell (tx,ty,tz).
double ipartx, iparty, ipartz, tx, ty, tz;
tx = modf (ux, &ipartx); int ix = (int) ipartx;
ty = modf (uy, &iparty); int iy = (int) iparty;
tz = modf (uz, &ipartz); int iz = (int) ipartz;
double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4],
d2a[4], d2b[4], d2c[4];
complex_double cP[16], dcP[16], bcP[4], dbcP[4], d2bcP[4], bdcP[4];
// Powers of the fractional coordinate, highest first: t^3, t^2, t, 1.
tpx[0] = tx*tx*tx; tpx[1] = tx*tx; tpx[2] = tx; tpx[3] = 1.0;
tpy[0] = ty*ty*ty; tpy[1] = ty*ty; tpy[2] = ty; tpy[3] = 1.0;
tpz[0] = tz*tz*tz; tpz[1] = tz*tz; tpz[2] = tz; tpz[3] = 1.0;
complex_double* restrict coefs = spline->coefs;
// Per-axis weights for the four neighbouring coefficients. Ad, dAd and d2Ad
// are tables defined outside this function -- presumably the cubic B-spline
// basis matrix and its first/second derivative forms (TODO confirm).
// a/da/d2a: x direction (value, first derivative, second derivative).
a[0] = ( Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
a[1] = ( Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
a[2] = ( Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
a[3] = ( Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
da[0] = ( dAd[ 1]*tpx[1] + dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
da[1] = ( dAd[ 5]*tpx[1] + dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
da[2] = ( dAd[ 9]*tpx[1] + dAd[10]*tpx[2] + dAd[11]*tpx[3]);
da[3] = ( dAd[13]*tpx[1] + dAd[14]*tpx[2] + dAd[15]*tpx[3]);
d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);
d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
// b/db/d2b: y direction.
b[0] = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
b[1] = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
b[2] = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
b[3] = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
// c/dc/d2c: z direction.
c[0] = ( Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);
c[1] = ( Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
c[2] = ( Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
c[3] = ( Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
dc[0] = (dAd[ 1]*tpz[1] + dAd[ 2]*tpz[2] + dAd[ 3]*tpz[3]);
dc[1] = (dAd[ 5]*tpz[1] + dAd[ 6]*tpz[2] + dAd[ 7]*tpz[3]);
dc[2] = (dAd[ 9]*tpz[1] + dAd[10]*tpz[2] + dAd[11]*tpz[3]);
dc[3] = (dAd[13]*tpz[1] + dAd[14]*tpz[2] + dAd[15]*tpz[3]);
d2c[0] = (d2Ad[ 2]*tpz[2] + d2Ad[ 3]*tpz[3]);
d2c[1] = (d2Ad[ 6]*tpz[2] + d2Ad[ 7]*tpz[3]);
d2c[2] = (d2Ad[10]*tpz[2] + d2Ad[11]*tpz[3]);
d2c[3] = (d2Ad[14]*tpz[2] + d2Ad[15]*tpz[3]);
int xs = spline->x_stride;
int ys = spline->y_stride;
// P(i,j,k) addresses the coefficient at offset (i,j,k) from cell (ix,iy,iz);
// i,j,k each run over 0..3, so a 4x4x4 neighbourhood is read.
#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
// Contract the z index with the value weights: cP[4*i+j] = sum_k P(i,j,k)*c[k].
cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
// Same contraction with the first-derivative z weights.
dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
// Contract the y index. Naming: the prefix lists which weights were applied,
// e.g. dbcP = (db in y) x (c in z), bdcP = (b in y) x (dc in z).
bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
// Final contraction over x gives the value.
*val =
( a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3]);
// Each derivative was taken with respect to the unit-cell coordinate, so one
// factor of delta_inv per derivative order converts it back to x/y/z units.
grad[0] = spline->x_grid.delta_inv *
(da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
grad[1] = spline->y_grid.delta_inv *
(a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
grad[2] = spline->z_grid.delta_inv *
(a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
// Laplacian = sum of the three pure second derivatives. The z term is
// expanded inline from P and d2c rather than from a precomputed table.
// NOTE(review): the y term ends with '+' and the z term starts with '+', so
// the second one parses as a unary plus; harmless, but looks like a typo.
*lapl =
spline->x_grid.delta_inv * spline->x_grid.delta_inv *
(d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3])
+ spline->y_grid.delta_inv * spline->y_grid.delta_inv *
(a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]) +
+ spline->z_grid.delta_inv * spline->z_grid.delta_inv *
(a[0]*(b[0]*(P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3])+
b[1]*(P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3])+
b[2]*(P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3])+
b[3]*(P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]))+
a[1]*(b[0]*(P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3])+
b[1]*(P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3])+
b[2]*(P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3])+
b[3]*(P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]))+
a[2]*(b[0]*(P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3])+
b[1]*(P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3])+
b[2]*(P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3])+
b[3]*(P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]))+
a[3]*(b[0]*(P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3])+
b[1]*(P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3])+
b[2]*(P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3])+
b[3]*(P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3])));
#undef P
}
/* Value, gradient, and Hessian */
/* Evaluates a 3D uniform B-spline with complex double coefficients at the
 * point (x,y,z).
 *
 *   spline : spline object; its grids, strides and coefs array are read.
 *   x,y,z  : evaluation point in the same units as the grids' `start`.
 *   val    : receives the spline value (one element).
 *   grad   : receives three elements: d/dx, d/dy, d/dz.
 *   hess   : receives nine elements, row-major 3x3, filled symmetrically:
 *            [0]=xx [1]=xy [2]=xz [3]=xy [4]=yy [5]=yz [6]=xz [7]=yz [8]=zz.
 *
 * The reduced coordinates are clamped from above to (grid.num - 1e-5) before
 * the cell index is extracted. NOTE(review): there is no matching clamp from
 * below, so a point before the grid start still yields a negative cell index;
 * presumably callers never pass one -- confirm. */
inline void
eval_UBspline_3d_z_vgh (UBspline_3d_z * restrict spline,
double x, double y, double z,
complex_double* restrict val,
complex_double* restrict grad,
complex_double* restrict hess)
{
// Shift to the grid origin and rescale so that one grid cell has unit length.
x -= spline->x_grid.start;
y -= spline->y_grid.start;
z -= spline->z_grid.start;
double ux = x*spline->x_grid.delta_inv;
double uy = y*spline->y_grid.delta_inv;
double uz = z*spline->z_grid.delta_inv;
// Upper clamp: keeps the integer part strictly below grid.num.
ux = fmin (ux, (double)(spline->x_grid.num)-1.0e-5);
uy = fmin (uy, (double)(spline->y_grid.num)-1.0e-5);
uz = fmin (uz, (double)(spline->z_grid.num)-1.0e-5);
// Split each coordinate into an integer cell index and a fractional part.
double ipartx, iparty, ipartz, tx, ty, tz;
tx = modf (ux, &ipartx); int ix = (int) ipartx;
ty = modf (uy, &iparty); int iy = (int) iparty;
tz = modf (uz, &ipartz); int iz = (int) ipartz;
double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4],
d2a[4], d2b[4], d2c[4];
complex_double cP[16], dcP[16], d2cP[16], bcP[4], dbcP[4],
d2bcP[4], dbdcP[4], bd2cP[4], bdcP[4];
// Powers of the fractional coordinate, highest first: t^3, t^2, t, 1.
tpx[0] = tx*tx*tx; tpx[1] = tx*tx; tpx[2] = tx; tpx[3] = 1.0;
tpy[0] = ty*ty*ty; tpy[1] = ty*ty; tpy[2] = ty; tpy[3] = 1.0;
tpz[0] = tz*tz*tz; tpz[1] = tz*tz; tpz[2] = tz; tpz[3] = 1.0;
complex_double* restrict coefs = spline->coefs;
// Per-axis weights for the four neighbouring coefficients. Ad, dAd and d2Ad
// are tables defined outside this function -- presumably the cubic B-spline
// basis matrix and its first/second derivative forms (TODO confirm).
// a/da/d2a: x direction (value, first derivative, second derivative).
a[0] = ( Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
a[1] = ( Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
a[2] = ( Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
a[3] = ( Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
da[0] = ( dAd[ 1]*tpx[1] + dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
da[1] = ( dAd[ 5]*tpx[1] + dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
da[2] = ( dAd[ 9]*tpx[1] + dAd[10]*tpx[2] + dAd[11]*tpx[3]);
da[3] = ( dAd[13]*tpx[1] + dAd[14]*tpx[2] + dAd[15]*tpx[3]);
d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);
d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
// b/db/d2b: y direction.
b[0] = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
b[1] = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
b[2] = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
b[3] = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
// c/dc/d2c: z direction.
c[0] = ( Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);
c[1] = ( Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
c[2] = ( Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
c[3] = ( Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
dc[0] = (dAd[ 1]*tpz[1] + dAd[ 2]*tpz[2] + dAd[ 3]*tpz[3]);
dc[1] = (dAd[ 5]*tpz[1] + dAd[ 6]*tpz[2] + dAd[ 7]*tpz[3]);
dc[2] = (dAd[ 9]*tpz[1] + dAd[10]*tpz[2] + dAd[11]*tpz[3]);
dc[3] = (dAd[13]*tpz[1] + dAd[14]*tpz[2] + dAd[15]*tpz[3]);
d2c[0] = (d2Ad[ 2]*tpz[2] + d2Ad[ 3]*tpz[3]);
d2c[1] = (d2Ad[ 6]*tpz[2] + d2Ad[ 7]*tpz[3]);
d2c[2] = (d2Ad[10]*tpz[2] + d2Ad[11]*tpz[3]);
d2c[3] = (d2Ad[14]*tpz[2] + d2Ad[15]*tpz[3]);
int xs = spline->x_stride;
int ys = spline->y_stride;
// P(i,j,k) addresses the coefficient at offset (i,j,k) from cell (ix,iy,iz);
// i,j,k each run over 0..3, so a 4x4x4 neighbourhood is read.
#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
// Contract the z index with the value weights: cP[4*i+j] = sum_k P(i,j,k)*c[k].
cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
// Same contraction with the first-derivative z weights.
dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
// Same contraction with the second-derivative z weights.
d2cP[ 0] = (P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3]);
d2cP[ 1] = (P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3]);
d2cP[ 2] = (P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3]);
d2cP[ 3] = (P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]);
d2cP[ 4] = (P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3]);
d2cP[ 5] = (P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3]);
d2cP[ 6] = (P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3]);
d2cP[ 7] = (P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]);
d2cP[ 8] = (P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3]);
d2cP[ 9] = (P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3]);
d2cP[10] = (P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3]);
d2cP[11] = (P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]);
d2cP[12] = (P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3]);
d2cP[13] = (P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3]);
d2cP[14] = (P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3]);
d2cP[15] = (P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3]);
// Contract the y index. Naming: the prefix lists which weights were applied,
// e.g. dbdcP = (db in y) x (dc in z), bd2cP = (b in y) x (d2c in z).
bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
bd2cP[0] = ( b[0]*d2cP[ 0] + b[1]*d2cP[ 1] + b[2]*d2cP[ 2] + b[3]*d2cP[ 3]);
bd2cP[1] = ( b[0]*d2cP[ 4] + b[1]*d2cP[ 5] + b[2]*d2cP[ 6] + b[3]*d2cP[ 7]);
bd2cP[2] = ( b[0]*d2cP[ 8] + b[1]*d2cP[ 9] + b[2]*d2cP[10] + b[3]*d2cP[11]);
bd2cP[3] = ( b[0]*d2cP[12] + b[1]*d2cP[13] + b[2]*d2cP[14] + b[3]*d2cP[15]);
d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
dbdcP[0] = ( db[0]*dcP[ 0] + db[1]*dcP[ 1] + db[2]*dcP[ 2] + db[3]*dcP[ 3]);
dbdcP[1] = ( db[0]*dcP[ 4] + db[1]*dcP[ 5] + db[2]*dcP[ 6] + db[3]*dcP[ 7]);
dbdcP[2] = ( db[0]*dcP[ 8] + db[1]*dcP[ 9] + db[2]*dcP[10] + db[3]*dcP[11]);
dbdcP[3] = ( db[0]*dcP[12] + db[1]*dcP[13] + db[2]*dcP[14] + db[3]*dcP[15]);
// Final contraction over x. Each derivative was taken with respect to the
// unit-cell coordinate, so one factor of delta_inv per derivative order
// converts it back to x/y/z units.
*val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
grad[0] = spline->x_grid.delta_inv *
(da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
grad[1] = spline->y_grid.delta_inv *
(a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
grad[2] = spline->z_grid.delta_inv *
(a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
// d2x
hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
(d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
// dx dy
hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
(da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
hess[3] = hess[1];
// dx dz;
hess[2] = spline->x_grid.delta_inv * spline->z_grid.delta_inv *
(da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
hess[6] = hess[2];
// d2y
hess[4] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
(a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
// dy dz
hess[5] = spline->y_grid.delta_inv * spline->z_grid.delta_inv *
(a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
hess[7] = hess[5];
// d2z
hess[8] = spline->z_grid.delta_inv * spline->z_grid.delta_inv *
(a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
#undef P
}
#endif

View File

@ -0,0 +1,158 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef BSPLINE_STRUCTS_STD_H
#define BSPLINE_STRUCTS_STD_H
///////////////////////////
// Single precision real //
///////////////////////////
// Uniform B-spline objects holding single-precision real (float) coefficients,
// in one, two and three dimensions. spline_code, type_code, Ugrid and BCtype_s
// are declared elsewhere.
// 1D: one grid and one boundary-condition record.
typedef struct
{
spline_code spcode;
type_code tcode;
float* restrict coefs;
Ugrid x_grid;
BCtype_s xBC;
} UBspline_1d_s;
// 2D: adds x_stride -- presumably the element offset in coefs between
// consecutive x indices (TODO confirm against the allocation code).
typedef struct
{
spline_code spcode;
type_code tcode;
float* restrict coefs;
int x_stride;
Ugrid x_grid, y_grid;
BCtype_s xBC, yBC;
} UBspline_2d_s;
// 3D: adds y_stride alongside x_stride; presumably indexed as
// coefs[ix*x_stride + iy*y_stride + iz] like the complex-double variant.
typedef struct
{
spline_code spcode;
type_code tcode;
float* restrict coefs;
int x_stride, y_stride;
Ugrid x_grid, y_grid, z_grid;
BCtype_s xBC, yBC, zBC;
} UBspline_3d_s;
///////////////////////////
// Double precision real //
///////////////////////////
// Uniform B-spline objects holding double-precision real coefficients, in one,
// two and three dimensions. Same layout as the single-precision family, with
// double coefficients and BCtype_d boundary records.
// 1D: one grid and one boundary-condition record.
typedef struct
{
spline_code spcode;
type_code tcode;
double* restrict coefs;
Ugrid x_grid;
BCtype_d xBC;
} UBspline_1d_d;
// 2D: adds x_stride -- presumably the element offset in coefs between
// consecutive x indices (TODO confirm against the allocation code).
typedef struct
{
spline_code spcode;
type_code tcode;
double* restrict coefs;
int x_stride;
Ugrid x_grid, y_grid;
BCtype_d xBC, yBC;
} UBspline_2d_d;
// 3D: adds y_stride alongside x_stride; presumably indexed as
// coefs[ix*x_stride + iy*y_stride + iz] like the complex-double variant.
typedef struct
{
spline_code spcode;
type_code tcode;
double* restrict coefs;
int x_stride, y_stride;
Ugrid x_grid, y_grid, z_grid;
BCtype_d xBC, yBC, zBC;
} UBspline_3d_d;
//////////////////////////////
// Single precision complex //
//////////////////////////////
// Uniform B-spline objects holding single-precision complex (complex_float)
// coefficients, in one, two and three dimensions. Same layout as the real
// families, with BCtype_c boundary records.
// 1D: one grid and one boundary-condition record.
typedef struct
{
spline_code spcode;
type_code tcode;
complex_float* restrict coefs;
Ugrid x_grid;
BCtype_c xBC;
} UBspline_1d_c;
// 2D: adds x_stride -- presumably the element offset in coefs between
// consecutive x indices (TODO confirm against the allocation code).
typedef struct
{
spline_code spcode;
type_code tcode;
complex_float* restrict coefs;
int x_stride;
Ugrid x_grid, y_grid;
BCtype_c xBC, yBC;
} UBspline_2d_c;
// 3D: adds y_stride alongside x_stride; presumably indexed as
// coefs[ix*x_stride + iy*y_stride + iz] like the complex-double variant.
typedef struct
{
spline_code spcode;
type_code tcode;
complex_float* restrict coefs;
int x_stride, y_stride;
Ugrid x_grid, y_grid, z_grid;
BCtype_c xBC, yBC, zBC;
} UBspline_3d_c;
//////////////////////////////
// Double precision complex //
//////////////////////////////
// Uniform B-spline objects holding double-precision complex (complex_double)
// coefficients, in one, two and three dimensions. Same layout as the real
// families, with BCtype_z boundary records.
// 1D: one grid and one boundary-condition record.
typedef struct
{
spline_code spcode;
type_code tcode;
complex_double* restrict coefs;
Ugrid x_grid;
BCtype_z xBC;
} UBspline_1d_z;
// 2D: adds x_stride -- presumably the element offset in coefs between
// consecutive x indices (TODO confirm against the allocation code).
typedef struct
{
spline_code spcode;
type_code tcode;
complex_double* restrict coefs;
int x_stride;
Ugrid x_grid, y_grid;
BCtype_z xBC, yBC;
} UBspline_2d_z;
// 3D: the eval_UBspline_3d_z_* routines read this as
// coefs[ix*x_stride + iy*y_stride + iz], and use x_grid/y_grid/z_grid
// (start, delta_inv, num) to map a point to the cell index (ix,iy,iz).
typedef struct
{
spline_code spcode;
type_code tcode;
complex_double* restrict coefs;
int x_stride, y_stride;
Ugrid x_grid, y_grid, z_grid;
BCtype_z xBC, yBC, zBC;
} UBspline_3d_z;
#endif

View File

@ -0,0 +1,90 @@
#ifndef BSPLINE_STRUCTS_CUDA_H
#define BSPLINE_STRUCTS_CUDA_H
#include "bspline_base_cuda.h"
#define SPLINE_BLOCK_SIZE 64
////////
// 2D //
////////
// Local definitions of double3 / double4, compiled only when CUDA_VERSION is
// below 3000 -- presumably because older toolkits do not provide these vector
// types themselves (TODO confirm). double3 is used by the 3D double-precision
// spline structs in this header.
#if CUDA_VERSION < 3000
typedef struct
{
double x,y,z;
} double3;
typedef struct
{
double x,y,z,w;
} double4;
#endif
// 2D spline descriptors for the CUDA code path: a coefficient pointer
// (presumably device memory -- TODO confirm), a two-component stride and the
// inverse grid spacing for each axis.
// Single-precision real.
typedef struct
{
float *coefs;
uint2 stride;
float2 gridInv;
} UBspline_2d_s_cuda;
// Single-precision complex. Unlike the other complex variants in this header,
// the real and imaginary parts are kept in two separate float arrays.
typedef struct
{
float *coefs_real, *coefs_imag;
uint2 stride;
float2 gridInv;
} UBspline_2d_c_cuda;
// Double-precision real. gridInv is a plain two-element array here rather than
// a vector type.
typedef struct
{
double *coefs;
uint2 stride;
double gridInv[2];
} UBspline_2d_d_cuda;
// Double-precision complex, stored as interleaved complex_double elements.
typedef struct
{
complex_double *coefs;
uint2 stride;
double gridInv[2];
} UBspline_2d_z_cuda;
////////
// 3D //
////////
// 3D spline descriptors for the CUDA code path. Each holds a coefficient
// pointer (presumably device memory -- TODO confirm), a three-component stride,
// the inverse grid spacing per axis, and `dim` -- presumably the coefficient
// grid extent per axis (TODO confirm).
// Single-precision real.
typedef struct
{
float *coefs;
uint3 stride;
float3 gridInv;
uint3 dim;
} UBspline_3d_s_cuda;
// Single-precision complex, stored as interleaved complex_float elements.
typedef struct
{
complex_float *coefs;
uint3 stride;
float3 gridInv;
uint3 dim;
} UBspline_3d_c_cuda;
// Double-precision real; gridInv uses double3 (defined in this header when
// CUDA_VERSION < 3000).
typedef struct
{
double *coefs;
uint3 stride;
double3 gridInv;
uint3 dim;
} UBspline_3d_d_cuda;
// Double-precision complex, stored as interleaved complex_double elements.
typedef struct
{
complex_double *coefs;
uint3 stride;
double3 gridInv;
uint3 dim;
} UBspline_3d_z_cuda;
#endif

View File

@ -0,0 +1,86 @@
//
//See the LICENSE file in the top-level directory for copyright notices
//
#ifndef EINSPLINE_CONFIGURATION_H
#define EINSPLINE_CONFIGURATION_H
/* Define to 1 if you have fftw */
#cmakedefine HAVE_LIBFFTW @HAVE_LIBFFTW@
/* Define if sincos function exists */
#cmakedefine HAVE_SINCOS @HAVE_SINCOS@
/* Define if std::round function exists */
#cmakedefine HAVE_STD_ROUND @HAVE_STD_ROUND@
/* Define if floor function exists */
#cmakedefine HAVE_FLOOR @HAVE_FLOOR@
/* Define if posix_memalign function exists */
#cmakedefine HAVE_POSIX_MEMALIGN @HAVE_POSIX_MEMALIGN@
/* Define if pow function exists */
#cmakedefine HAVE_POW @HAVE_POW@
/* Define if sqrt function exists */
#cmakedefine HAVE_SQRT @HAVE_SQRT@
/* Define if dlfcn.h exists */
#cmakedefine HAVE_DLFCN_H @HAVE_DLFCN_H@
/* Define if inttypes.h exists */
#cmakedefine HAVE_INTTYPES_H @HAVE_INTTYPES_H@
/* Define if memory.h exists */
#cmakedefine HAVE_MEMORY_H @HAVE_MEMORY_H@
/* Define if pmmintrin.h exists */
#cmakedefine HAVE_PMMINTRIN_H @HAVE_PMMINTRIN_H@
/* Define if emmintrin.h exists */
#cmakedefine HAVE_EMMINTRIN_H @HAVE_EMMINTRIN_H@
/* Define if sys/stat.h exists */
#cmakedefine HAVE_SYS_STAT_H @HAVE_SYS_STAT_H@
/* Define if sys/time.h exists */
#cmakedefine HAVE_SYS_TIME_H @HAVE_SYS_TIME_H@
/* Define if sys/types.h exists */
#cmakedefine HAVE_SYS_TYPES_H @HAVE_SYS_TYPES_H@
/* Define if unistd.h exists */
#cmakedefine HAVE_UNISTD_H @HAVE_UNISTD_H@
/* Define if mmx support exists */
#cmakedefine HAVE_MMX @HAVE_MMX@
/* Define if sse support exists */
#cmakedefine HAVE_SSE @HAVE_SSE@
/* Define if sse2 support exists */
#cmakedefine HAVE_SSE2 @HAVE_SSE2@
/* Define if sse3 support exists */
#cmakedefine HAVE_SSE3 @HAVE_SSE3@
/* Define if ssse3 support exists */
#cmakedefine HAVE_SSSE3 @HAVE_SSSE3@
/* Define if C variable-length array (VLA) support exists */
#cmakedefine HAVE_C_VARARRAYS @HAVE_C_VARARRAYS@
/* Prefetch loop lead distance */
#cmakedefine PREFETCH_AHEAD @PREFETCH_AHEAD@
/* Use SSE prefetch */
#cmakedefine USE_PREFETCH @USE_PREFETCH@
///* Define to `__inline__' or `__inline' if that's what the C compiler
// calls it, or to nothing if 'inline' is not supported under any name. */
//#ifndef __cplusplus
//#undef inline
//#endif
#endif

View File

@ -0,0 +1,454 @@
#include "cuda_walker.h"
#include "determinant_update.h"
#include <unistd.h>
#include <vector>
// Default constructor: an empty determinant with dimension 0 and every buffer
// pointer null. Nothing is allocated until resize() is called.
cuda_determinant::cuda_determinant()
  : N(0),
    A(NULL),
    Ainv(NULL),
    Ainv_delta(NULL),
    Ainv_colk(0),
    new_row(NULL),
    delta(0)
{
}
// Sizing constructor: allocates the buffers for an n x n determinant.
//   n : matrix dimension, forwarded to resize().
// The argument must be the parameter `n`; the member `N` has not been
// initialized at this point, so passing it would size the buffers from an
// indeterminate value.
cuda_determinant::cuda_determinant(int n)
{
  resize(n);
}
// Sets the matrix dimension to n and allocates the buffers with cudaMalloc:
// two N*N float matrices (A, Ainv) and four length-N float vectors
// (Ainv_delta, Ainv_colk, new_row, delta).
// NOTE(review): previously held buffers are not freed here and the cudaMalloc
// return codes are not inspected.
void
cuda_determinant::resize(int n)
{
  N = n;
  const size_t matrixBytes = N*N*sizeof(float);
  const size_t vectorBytes = 1*N*sizeof(float);

  // Full matrices.
  cudaMalloc((void**)&A,    matrixBytes);
  cudaMalloc((void**)&Ainv, matrixBytes);

  // Length-N work vectors.
  cudaMalloc((void**)&Ainv_delta, vectorBytes);
  cudaMalloc((void**)&Ainv_colk,  vectorBytes);
  cudaMalloc((void**)&new_row,    vectorBytes);
  cudaMalloc((void**)&delta,      vectorBytes);
}
// Records the two electron counts (index 0 = nup, index 1 = ndown) and sizes
// the matching determinant for each.
void
cuda_walker::resize(int nup, int ndown)
{
  const int counts[2] = { nup, ndown };
  for (int spin = 0; spin < 2; ++spin) {
    N[spin] = counts[spin];
    dets[spin].resize(N[spin]);
  }
}
// Builds a population able to hold up to MaxPop (1000) walkers: sizes the
// host-side staging vectors and allocates the matching buffers with cudaMalloc.
// NOTE(review): the cudaMalloc return codes are not inspected.
cuda_population::cuda_population() : MaxPop(1000)
{
  // Host staging: one pointer (or ratio) per walker.
  A_vec.resize(MaxPop);
  Ainv_vec.resize(MaxPop);
  delta_vec.resize(MaxPop);
  Ainv_delta_vec.resize(MaxPop);
  Ainv_colk_vec.resize(MaxPop);
  ratio_vec.resize(MaxPop);
  // Positions are staged with a stride of 4 floats per walker: calc_new_row()
  // writes pos_vec[4*wi+0..2] and copies 4*walkers.size() floats into pos_d,
  // which is itself allocated as 4*MaxPop floats below. Sizing this vector as
  // 3*MaxPop would let both the writes and the copy run past its end.
  pos_vec.resize(4*MaxPop);

  // Device-side lists of per-walker pointers, plus ratio and position buffers.
  cudaMalloc((void**) &A_list_d,          MaxPop*sizeof(float*));
  cudaMalloc((void**) &Ainv_list_d,       MaxPop*sizeof(float*));
  cudaMalloc((void**) &Ainv_delta_list_d, MaxPop*sizeof(float*));
  cudaMalloc((void**) &Ainv_colk_list_d,  MaxPop*sizeof(float*));
  cudaMalloc((void**) &delta_list_d,      MaxPop*sizeof(float*));
  cudaMalloc((void**) &ratios_d,          MaxPop*sizeof(float));
  cudaMalloc((void**) &pos_d,             4*MaxPop*sizeof(float));
}
// Forward declarations of the two inverse-update kernels launched back to back
// by cuda_population::update_determinants(). Their definitions are not in this
// part of the file (presumably they come from determinant_update.h or appear
// later -- TODO confirm). Each takes lists of per-matrix pointers, the matrix
// size N, the row stride and the index k of the row being replaced.
__global__ static void
update_inverse_cuda1 (float *A_g[], float *Ainv_g[], float *u_g[], float *Ainv_delta_g[],
float *Ainv_colk_g[], int N, int rowstride, int k);
__global__ static void
update_inverse_cuda2 (float *Ainv_g[], float *u_g[], float *Ainv_delta_g[],
float *Ainv_colk_g[], int N, int rowstride, int k);
// Stages the position of electron `elec` for every walker, uploads the
// positions and the per-walker `delta` pointers, and launches the multi-spline
// evaluation kernel over all walkers.
//   elec : global electron index; indices below num_elecs[0] select
//          determinant 0, all others determinant 1.
// NOTE(review): the grid's x extent is N/SPLINE_BLOCK_SIZE with integer
// division, so it is 0 when N < SPLINE_BLOCK_SIZE and drops a remainder
// otherwise -- presumably N is always a multiple of the block size; confirm.
void
cuda_population::calc_new_row(int elec)
{
  const int detnum = (elec < num_elecs[0]) ? 0 : 1;
  const int N = num_elecs[detnum];
  const size_t numWalkers = walkers.size();

  for (size_t wi = 0; wi < numWalkers; ++wi) {
    cuda_walker &w = walkers[wi];
    // Positions are packed four floats per walker; only the first three
    // slots are written here.
    for (int dim = 0; dim < 3; ++dim)
      pos_vec[4*wi+dim] = w.R[3*elec+dim];
    delta_vec[wi] = w.dets[detnum].delta;
  }

  cudaMemcpy(pos_d, &(pos_vec[0]), 4*numWalkers*sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(delta_list_d, &(delta_vec[0]), numWalkers*sizeof(float*),
             cudaMemcpyHostToDevice);

  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid (N/SPLINE_BLOCK_SIZE, numWalkers);
  eval_multi_multi_UBspline_3d_s_cuda<<<dimGrid,dimBlock>>>
    (pos_d, multi_spline->gridInv, multi_spline->coefs,
     delta_list_d, multi_spline->stride);
}
// Gathers the buffer pointers of every walker whose move was accepted, uploads
// those pointer lists, and launches the two inverse-update kernels on them.
//   elec : global electron index; indices below num_elecs[0] belong to
//          determinant 0 (row = elec), the rest to determinant 1
//          (row = elec - num_elecs[0]).
// NOTE(review): the grid's x extent is N/DET_BLOCK_SIZE with integer division;
// presumably N is always a multiple of DET_BLOCK_SIZE -- confirm.
void
cuda_population::update_determinants(int elec)
{
  const bool firstSpecies = (elec < num_elecs[0]);
  const int detnum = firstSpecies ? 0 : 1;
  const int N = num_elecs[detnum];
  const int row = firstSpecies ? elec : elec - num_elecs[0];

  // Compact the accepted walkers to the front of the staging vectors.
  int num_accept = 0;
  for (int wi=0; wi<walkers.size(); wi++) {
    cuda_walker &w = walkers[wi];
    if (!w.accept)
      continue;
    cuda_determinant &det = w.dets[detnum];
    A_vec[num_accept]          = det.A;
    Ainv_vec[num_accept]       = det.Ainv;
    Ainv_delta_vec[num_accept] = det.Ainv_delta;
    Ainv_colk_vec[num_accept]  = det.Ainv_colk;
    delta_vec[num_accept]      = det.delta;
    ++num_accept;
  }

  const size_t listBytes = num_accept*sizeof(float*);
  cudaMemcpy (A_list_d,          &(A_vec[0]),          listBytes, cudaMemcpyHostToDevice);
  cudaMemcpy (Ainv_list_d,       &(Ainv_vec[0]),       listBytes, cudaMemcpyHostToDevice);
  cudaMemcpy (Ainv_delta_list_d, &(Ainv_delta_vec[0]), listBytes, cudaMemcpyHostToDevice);
  cudaMemcpy (Ainv_colk_list_d,  &(Ainv_colk_vec[0]),  listBytes, cudaMemcpyHostToDevice);
  cudaMemcpy (delta_list_d,      &(delta_vec[0]),      listBytes, cudaMemcpyHostToDevice);

  dim3 dimBlock(DET_BLOCK_SIZE);
  dim3 dimGrid (N/DET_BLOCK_SIZE, num_accept);
  update_inverse_cuda1<<<dimGrid,dimBlock>>>
    (A_list_d, Ainv_list_d, delta_list_d, Ainv_delta_list_d,
     Ainv_colk_list_d, N, N, row);
  update_inverse_cuda2<<<dimGrid,dimBlock>>>
    (Ainv_list_d, delta_list_d, Ainv_delta_list_d,
     Ainv_colk_list_d, N, N, row);
}
#define RATIO_BLOCK_SIZE 128
// Transposing gather: each thread handles one matrix (index `mat`) and copies,
// for every row, that matrix's new-row element and the element of column k of
// its inverse into the two output arrays at [row_stride*row + mat].
//   Ainv_list, new_row_list : per-matrix pointers.
//   Ainv_tran, new_row_tran : outputs, written with row stride `row_stride`.
//   N        : number of rows copied per matrix.
//   k        : column of Ainv that is extracted.
//   num_mats : number of matrices; threads beyond it do nothing.
// The thread-to-matrix mapping uses RATIO_BLOCK_SIZE rather than blockDim.x,
// so it presumably expects a launch with RATIO_BLOCK_SIZE threads per block.
__global__ void
calc_ratios1 (float *Ainv_list[], float *new_row_list[],
              float *Ainv_tran, float *new_row_tran,
              int N, int k, int row_stride, int num_mats)
{
  const int mat = threadIdx.x + blockIdx.x*RATIO_BLOCK_SIZE;
  if (col_out_of_range:
      mat >= num_mats)
    return;

  float *Ainv    = Ainv_list[mat];
  float *new_row = new_row_list[mat];
  for (int row=0; row<N; row++) {
    new_row_tran[row_stride*row + mat] = new_row[row];
    Ainv_tran[row_stride*row + mat]    = Ainv[row*N + k];
  }
}
// One block per matrix: computes
//   ratio[blockIdx.x] = sum over i < min(N, RATIO_BLOCK_SIZE) of
//                       Ainv[i*row_stride + elec] * new_row[i]
// with a shared-memory tree reduction.
//   Ainv_list, new_row_list : per-matrix pointers, indexed by blockIdx.x.
//   ratio      : one output element per block.
//   N          : number of rows contributing to the sum.
//   row_stride : element distance between consecutive rows of Ainv.
//   elec       : column of Ainv that is read.
// Expects to be launched with RATIO_BLOCK_SIZE threads per block (the
// reduction consumes exactly RATIO_BLOCK_SIZE shared slots).
//
// Every thread reaches every __syncthreads(): threads with tid >= N contribute
// a zero instead of skipping the body. Placing the barriers inside an
// `if (col < N)` branch makes them divergent when N is smaller than the block,
// and leaves the upper shared slots unwritten while the reduction reads them.
__global__ void
calc_ratios (float *Ainv_list[], float *new_row_list[],
             float *ratio, int N, int row_stride, int elec)
{
  const int tid = threadIdx.x;
  __shared__ float *Ainv, *new_row;
  __shared__ float Ainv_new_row[RATIO_BLOCK_SIZE];

  // Thread 0 fetches this block's matrix pointers for everyone.
  if (tid == 0) {
    Ainv    = Ainv_list[blockIdx.x];
    new_row = new_row_list[blockIdx.x];
  }
  __syncthreads();

  // This is *highly* uncoalesced, but we just have to eat it to allow
  // other kernels to operate quickly.
  float product = 0.0f;
  if (tid < N)
    product = Ainv[tid*row_stride + elec] * new_row[tid];
  if (tid < RATIO_BLOCK_SIZE)
    Ainv_new_row[tid] = product;
  __syncthreads();

  // Tree reduction of the elementwise products.
  for (unsigned int s=RATIO_BLOCK_SIZE/2; s>0; s>>=1) {
    if (tid < s)
      Ainv_new_row[tid] += Ainv_new_row[tid + s];
    __syncthreads();
  }
  if (tid == 0) ratio[blockIdx.x] = Ainv_new_row[0];
}
// One block per matrix: computes
//   ratio[blockIdx.x] = sum over i < N of Ainv[i*row_stride + elec] * new_row[i]
// by staging new_row through shared memory one RATIO_BLOCK_SIZE tile at a time
// and letting thread 0 accumulate the products serially.
//   Ainv_list, new_row_list : per-matrix pointers, indexed by blockIdx.x.
//   ratio      : one output element per block.
//   N          : number of rows; need not be a multiple of RATIO_BLOCK_SIZE.
//   row_stride : element distance between consecutive rows of Ainv.
//   elec       : column of Ainv that is read.
// Expects to be launched with RATIO_BLOCK_SIZE threads per block so that each
// tile is fully populated.
__global__ void
calc_ratios2 (float *Ainv_list[], float *new_row_list[],
              float *ratio, int N, int row_stride, int elec)
{
  const int tid = threadIdx.x;
  __shared__ float *Ainv, *new_row;
  __shared__ float new_row_shared[RATIO_BLOCK_SIZE];

  // Thread 0 fetches this block's matrix pointers for everyone.
  if (tid == 0) {
    Ainv    = Ainv_list[blockIdx.x];
    new_row = new_row_list[blockIdx.x];
  }
  __syncthreads();

  // Round up so a trailing partial tile is included.
  const int numBlocks = (N + RATIO_BLOCK_SIZE - 1)/RATIO_BLOCK_SIZE;
  float sum=0.0;
  for (int block=0; block<numBlocks; block++) {
    const int base = block*RATIO_BLOCK_SIZE;
    const int row  = base + tid;
    if (tid < RATIO_BLOCK_SIZE)
      new_row_shared[tid] = (row < N) ? new_row[row] : 0.0f;
    __syncthreads();
    if (tid==0) {
      // The Ainv row must advance with i: indexing it by this thread's own
      // row would multiply every tile entry by the same Ainv element.
      for (int i=0; i<RATIO_BLOCK_SIZE && base+i<N; i++)
        sum += Ainv[(base+i)*row_stride + elec] * new_row_shared[i];
    }
    // Do not let other threads overwrite the tile while thread 0 reads it.
    __syncthreads();
  }
  if (tid==0)
    ratio[blockIdx.x] = sum;
}
// Declaration of the Fortran LAPACK routine DGETRF (LU factorization with
// partial pivoting), called by Determinant() in this file. All scalar arguments are
// passed by pointer; A is overwritten with the factors, ipiv receives the
// pivot indices and info the status code.
extern "C" void
dgetrf_(int *m, int *n, double A[], int *lda, int ipiv[], int *info);
// Returns the determinant of the N x N matrix A, computed from an LU
// factorization (LAPACK dgetrf_) as the product of the diagonal of the factor,
// negated once for every row interchange recorded in ipiv.
//   A : N*N doubles; not modified (the factorization works on a copy).
//   N : matrix dimension. For N <= 0 the empty product, 1.0, is returned.
// The work arrays live on the heap: N*N doubles as a stack array is a
// non-standard variable-length array in C++ and can exhaust the stack for
// large N.
// NOTE(review): the `info` status returned by dgetrf_ is not inspected.
double
Determinant (double *A, int N)
{
  if (N <= 0)
    return 1.0;

  const size_t numElems = static_cast<size_t>(N) * static_cast<size_t>(N);
  std::vector<double> LU(A, A + numElems);
  std::vector<int> ipiv(N);
  int info;

  // Do LU factorization
  dgetrf_ (&N, &N, &LU[0], &N, &ipiv[0], &info);

  double det = 1.0;
  int numPerm = 0;
  for (int i=0; i<N; i++) {
    det *= LU[i*N+i];
    // ipiv is 1-based: entry i differs from i+1 exactly when row i was swapped.
    numPerm += (ipiv[i] != (i+1));
  }
  if (numPerm & 1)
    det *= -1.0;
  return det;
}
// Replaces the n x n row-major matrix A with its inverse using Gauss-Jordan
// elimination with full pivoting.
//
// n == 2 is handled by the closed-form formula (no singularity check is
// made on that path).  For other sizes the bookkeeping arrays are fixed at
// maxSize entries, so n > maxSize is rejected instead of overrunning the
// stack.  A singular matrix terminates the program with an error message.
template<typename T> void
GJInverse (T *A, int n)
{
  const int maxSize = 2000;
  if (n == 2) { // Special case for 2x2
    T a=A[0]; T b=A[1];
    T c=A[2]; T d=A[3];
    T detInv = 1.0/(a*d-b*c);
    A[0] = d*detInv;
    A[1] = -b*detInv;
    A[2] = -c*detInv;
    A[3] = a*detInv;
    return;
  }
  if (n > maxSize) {
    fprintf (stderr, "GJInverse: matrix size %d exceeds maximum %d\n",
             n, maxSize);
    exit(1);
  }

  int colIndex[maxSize], rowIndex[maxSize], ipiv[maxSize];
  T big, pivInv;
  // Initialised so that a search that finds no candidate (e.g. all-NaN
  // input) never reads indeterminate indices.
  int icol = 0, irow = 0;

  // ipiv[c] == -1: column c has not been used as a pivot yet;
  // ipiv[c] ==  0: column c has been reduced.
  for (int j=0; j<n; j++)
    ipiv[j] = -1;

  for (int i=0; i<n; i++) {
    // Search the not-yet-reduced rows/columns for the largest element.
    big = 0.0;
    for (int j=0; j<n; j++)
      if (ipiv[j] != 0)
        for (int k=0; k<n; k++) {
          if (ipiv[k] == -1) {
            if (fabs(A[n*j+k]) >= big) {
              big = fabs(A[n*j+k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 0) {
            fprintf (stderr, "GJInverse: Singular matrix!\n");
            exit(1);
          }
        }
    ++(ipiv[icol]);

    // Move the pivot onto the diagonal by swapping rows.
    if (irow != icol)
      for (int l=0; l<n; l++) {
        T tmp = A[n*irow+l];
        A[n*irow+l] = A[n*icol+l];
        A[n*icol+l] = tmp;
      }
    rowIndex[i] = irow;
    colIndex[i] = icol;

    if (A[n*icol+icol] == 0.0) {
      fprintf (stderr, "GJInverse: Singular matrix!\n");
      exit(1);
    }

    // Normalise the pivot row ...
    pivInv = 1.0/A[n*icol+icol];
    A[n*icol+icol] = 1.0;
    for (int l=0; l<n; l++)
      A[n*icol+l] *= pivInv;

    // ... and eliminate the pivot column from every other row.
    for (int ll=0; ll<n; ll++)
      if (ll != icol) {
        T dum = A[n*ll+icol];
        A[n*ll+icol] = 0.0;
        for (int l=0; l<n; l++)
          A[n*ll+l] -= A[n*icol+l]*dum;
      }
  }

  // Now unscramble the permutations: undo the row swaps, in reverse order,
  // as column swaps of the result.
  for (int l=n-1; l>=0; l--) {
    if (rowIndex[l] != colIndex[l])
      for (int k=0; k<n ; k++) {
        T tmp = A[n*k+rowIndex[l]];
        A[n*k+rowIndex[l]] = A[n*k+colIndex[l]];
        A[n*k+colIndex[l]] = tmp;
      }
  }
}
// Benchmarks calc_ratios and compares its result with a host reference.
//
// The same random N x N matrix and replacement row u are uploaded for every
// walker.  The host reference is det(A with row `elec` replaced by u) /
// det(A), computed with Determinant(); the device value is read back for
// one arbitrary walker.  One "generation" is N kernel launches.
void
test_ratio()
{
  //const int N = RATIO_BLOCK_SIZE;
  const int N = 128;
  const int numWalkers = 1024;
  const int elec = 15;
  const int numGenerations = 10;
  const int probeWalker = 331;   // walker whose ratio is printed

  // Host-side tables holding the per-walker device pointers.
  float **AinvList = (float**) malloc(numWalkers*sizeof(float*));
  float **uList    = (float**) malloc(numWalkers*sizeof(float*));
  for (int i=0; i<numWalkers; i++) {
    cudaMalloc((void**)&(AinvList[i]), N*N*sizeof(float));
    cudaMalloc((void**)&(uList[i]),    N*sizeof(float));
  }
  fprintf (stderr, "N = %d\n", N);

  // Device copies of the pointer tables, plus the result array.
  float **AinvList_d, **uList_d, *ratio_d;
  cudaMalloc((void**)&(AinvList_d), numWalkers*sizeof(float*));
  cudaMalloc((void**)&(uList_d),    numWalkers*sizeof(float*));
  cudaMalloc((void**)&ratio_d,      numWalkers*sizeof(float));
  cudaMemcpy (AinvList_d, AinvList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy (   uList_d,    uList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);

  dim3 dimBlock(RATIO_BLOCK_SIZE);
  dim3 dimGrid(numWalkers);

  // Random test matrix (kept in double for the reference determinant) and
  // its single-precision inverse.
  double *A   = (double*)malloc(N*N*sizeof(double));
  float *Ainv = (float*) malloc(N*N*sizeof(float));
  float *u    = (float*) malloc(N*sizeof(float));
  for (int i=0; i<N; i++) {
    u[i] = drand48();
    for (int j=0; j<N; j++)
      A[i*N+j] = Ainv[i*N+j] = (float)drand48();
  }
  GJInverse(Ainv, N);

  // Host reference ratio.
  double det1 = Determinant (A, N);
  for (int i=0; i<N; i++)
    A[elec*N+i] = u[i];
  double det2 = Determinant (A, N);
  fprintf (stderr, "Host ratio = %1.8f\n", det2/det1);

  for (int wi=0; wi<numWalkers; wi++) {
    cudaMemcpy (AinvList[wi], Ainv, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy (uList[wi], u, 1*N*sizeof(float), cudaMemcpyHostToDevice);
  }

  clock_t start = clock();
  for (int i=0; i<numGenerations*N; i++)
    calc_ratios<<<dimGrid,dimBlock>>> (AinvList_d, uList_d, ratio_d, N, N, elec);
  // Kernel launches are asynchronous; wait for them so that the interval
  // covers the execution and not just the launch overhead.
  cudaDeviceSynchronize();
  clock_t end = clock();

  float ratio;
  cudaMemcpy (&ratio, &(ratio_d[probeWalker]), sizeof(float), cudaMemcpyDeviceToHost);
  fprintf (stderr, "Device ratio = %1.8f\n", ratio);

  double time = (double)(end-start)/(double)CLOCKS_PER_SEC;
  double rate = (double)numGenerations/time;
  fprintf (stderr, "Rate = %1.3f generations per second.\n", rate);

  // Release everything allocated above.
  for (int i=0; i<numWalkers; i++) {
    cudaFree (AinvList[i]);
    cudaFree (uList[i]);
  }
  cudaFree (AinvList_d);
  cudaFree (uList_d);
  cudaFree (ratio_d);
  free (AinvList);
  free (uList);
  free (A);
  free (Ainv);
  free (u);
}
// Benchmarks calc_ratios1.  Only throughput is measured: the per-walker
// matrices are allocated but never filled, and no result is checked.
void
test_ratio1()
{
  const int N = 128;
  const int numWalkers = 1024;
  const int numGenerations = 10;

  float **AinvList = (float**) malloc(numWalkers*sizeof(float*));
  float **uList    = (float**) malloc(numWalkers*sizeof(float*));

  // Transposed scratch buffers with N*numWalkers floats each.  The sizes
  // are in bytes, so sizeof(float) is required (it was missing, which
  // under-allocated both buffers by a factor of four).
  float *Ainv_tran, *ratio_tran;
  cudaMalloc ((void**) &Ainv_tran,  N*numWalkers*sizeof(float));
  cudaMalloc ((void**) &ratio_tran, N*numWalkers*sizeof(float));

  for (int i=0; i<numWalkers; i++) {
    cudaMalloc((void**)&(AinvList[i]), N*N*sizeof(float));
    cudaMalloc((void**)&(uList[i]),    N*sizeof(float));
  }
  fprintf (stderr, "N = %d\n", N);

  float **AinvList_d, **uList_d, *ratio_d;
  cudaMalloc((void**)&(AinvList_d), numWalkers*sizeof(float*));
  cudaMalloc((void**)&(uList_d),    numWalkers*sizeof(float*));
  cudaMalloc((void**)&ratio_d,      numWalkers*sizeof(float));
  cudaMemcpy (AinvList_d, AinvList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy (   uList_d,    uList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);

  dim3 dimBlock(RATIO_BLOCK_SIZE);
  dim3 dimGrid(numWalkers/RATIO_BLOCK_SIZE);

  clock_t start = clock();
  for (int i=0; i<numGenerations*N; i++)
    calc_ratios1<<<dimGrid,dimBlock>>> (AinvList_d, uList_d, Ainv_tran, ratio_tran,
                                        N, 1, numWalkers, numWalkers);
  // Launches are asynchronous; wait so the timing covers the execution.
  cudaDeviceSynchronize();
  clock_t end = clock();

  double time = (double)(end-start)/(double)CLOCKS_PER_SEC;
  double rate = (double)numGenerations/time;
  fprintf (stderr, "Rate = %1.3f generations per second.\n", rate);

  // Release everything allocated above.
  for (int i=0; i<numWalkers; i++) {
    cudaFree (AinvList[i]);
    cudaFree (uList[i]);
  }
  cudaFree (AinvList_d);
  cudaFree (uList_d);
  cudaFree (ratio_d);
  cudaFree (Ainv_tran);
  cudaFree (ratio_tran);
  free (AinvList);
  free (uList);
}
// Entry point of the ratio test driver.  C++ has no implicit int, so the
// return type is spelled out.
int
main()
{
  test_ratio();
  return 0;
}

View File

@ -0,0 +1,55 @@
#ifndef CUDA_WALKER_H
#define CUDA_WALKER_H
#include <vector>
#include "multi_bspline_cuda_s.h"
// Bundle of raw float buffers belonging to one determinant.
// NOTE(review): the pointers are presumably device allocations sized from
// N by resize() — confirm against the implementation, which is not in this
// header.  No destructor is declared here, so nothing in this class frees
// the buffers.
class cuda_determinant
{
public:
// Matrix dimension (assumed square, N x N — TODO confirm).
int N;
// The matrix, a second buffer named as its transpose, and its inverse.
float *A, *Atran, *Ainv;
// Work buffers; judging by the names they hold Ainv applied to the row
// change and one column of Ainv — verify against the update kernels.
float *Ainv_delta, *Ainv_colk;
// Proposed new row and (presumably) its difference from the current row.
float *new_row, *delta;
// (Re)configures the object for dimension N; defined elsewhere.
void resize(int N);
cuda_determinant(int N);
cuda_determinant();
};
// One walker: a position buffer plus two determinants.
// NOTE(review): the two-element arrays presumably correspond to the up and
// down electron species (resize takes nup and ndown) — confirm.
class cuda_walker
{
public:
// Sizes of the two determinants.
int N[2];
// Position buffer; layout and memory space are not visible here.
float *R;
cuda_determinant dets[2];
// Flag named for the acceptance of a proposed move; its use is not
// visible in this header.
bool accept;
// Defined elsewhere; takes the two per-species counts.
void resize(int nup, int ndown);
};
class cuda_population
{
private:
const int MaxPop;
float **A_list_d, **Ainv_list_d, **delta_list_d;
float **Ainv_delta_list_d, **Ainv_colk_list_d;
float *ratios_d;
float *pos_d;
std::vector<float*> A_vec, Ainv_vec, delta_vec,
Ainv_delta_vec, Ainv_colk_vec;
std::vector<float> ratio_vec, pos_vec;
vector<cuda_walker> walkers;
// Number of up and down electrons
int num_elecs[2];
multi_UBspline_3d_s_cuda *multi_spline;
public:
void calc_new_row (int elec);
void calc_ratios (int elec);
void update_determinants(int elec);
cuda_population();
};
#endif

View File

@ -0,0 +1,537 @@
#define BLOCK_SIZE 64
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
// First half of a batched rank-1 inverse update.  For the matrix selected
// by blockIdx.y it computes, for the BLOCK_SIZE columns owned by this
// thread block,
//   AinvT_u[col] = sum_row Ainv[row*rowstride + col] * u[row]
// (i.e. Ainv^T * u), and the thread block whose column range contains k
// additionally copies the kth *column* of Ainv to Ainv_colk.
//
// Launch with blockDim.x == BLOCK_SIZE and grid (N/BLOCK_SIZE, numMats).
// Only the first (N/BLOCK_SIZE)*BLOCK_SIZE rows are processed, so N is
// expected to be a multiple of BLOCK_SIZE.
__global__ static void
update_inverse_cuda1 (float *Ainv_g[], float *u_g[], float *AinvT_u_g[],
                      float *Ainv_colk_g[], int N, int rowstride, int k)
{
  __shared__ float *Ainv, *u, *AinvT_u, *Ainv_colk;
  if (threadIdx.x==0) {
    Ainv      = Ainv_g[blockIdx.y];
    u         = u_g[blockIdx.y];
    AinvT_u   = AinvT_u_g[blockIdx.y];
    Ainv_colk = Ainv_colk_g[blockIdx.y];
  }
  __syncthreads();

  __shared__ float AinvT_u_shared[BLOCK_SIZE], Ainv_colk_shared[BLOCK_SIZE];
  __shared__ float u_shared[BLOCK_SIZE];
  AinvT_u_shared[threadIdx.x] = 0.0;

  int col = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  int numblocks = N / BLOCK_SIZE;

  // True for the one thread block (per matrix) whose columns include k.
  // The value is uniform across the block, so it is safe to branch on it
  // around global writes.
  const bool owns_k =
    (blockIdx.x*BLOCK_SIZE <= k && k < (blockIdx.x+1)*BLOCK_SIZE);

  for (int block=0; block<numblocks; block++) {
    u_shared[threadIdx.x] = u[block*BLOCK_SIZE+threadIdx.x];
    __syncthreads();
    for (int i=0; i<BLOCK_SIZE; i++) {
      int row = block*BLOCK_SIZE + i;
      float ainv = Ainv[row*rowstride+col];
      // Only possible when owns_k is true.
      if (col == k)
        Ainv_colk_shared[i] = ainv;
      AinvT_u_shared[threadIdx.x] += ainv*u_shared[i];
    }
    // Required in every thread block: the next iteration overwrites
    // u_shared, which slower threads may still be reading.  (The original
    // had this barrier only in the branch that owns column k.)
    __syncthreads();
    if (owns_k)
      Ainv_colk[block*BLOCK_SIZE+threadIdx.x] = Ainv_colk_shared[threadIdx.x];
  }

  // Write the data back to global memory
  AinvT_u[col] = AinvT_u_shared[threadIdx.x];
}
// Second half of the batched rank-1 inverse update.  Using the vectors
// produced by update_inverse_cuda1 it applies
//   Ainv[row][col] -= Ainv_colk[row] * AinvT_u[col] / (1 + AinvT_u[k])
// to the BLOCK_SIZE columns owned by this thread block.
//
// Launch configuration as for update_inverse_cuda1.  u_g is unused and is
// kept only so both kernels share one argument list.
__global__ static void
update_inverse_cuda2 (float *Ainv_g[], float *u_g[], float *AinvT_u_g[],
                      float *Ainv_colk_g[], int N, int rowstride, int k)
{
  __shared__ float *Ainv, *AinvT_u, *Ainv_colk;
  if (threadIdx.x==0) {
    Ainv      = Ainv_g[blockIdx.y];
    AinvT_u   = AinvT_u_g[blockIdx.y];
    Ainv_colk = Ainv_colk_g[blockIdx.y];
  }
  __syncthreads();

  __shared__ float Ainv_colk_shared[BLOCK_SIZE];
  __shared__ float prefact;

  int col = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  // Each thread only ever needs its own column's entry.
  const float AinvT_u_col = AinvT_u[col];

  if (threadIdx.x == 0)
    prefact = -1.0f/(1.0f+AinvT_u[k]);
  __syncthreads();

  int numblocks = N / BLOCK_SIZE;
  for (int block=0; block<numblocks; block++) {
    Ainv_colk_shared[threadIdx.x] = prefact*Ainv_colk[block*BLOCK_SIZE+threadIdx.x];
    __syncthreads();
    for (int i=0; i<BLOCK_SIZE; i++) {
      int row = block*BLOCK_SIZE + i;
      Ainv[row*rowstride+col] += AinvT_u_col*Ainv_colk_shared[i];
    }
    // Do not refill Ainv_colk_shared until every thread has finished
    // reading the current chunk (this barrier was missing).
    __syncthreads();
  }
}
#define NMAX 128
// Single-kernel rank-1 update of one inverse matrix:
//   Ainv[row][c] += prefact * (sum_j Ainv[row][j]*u[j]) * Ainv[k][c]
// with prefact = -1 / (1 + sum_j Ainv[k][j]*u[j]).
//
// Must be launched with blockDim.x == N, N <= NMAX and N a power of two:
// each thread owns one column, and the pairwise reduction halves N until
// the total sits in Ainv_u[0].
__global__ static void
update_inverse_cuda (float *Ainv, float *u, int N, int rowstride, int k)
{
  __shared__ float A_k[NMAX], u_shared[NMAX], Ainv_u[NMAX], Ainv_shared[NMAX];

  // Keep row k of the old inverse; it is needed after the row itself has
  // been overwritten in global memory.
  A_k[threadIdx.x] = Ainv[k*rowstride+threadIdx.x];
  u_shared[threadIdx.x] = u[threadIdx.x];

  // First, compute k'th element of Ainv_u
  Ainv_u[threadIdx.x] = u_shared[threadIdx.x] * A_k[threadIdx.x];
  __syncthreads();
  for (int n=N>>1; n>0; n = n>>1) {
    // Threads outside the active range store a defined value into slots
    // that are no longer read (the original stored an uninitialised one).
    float a = 0.0f;
    if (threadIdx.x < n)
      a = Ainv_u[2*threadIdx.x] + Ainv_u[2*threadIdx.x+1];
    __syncthreads();
    Ainv_u[threadIdx.x] = a;
    __syncthreads();
  }
  float prefact = -1.0f/(1.0f + Ainv_u[0]);

  for (int row=0; row<N; row++) {
    Ainv_shared[threadIdx.x] = Ainv[row*rowstride+threadIdx.x];
    // All threads must have consumed the previous Ainv_u[0] before it is
    // overwritten below.
    __syncthreads();
    Ainv_u[threadIdx.x] = u_shared[threadIdx.x] * Ainv_shared[threadIdx.x];
    // The reduction reads entries written by other threads, so the
    // products must be visible first (this barrier was missing).
    __syncthreads();
    for (int n=N>>1; n>0; n = n>>1) {
      float a = 0.0f;
      if (threadIdx.x < n)
        a = Ainv_u[2*threadIdx.x] + Ainv_u[2*threadIdx.x+1];
      __syncthreads();
      Ainv_u[threadIdx.x] = a;
      __syncthreads();
    }
    // Now Ainv_u[0] has the row'th element of Ainv_u.
    Ainv[row*rowstride + threadIdx.x] =
      Ainv_shared[threadIdx.x] + prefact*Ainv_u[0]*A_k[threadIdx.x];
  }
}
// __global__ static void
// update_inverse_cuda (float *AinvT, float *u, int N, int rowstride, int k)
// {
// // Store the product Ainv * u in shared memory
// __shared__ float Ainv_u[BLOCK_SIZE], Ainv_u_k[BLOCK_SIZE];
// Ainv_u[threadIdx.x] = 0.0;
// __syncthreads();
// for (int row=0; row < N; row++)
// Ainv_u[threadIdx.x] += AinvT[row*rowstride+threadIdx.x]*u[row];
// // Compute lambda = [A^(-1)]_k dot u
// float lambda = 0.0;
// for (int i=0; i<N; i += BLOCK_SIZE) {
// Ainv_u_k[threadIdx.x] = AinvT[i+threadIdx.x] * u[i+threadIdx.x];
// __syncthreads();
// for (int j=BLOCK_SIZE>>1; j!=0; j >>=1) {
// if (threadIdx.x < j)
// Ainv_u_k[threadIdx.x] = Ainv_u_k[2*threadIdx.x] + Ainv_u_k[2*threadIdx.x+1];
// lambda += Ainv_u_k[0];
// }
// float prefact = 1.0/(1.0+lambda);
// }
// // Now, subtract off outer product
// }
// Host reference implementation of the rank-1 inverse update, operating in
// place on the N x N matrix AinvT:
//   AinvT[j][i] -= AinvT[j][k] * (sum_m AinvT[m][i]*u[m]) / (1 + Ainv_u[k])
//
// The work vectors are heap-allocated so that any N is supported; the
// original used fixed 128-entry stack arrays and overran them for N > 128.
// Does nothing for N <= 0.
void
update_inverse (float *AinvT, float *u, int N, int k)
{
  if (N <= 0)
    return;

  float *Ainv_u    = (float*) malloc ((size_t)N*sizeof(float));
  float *Ainv_rowk = (float*) malloc ((size_t)N*sizeof(float));
  if (Ainv_u == NULL || Ainv_rowk == NULL) {
    fprintf (stderr, "update_inverse: out of memory for N = %d\n", N);
    exit(1);
  }

  // Gather both vectors from the *old* matrix before anything is modified.
  for (int i=0; i<N; i++) {
    Ainv_u[i] = 0.0f;
    Ainv_rowk[i] = AinvT[N*i+k];
    for (int j=0; j<N; j++)
      Ainv_u[i] += AinvT[j*N+i] * u[j];
  }

  float prefact = 1.0/(1.0+Ainv_u[k]);
  for (int i=0; i<N; i++)
    for (int j=0; j<N; j++)
      AinvT[j*N+i] -= prefact * Ainv_u[i]*Ainv_rowk[j];

  free (Ainv_rowk);
  free (Ainv_u);
}
// Replaces A with its inverse by gauss-jordan elimination with full pivoting
// Adapted from Numerical Recipes in C
//
// A is an n x n row-major matrix.  n == 2 uses the closed-form formula (no
// singularity check on that path).  The bookkeeping arrays hold maxSize
// entries, so n > maxSize is rejected instead of overrunning the stack.
// A singular matrix terminates the program with an error message.
void GJInverse (double *A, int n)
{
  const int maxSize = 2000;
  if (n == 2) { // Special case for 2x2
    double a=A[0]; double b=A[1];
    double c=A[2]; double d=A[3];
    double detInv = 1.0/(a*d-b*c);
    A[0] = d*detInv;
    A[1] = -b*detInv;
    A[2] = -c*detInv;
    A[3] = a*detInv;
    return;
  }
  if (n > maxSize) {
    fprintf (stderr, "GJInverse: matrix size %d exceeds maximum %d\n",
             n, maxSize);
    exit(1);
  }

  int colIndex[maxSize], rowIndex[maxSize], ipiv[maxSize];
  double big, pivInv;
  // Initialised so that a search that finds no candidate (e.g. all-NaN
  // input) never reads indeterminate indices.
  int icol = 0, irow = 0;

  // ipiv[c] == -1: column c has not been used as a pivot yet;
  // ipiv[c] ==  0: column c has been reduced.
  for (int j=0; j<n; j++)
    ipiv[j] = -1;

  for (int i=0; i<n; i++) {
    // Search the not-yet-reduced rows/columns for the largest element.
    big = 0.0;
    for (int j=0; j<n; j++)
      if (ipiv[j] != 0)
        for (int k=0; k<n; k++) {
          if (ipiv[k] == -1) {
            if (fabs(A[n*j+k]) >= big) {
              big = fabs(A[n*j+k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 0) {
            fprintf (stderr, "GJInverse: Singular matrix!\n");
            exit(1);
          }
        }
    ++(ipiv[icol]);

    // Move the pivot onto the diagonal by swapping rows.
    if (irow != icol)
      for (int l=0; l<n; l++) {
        double tmp = A[n*irow+l];
        A[n*irow+l] = A[n*icol+l];
        A[n*icol+l] = tmp;
      }
    rowIndex[i] = irow;
    colIndex[i] = icol;

    if (A[n*icol+icol] == 0.0) {
      fprintf (stderr, "GJInverse: Singular matrix!\n");
      exit(1);
    }

    // Normalise the pivot row ...
    pivInv = 1.0/A[n*icol+icol];
    A[n*icol+icol] = 1.0;
    for (int l=0; l<n; l++)
      A[n*icol+l] *= pivInv;

    // ... and eliminate the pivot column from every other row.
    for (int ll=0; ll<n; ll++)
      if (ll != icol) {
        double dum = A[n*ll+icol];
        A[n*ll+icol] = 0.0;
        for (int l=0; l<n; l++)
          A[n*ll+l] -= A[n*icol+l]*dum;
      }
  }

  // Now unscramble the permutations: undo the row swaps, in reverse order,
  // as column swaps of the result.
  for (int l=n-1; l>=0; l--) {
    if (rowIndex[l] != colIndex[l])
      for (int k=0; k<n ; k++) {
        double tmp = A[n*k+rowIndex[l]];
        A[n*k+rowIndex[l]] = A[n*k+colIndex[l]];
        A[n*k+colIndex[l]] = tmp;
      }
  }
}
#define MAT_SIZE 128
#define NUM_MATS 1000
// Test driver for the two-kernel batched inverse update.
//
// Builds one random N x N matrix A and its inverse on the host, uploads
// the inverse and a random vector u for NUM_MATS identical problems, runs
// update_inverse_cuda1/2 once for row `row`, and checks on the host that
// the updated inverse times (A with u added to that row) is the identity
// to within 1e-4.
int
main()
{
  const int N = MAT_SIZE;
  const int numMats = NUM_MATS;
  // The verification below applies the row change exactly once, so the
  // kernels must also run exactly once.
  const int numUpdates = 1;
  const int row = 1;

  double *A     = (double*)malloc (N*N*sizeof(double));
  double *Ainv  = (double*)malloc (N*N*sizeof(double));
  float *Ainv_h = (float*) malloc (N*N*sizeof(float));
  float *u_h    = (float*) malloc (N*sizeof(float));

  // Host tables of per-matrix device pointers ...
  float **AinvList      = (float**)malloc(numMats*sizeof(float*));
  float **Ainv_uList    = (float**)malloc(numMats*sizeof(float*));
  float **Ainv_colkList = (float**)malloc(numMats*sizeof(float*));
  float **uList         = (float**)malloc(numMats*sizeof(float*));

  // ... and their device copies, which the kernels index by blockIdx.y.
  float **AinvList_d, **Ainv_uList_d, **Ainv_colkList_d, **uList_d;
  cudaMalloc((void**)&AinvList_d,      numMats*sizeof(float*));
  cudaMalloc((void**)&Ainv_uList_d,    numMats*sizeof(float*));
  cudaMalloc((void**)&Ainv_colkList_d, numMats*sizeof(float*));
  cudaMalloc((void**)&uList_d,         numMats*sizeof(float*));

  fprintf (stderr, "N = %d\n", N);
  for (int mat=0; mat<numMats; mat++) {
    cudaMalloc((void**)&(AinvList[mat])     , N*N*sizeof(float));
    cudaMalloc((void**)&(Ainv_uList[mat])   , N*sizeof(float));
    cudaMalloc((void**)&(Ainv_colkList[mat]), N*sizeof(float));
    cudaMalloc((void**)&(uList[mat])        , N*sizeof(float));
  }
  fprintf (stderr, "N = %d\n", N);

  cudaMemcpy (AinvList_d, AinvList, numMats*sizeof(float*),
              cudaMemcpyHostToDevice);
  cudaMemcpy (Ainv_uList_d, Ainv_uList, numMats*sizeof(float*),
              cudaMemcpyHostToDevice);
  cudaMemcpy (Ainv_colkList_d, Ainv_colkList, numMats*sizeof(float*),
              cudaMemcpyHostToDevice);
  cudaMemcpy (uList_d, uList, numMats*sizeof(float*),
              cudaMemcpyHostToDevice);

  srand48((long int) 12341313);
  fprintf (stderr, "N = %d\n", N);

  // One random problem, shared by every matrix slot.
  for (int i=0; i<N; i++) {
    u_h[i] = drand48();
    for (int j=0; j<N; j++)
      A[i*N+j] = Ainv[i*N+j] = drand48();
  }
  GJInverse(Ainv, N);
  for (int i=0; i<N; i++)
    for (int j=0; j<N; j++)
      Ainv_h[i*N+j] = (float)Ainv[i*N+j];

  for (int mat=0; mat<numMats; mat++) {
    cudaMemcpy (AinvList[mat], Ainv_h, N*N*sizeof(float),
                cudaMemcpyHostToDevice);
    cudaMemcpy (uList[mat], u_h, N*sizeof(float), cudaMemcpyHostToDevice);
  }

  dim3 dimBlock2(BLOCK_SIZE);
  dim3 dimGrid2(N/BLOCK_SIZE, NUM_MATS);

  fprintf (stderr, "Before updates.\n");
  clock_t upStart = clock();
  for (int i=0; i<numUpdates; i++) {
    update_inverse_cuda1<<<dimGrid2,dimBlock2>>>
      (AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
    update_inverse_cuda2<<<dimGrid2,dimBlock2>>>
      (AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
  }
  // Launches are asynchronous; wait so the timing covers the execution.
  cudaDeviceSynchronize();
  clock_t upEnd = clock();

  // Rates are derived from the number of updates actually performed (the
  // original assumed 10*N updates while running a single one).  One
  // generation is N updates of every matrix.
  double uptime = (double)(upEnd - upStart)/(double)CLOCKS_PER_SEC;
  double uprate = (double)numUpdates*NUM_MATS/uptime;
  fprintf (stderr, "%1.2f updates per second.\n", uprate);
  fprintf (stderr, "%1.3f generations per second.\n",
           (double)numUpdates/((double)N*uptime));

  // Verify one of the (identical) results against the updated A.
  cudaMemcpy (Ainv_h, AinvList[1], N*N*sizeof(float),cudaMemcpyDeviceToHost);
  for (int i=0; i<N; i++)
    A[row*N+i] += u_h[i];
  for (int i=0; i<N; i++)
    for (int j=0; j<N; j++) {
      double ident = 0.0;
      for (int k=0; k<N; k++)
        ident += Ainv_h[i*N+k]*A[k*N+j];
      if ((i==j && fabs(ident - 1.0) > 1.0e-4) ||
          (i!=j && fabs(ident) > 1.0e-4))
        fprintf (stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
    }
  fprintf (stderr, "Finished.\n");

  // Release everything allocated above.
  for (int mat=0; mat<numMats; mat++) {
    cudaFree (AinvList[mat]);
    cudaFree (Ainv_uList[mat]);
    cudaFree (Ainv_colkList[mat]);
    cudaFree (uList[mat]);
  }
  cudaFree (AinvList_d);
  cudaFree (Ainv_uList_d);
  cudaFree (Ainv_colkList_d);
  cudaFree (uList_d);
  free (AinvList);
  free (Ainv_uList);
  free (Ainv_colkList);
  free (uList);
  free (A);
  free (Ainv);
  free (Ainv_h);
  free (u_h);
  return 0;
}

View File

@ -0,0 +1,111 @@
#define DET_BLOCK_SIZE 64
#include <unistd.h>
#include <stdlib.h>
// The first kernel replaces row k of A with the new row u, computes
//   Ainv_delta[col] = sum_row Ainv[row*rowstride + col] * (u[row] - A[k][row])
// (i.e. Ainv^T applied to the change of row k) for the DET_BLOCK_SIZE
// columns owned by this thread block, and stores the kth col of Ainv in
// global memory (done by the thread block whose column range contains k).
//
// Launch with blockDim.x == DET_BLOCK_SIZE and grid
// (N/DET_BLOCK_SIZE, numMats); N is expected to be a multiple of
// DET_BLOCK_SIZE.
//
// NOTE(review): every thread block working on the same matrix both reads
// the old row k of A and overwrites it with u.  Nothing orders those
// accesses between thread blocks, so a block that starts late can read the
// already-updated row and see a zero delta.  Fixing that needs the write
// moved to a later kernel, which cannot be done from within this function.
__global__ static void
update_inverse_cuda1 (float *A_g[], float *Ainv_g[], float *u_g[],
                      float *Ainv_delta_g[], float *Ainv_colk_g[],
                      int N, int rowstride, int k)
{
  __shared__ float *A, *Ainv, *u, *Ainv_delta, *Ainv_colk;
  if (threadIdx.x==0) {
    A          = A_g[blockIdx.y];
    Ainv       = Ainv_g[blockIdx.y];
    u          = u_g[blockIdx.y];
    Ainv_delta = Ainv_delta_g[blockIdx.y];
    Ainv_colk  = Ainv_colk_g[blockIdx.y];
  }
  __syncthreads();

  __shared__ float Ainv_delta_shared[DET_BLOCK_SIZE],
    Ainv_colk_shared[DET_BLOCK_SIZE], u_shared[DET_BLOCK_SIZE],
    uold_shared[DET_BLOCK_SIZE];
  Ainv_delta_shared[threadIdx.x] = 0.0;

  int col = blockIdx.x*DET_BLOCK_SIZE + threadIdx.x;
  int numblocks = N / DET_BLOCK_SIZE;

  // True for the thread block whose column domain contains the column that
  // has to be pulled from Ainv.  Uniform across the block.
  const bool owns_k =
    (blockIdx.x*DET_BLOCK_SIZE <= k && k < (blockIdx.x+1)*DET_BLOCK_SIZE);

  for (int block=0; block<numblocks; block++) {
    const int idx = block*DET_BLOCK_SIZE + threadIdx.x;
    u_shared[threadIdx.x]    = u[idx];
    uold_shared[threadIdx.x] = A[k*rowstride + idx];
    // Write new row into A matrix
    A[k*rowstride + idx] = u_shared[threadIdx.x];
    __syncthreads();
    for (int i=0; i<DET_BLOCK_SIZE; i++) {
      int row = block*DET_BLOCK_SIZE + i;
      float a = Ainv[row*rowstride+col];
      // Only possible when owns_k is true.
      if (col == k)
        Ainv_colk_shared[i] = a;
      Ainv_delta_shared[threadIdx.x] += a*(u_shared[i]-uold_shared[i]);
    }
    // Required in every thread block: the next iteration overwrites
    // u_shared/uold_shared, which slower threads may still be reading.
    // (The original had this barrier only in the branch that owns k.)
    __syncthreads();
    if (owns_k)
      Ainv_colk[idx] = Ainv_colk_shared[threadIdx.x];
  }

  // Write the data back to global memory
  Ainv_delta[col] = Ainv_delta_shared[threadIdx.x];
}
// Second kernel of the determinant update.  Using the vectors produced by
// update_inverse_cuda1 it applies
//   Ainv[row][col] -= Ainv_colk[row] * Ainv_delta[col] / (1 + Ainv_delta[k])
// to the DET_BLOCK_SIZE columns owned by this thread block.
//
// Launch with blockDim.x == DET_BLOCK_SIZE and grid
// (N/DET_BLOCK_SIZE, numMats).  u_g is unused.
__global__ static void
update_inverse_cuda2 (float *Ainv_g[], float *u_g[], float *Ainv_delta_g[],
                      float *Ainv_colk_g[], int N, int rowstride, int k)
{
  __shared__ float *Ainv, *Ainv_delta, *Ainv_colk;
  if (threadIdx.x==0) {
    Ainv       = Ainv_g[blockIdx.y];
    Ainv_delta = Ainv_delta_g[blockIdx.y];
    Ainv_colk  = Ainv_colk_g[blockIdx.y];
  }
  __syncthreads();

  __shared__ float Ainv_colk_shared[DET_BLOCK_SIZE];
  __shared__ float prefact;

  int col = blockIdx.x*DET_BLOCK_SIZE + threadIdx.x;
  // Each thread only ever needs its own column's entry.
  const float Ainv_delta_col = Ainv_delta[col];

  if (threadIdx.x == 0)
    prefact = -1.0f/(1.0f+Ainv_delta[k]);
  __syncthreads();

  int numblocks = N / DET_BLOCK_SIZE;
  for (int block=0; block<numblocks; block++) {
    Ainv_colk_shared[threadIdx.x] =
      prefact*Ainv_colk[block*DET_BLOCK_SIZE+threadIdx.x];
    __syncthreads();
    for (int i=0; i<DET_BLOCK_SIZE; i++) {
      int row = block*DET_BLOCK_SIZE + i;
      Ainv[row*rowstride+col] += Ainv_delta_col*Ainv_colk_shared[i];
    }
    // Do not refill Ainv_colk_shared until every thread has finished
    // reading the current chunk (this barrier was missing).
    __syncthreads();
  }
}

911
src/einspline/fbspline.c Normal file
View File

@ -0,0 +1,911 @@
#include "bspline_create.h"
#include "bspline.h"
#include "fbspline.h"
#include "config.h"
// CFUNC prefixes every Fortran-callable wrapper below.  When this file is
// compiled as C++ it must expand to a complete linkage specification; a
// bare "C" without `extern` is not valid C++.
#ifdef __cplusplus
#define CFUNC extern "C" /* Avoid name mangling in C++ */
#else
#define CFUNC
#endif
///////////////////////
// Creation routines //
///////////////////////
////////
// 1D //
////////
// Fortran-callable wrapper (name mangled by F77_FUNC_): every scalar is
// received by reference, packed into a grid and a boundary-condition
// struct, and forwarded to create_UBspline_1d_s.  The resulting handle is
// stored in *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_s,FCREATE_UBSPLINE_1D_S)
  (double *x0, double *x1, int *num_x,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   float *data, UBspline_1d_s **spline)
{
  BCtype_s bc_x;
  Ugrid    grid_x;

  // Boundary condition at the left (x0) and right (x1) end.
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // Grid extent and number of points.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;

  *spline = create_UBspline_1d_s (grid_x, bc_x, data);
}
// Fortran-callable wrapper around create_UBspline_1d_d (double-precision
// data).  Arguments arrive by reference; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_d,FCREATE_UBSPLINE_1D_D)
  (double *x0, double *x1, int *num_x,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   double *data, UBspline_1d_d **spline)
{
  BCtype_d bc_x;
  Ugrid    grid_x;

  // Boundary condition at the left (x0) and right (x1) end.
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // Grid extent and number of points.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;

  *spline = create_UBspline_1d_d (grid_x, bc_x, data);
}
// Fortran-callable wrapper around create_UBspline_1d_c (single-precision
// complex data).  The complex boundary values are split into real and
// imaginary parts; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_c,FCREATE_UBSPLINE_1D_C)
  (double *x0, double *x1, int *num_x,
   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
   complex_float *data, UBspline_1d_c **spline)
{
  BCtype_c bc_x;
  Ugrid    grid_x;

  // Left boundary.
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = crealf(*x0_val);
  bc_x.lVal_i = cimagf(*x0_val);
  // Right boundary.
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = crealf(*x1_val);
  bc_x.rVal_i = cimagf(*x1_val);

  // Grid extent and number of points.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;

  *spline = create_UBspline_1d_c (grid_x, bc_x, data);
}
// Fortran-callable wrapper around create_UBspline_1d_z (double-precision
// complex data).  The complex boundary values are split into real and
// imaginary parts; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_z,FCREATE_UBSPLINE_1D_Z)
  (double *x0, double *x1, int *num_x,
   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
   complex_double *data, UBspline_1d_z **spline)
{
  BCtype_z bc_x;
  Ugrid    grid_x;

  // Left boundary.
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = creal(*x0_val);
  bc_x.lVal_i = cimag(*x0_val);
  // Right boundary.
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = creal(*x1_val);
  bc_x.rVal_i = cimag(*x1_val);

  // Grid extent and number of points.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;

  *spline = create_UBspline_1d_z (grid_x, bc_x, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_1d_s.
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_s,FRECOMPUTE_UBSPLINE_1D_S)
  (UBspline_1d_s **spline, float *data)
{
  UBspline_1d_s *handle = *spline;
  recompute_UBspline_1d_s (handle, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_1d_d.
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_d,FRECOMPUTE_UBSPLINE_1D_D)
  (UBspline_1d_d **spline, double *data)
{
  UBspline_1d_d *handle = *spline;
  recompute_UBspline_1d_d (handle, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_1d_c.
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_c,FRECOMPUTE_UBSPLINE_1D_C)
  (UBspline_1d_c **spline, complex_float *data)
{
  UBspline_1d_c *handle = *spline;
  recompute_UBspline_1d_c (handle, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_1d_z.
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_z,FRECOMPUTE_UBSPLINE_1D_Z)
  (UBspline_1d_z **spline, complex_double *data)
{
  UBspline_1d_z *handle = *spline;
  recompute_UBspline_1d_z (handle, data);
}
////////
// 2D //
////////
// Fortran-callable wrapper around create_UBspline_2d_s.  Arguments arrive
// by reference and are packed per axis into a grid and a boundary-condition
// struct; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_s,FCREATE_UBSPLINE_2D_S)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
   float *data, UBspline_2d_s **spline)
{
  BCtype_s bc_x, bc_y;
  Ugrid    grid_x, grid_y;

  // x axis.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis.
  grid_y.num   = *num_y;
  grid_y.start = *y0;
  grid_y.end   = *y1;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  *spline = create_UBspline_2d_s (grid_x, grid_y, bc_x, bc_y, data);
}
// Fortran-callable wrapper around create_UBspline_2d_d.  Arguments arrive
// by reference and are packed per axis into a grid and a boundary-condition
// struct; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_d,FCREATE_UBSPLINE_2D_D)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
   double *data, UBspline_2d_d **spline)
{
  BCtype_d bc_x, bc_y;
  Ugrid    grid_x, grid_y;

  // x axis.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis.
  grid_y.num   = *num_y;
  grid_y.start = *y0;
  grid_y.end   = *y1;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  *spline = create_UBspline_2d_d (grid_x, grid_y, bc_x, bc_y, data);
}
// Fortran-callable wrapper around create_UBspline_2d_c (single-precision
// complex data).  Complex boundary values are split into real and imaginary
// parts per axis; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_c,FCREATE_UBSPLINE_2D_C)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
   complex_float *data, UBspline_2d_c **spline)
{
  BCtype_c bc_x, bc_y;
  Ugrid    grid_x, grid_y;

  // x axis.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = crealf(*x0_val);
  bc_x.lVal_i = cimagf(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = crealf(*x1_val);
  bc_x.rVal_i = cimagf(*x1_val);

  // y axis.
  grid_y.num   = *num_y;
  grid_y.start = *y0;
  grid_y.end   = *y1;
  bc_y.lCode  = (bc_code) *y0_code;
  bc_y.lVal_r = crealf(*y0_val);
  bc_y.lVal_i = cimagf(*y0_val);
  bc_y.rCode  = (bc_code) *y1_code;
  bc_y.rVal_r = crealf(*y1_val);
  bc_y.rVal_i = cimagf(*y1_val);

  *spline = create_UBspline_2d_c (grid_x, grid_y, bc_x, bc_y, data);
}
// Fortran-callable wrapper around create_UBspline_2d_z (double-precision
// complex data).  Arguments arrive by reference; complex boundary values
// are split into real and imaginary parts per axis and the new handle is
// stored in *spline.
//
// The x boundary values are extracted with creal/cimag, matching the y
// axis.  They previously went through the float variants crealf/cimagf,
// which rounded the double-precision inputs to single precision.
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_z,FCREATE_UBSPLINE_2D_Z)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
   complex_double *data, UBspline_2d_z **spline)
{
  Ugrid xgrid, ygrid;
  BCtype_z xBC, yBC;

  xgrid.start = *x0;
  xgrid.end   = *x1;
  xgrid.num   = *num_x;
  ygrid.start = *y0;
  ygrid.end   = *y1;
  ygrid.num   = *num_y;

  xBC.lCode  = (bc_code) *x0_code;
  xBC.rCode  = (bc_code) *x1_code;
  xBC.lVal_r = creal(*x0_val);
  xBC.lVal_i = cimag(*x0_val);
  xBC.rVal_r = creal(*x1_val);
  xBC.rVal_i = cimag(*x1_val);

  yBC.lCode  = (bc_code) *y0_code;
  yBC.rCode  = (bc_code) *y1_code;
  yBC.lVal_r = creal(*y0_val);
  yBC.lVal_i = cimag(*y0_val);
  yBC.rVal_r = creal(*y1_val);
  yBC.rVal_i = cimag(*y1_val);

  *spline = create_UBspline_2d_z (xgrid, ygrid, xBC, yBC, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_2d_s.
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_s,FRECOMPUTE_UBSPLINE_2D_S)
  (UBspline_2d_s **spline, float *data)
{
  UBspline_2d_s *handle = *spline;
  recompute_UBspline_2d_s (handle, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_2d_d.
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_d,FRECOMPUTE_UBSPLINE_2D_D)
  (UBspline_2d_d **spline, double *data)
{
  UBspline_2d_d *handle = *spline;
  recompute_UBspline_2d_d (handle, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_2d_c.
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_c,FRECOMPUTE_UBSPLINE_2D_C)
  (UBspline_2d_c **spline, complex_float *data)
{
  UBspline_2d_c *handle = *spline;
  recompute_UBspline_2d_c (handle, data);
}
// Fortran-callable wrapper: dereferences the spline handle and forwards it
// with the data array to recompute_UBspline_2d_z.
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_z,FRECOMPUTE_UBSPLINE_2D_Z)
  (UBspline_2d_z **spline, complex_double *data)
{
  UBspline_2d_z *handle = *spline;
  recompute_UBspline_2d_z (handle, data);
}
////////
// 3D //
////////
// Fortran-callable wrapper around create_UBspline_3d_s.  Arguments arrive
// by reference and are packed per axis into a grid and a boundary-condition
// struct; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_s,FCREATE_UBSPLINE_3D_S)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
   int *z0_code, float *z0_val, int *z1_code, float *z1_val,
   float *data, UBspline_3d_s **spline)
{
  BCtype_s bc_x, bc_y, bc_z;
  Ugrid    grid_x, grid_y, grid_z;

  // x axis.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis.
  grid_y.num   = *num_y;
  grid_y.start = *y0;
  grid_y.end   = *y1;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  // z axis.
  grid_z.num   = *num_z;
  grid_z.start = *z0;
  grid_z.end   = *z1;
  bc_z.lCode = (bc_code) *z0_code;
  bc_z.lVal  = *z0_val;
  bc_z.rCode = (bc_code) *z1_code;
  bc_z.rVal  = *z1_val;

  *spline = create_UBspline_3d_s (grid_x, grid_y, grid_z, bc_x, bc_y, bc_z, data);
}
// Fortran-callable wrapper around create_UBspline_3d_d.  Arguments arrive
// by reference and are packed per axis into a grid and a boundary-condition
// struct; the new handle goes to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_d,FCREATE_UBSPLINE_3D_D)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
   int *z0_code, double *z0_val, int *z1_code, double *z1_val,
   double *data, UBspline_3d_d **spline)
{
  BCtype_d bc_x, bc_y, bc_z;
  Ugrid    grid_x, grid_y, grid_z;

  // x axis.
  grid_x.num   = *num_x;
  grid_x.start = *x0;
  grid_x.end   = *x1;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis.
  grid_y.num   = *num_y;
  grid_y.start = *y0;
  grid_y.end   = *y1;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  // z axis.
  grid_z.num   = *num_z;
  grid_z.start = *z0;
  grid_z.end   = *z1;
  bc_z.lCode = (bc_code) *z0_code;
  bc_z.lVal  = *z0_val;
  bc_z.rCode = (bc_code) *z1_code;
  bc_z.rVal  = *z1_val;

  *spline = create_UBspline_3d_d (grid_x, grid_y, grid_z, bc_x, bc_y, bc_z, data);
}
// Exported under the F77_FUNC_ name as the by-reference entry point for
// create_UBspline_3d_c.  Every scalar argument is a pointer:
//   x0/x1/num_x (likewise y*, z*) are copied into a Ugrid's start/end/num,
//   ?0_code/?1_code are cast to bc_code and stored as lCode/rCode,
//   ?0_val/?1_val are split with crealf/cimagf into lVal_r/lVal_i and
//   rVal_r/rVal_i,
//   data is forwarded unchanged.
// The handle returned by create_UBspline_3d_c is written to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_c,FCREATE_UBSPLINE_3D_C)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
   int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
   complex_float *data, UBspline_3d_c **spline)
{
  Ugrid gx, gy, gz;
  BCtype_c bcx, bcy, bcz;

  // x axis boundary condition.
  bcx.lCode  = (bc_code) *x0_code;
  bcx.lVal_r = crealf(*x0_val);
  bcx.lVal_i = cimagf(*x0_val);
  bcx.rCode  = (bc_code) *x1_code;
  bcx.rVal_r = crealf(*x1_val);
  bcx.rVal_i = cimagf(*x1_val);

  // y axis boundary condition.
  bcy.lCode  = (bc_code) *y0_code;
  bcy.lVal_r = crealf(*y0_val);
  bcy.lVal_i = cimagf(*y0_val);
  bcy.rCode  = (bc_code) *y1_code;
  bcy.rVal_r = crealf(*y1_val);
  bcy.rVal_i = cimagf(*y1_val);

  // z axis boundary condition.
  bcz.lCode  = (bc_code) *z0_code;
  bcz.lVal_r = crealf(*z0_val);
  bcz.lVal_i = cimagf(*z0_val);
  bcz.rCode  = (bc_code) *z1_code;
  bcz.rVal_r = crealf(*z1_val);
  bcz.rVal_i = cimagf(*z1_val);

  // Grid extents and point counts.
  gx.start = *x0;  gx.end = *x1;  gx.num = *num_x;
  gy.start = *y0;  gy.end = *y1;  gy.num = *num_y;
  gz.start = *z0;  gz.end = *z1;  gz.num = *num_z;

  *spline = create_UBspline_3d_c (gx, gy, gz, bcx, bcy, bcz, data);
}
// Exported under the F77_FUNC_ name as the by-reference entry point for
// create_UBspline_3d_z.  Every scalar argument is a pointer:
//   x0/x1/num_x (likewise y*, z*) are copied into a Ugrid's start/end/num,
//   ?0_code/?1_code are cast to bc_code and stored as lCode/rCode,
//   ?0_val/?1_val are split with creal/cimag into lVal_r/lVal_i and
//   rVal_r/rVal_i,
//   data is forwarded unchanged.
// The handle returned by create_UBspline_3d_z is written to *spline.
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_z,FCREATE_UBSPLINE_3D_Z)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
   int *z0_code, complex_double *z0_val, int *z1_code, complex_double *z1_val,
   complex_double *data, UBspline_3d_z **spline)
{
  Ugrid gx, gy, gz;
  BCtype_z bcx, bcy, bcz;

  // x axis boundary condition.
  bcx.lCode  = (bc_code) *x0_code;
  bcx.lVal_r = creal(*x0_val);
  bcx.lVal_i = cimag(*x0_val);
  bcx.rCode  = (bc_code) *x1_code;
  bcx.rVal_r = creal(*x1_val);
  bcx.rVal_i = cimag(*x1_val);

  // y axis boundary condition.
  bcy.lCode  = (bc_code) *y0_code;
  bcy.lVal_r = creal(*y0_val);
  bcy.lVal_i = cimag(*y0_val);
  bcy.rCode  = (bc_code) *y1_code;
  bcy.rVal_r = creal(*y1_val);
  bcy.rVal_i = cimag(*y1_val);

  // z axis boundary condition.
  bcz.lCode  = (bc_code) *z0_code;
  bcz.lVal_r = creal(*z0_val);
  bcz.lVal_i = cimag(*z0_val);
  bcz.rCode  = (bc_code) *z1_code;
  bcz.rVal_r = creal(*z1_val);
  bcz.rVal_i = cimag(*z1_val);

  // Grid extents and point counts.
  gx.start = *x0;  gx.end = *x1;  gx.num = *num_x;
  gy.start = *y0;  gy.end = *y1;  gy.num = *num_y;
  gz.start = *z0;  gz.end = *z1;  gz.num = *num_z;

  *spline = create_UBspline_3d_z (gx, gy, gz, bcx, bcy, bcz, data);
}
// Exported under the F77_FUNC_ name; the spline handle arrives by reference.
// Dereferences it once and forwards it, with data, to recompute_UBspline_3d_s.
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_s,FRECOMPUTE_UBSPLINE_3D_S)
  (UBspline_3d_s **spline, float *data)
{
  UBspline_3d_s *handle = *spline;
  recompute_UBspline_3d_s (handle, data);
}
// Exported under the F77_FUNC_ name; the spline handle arrives by reference.
// Dereferences it once and forwards it, with data, to recompute_UBspline_3d_d.
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_d,FRECOMPUTE_UBSPLINE_3D_D)
  (UBspline_3d_d **spline, double *data)
{
  UBspline_3d_d *handle = *spline;
  recompute_UBspline_3d_d (handle, data);
}
// Exported under the F77_FUNC_ name; the spline handle arrives by reference.
// Dereferences it once and forwards it, with data, to recompute_UBspline_3d_c.
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_c,FRECOMPUTE_UBSPLINE_3D_C)
  (UBspline_3d_c **spline, complex_float *data)
{
  UBspline_3d_c *handle = *spline;
  recompute_UBspline_3d_c (handle, data);
}
// Exported under the F77_FUNC_ name; the spline handle arrives by reference.
// Dereferences it once and forwards it, with data, to recompute_UBspline_3d_z.
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_z,FRECOMPUTE_UBSPLINE_3D_Z)
  (UBspline_3d_z **spline, complex_double *data)
{
  UBspline_3d_z *handle = *spline;
  recompute_UBspline_3d_z (handle, data);
}
// Exported under the F77_FUNC_ name; the spline handle arrives by reference.
// Dereferences it and passes the handle to destroy_Bspline.  *spline itself
// is not modified here.
CFUNC void
F77_FUNC_(fdestroy_bspline,FDESTROY_BSPLINE)
  (Bspline **spline)
{
  Bspline *handle = *spline;
  destroy_Bspline (handle);
}
/////////////////////////
// Evaluation routines //
/////////////////////////
//////////////////////////////
// 1D single-precision real //
//////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_s;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_s,FEVAL_UBSPLINE_1D_S)
  (UBspline_1d_s **spline, double *x, float *val)
{
  UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_s (handle, px, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_s_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_s_vg,FEVAL_UBSPLINE_1D_S_VG)
  (UBspline_1d_s **spline, double *x, float *val, float *grad)
{
  UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_s_vg (handle, px, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_s_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_s_vgl,FEVAL_UBSPLINE_1D_S_VGL)
  (UBspline_1d_s **spline, double *x,
   float *val, float *grad, float *lapl)
{
  UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_s_vgl (handle, px, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_s_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_s_vgh,FEVAL_UBSPLINE_1D_S_VGH)
  (UBspline_1d_s **spline, double *x,
   float *val, float *grad, float *hess)
{
  UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_s_vgh (handle, px, val, grad, hess);
}
//////////////////////////////
// 1D double-precision real //
//////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_d;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_d,FEVAL_UBSPLINE_1D_D)
  (UBspline_1d_d **spline, double *x, double *val)
{
  UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_d (handle, px, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_d_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_d_vg,FEVAL_UBSPLINE_1D_D_VG)
  (UBspline_1d_d **spline, double *x,
   double *val, double *grad)
{
  UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_d_vg (handle, px, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_d_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_d_vgl,FEVAL_UBSPLINE_1D_D_VGL)
  (UBspline_1d_d **spline, double *x,
   double *val, double *grad, double *lapl)
{
  UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_d_vgl (handle, px, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_d_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_d_vgh,FEVAL_UBSPLINE_1D_D_VGH)
  (UBspline_1d_d **spline, double *x,
   double *val, double *grad, double *hess)
{
  UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_d_vgh (handle, px, val, grad, hess);
}
/////////////////////////////////
// 1D single-precision complex //
/////////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_c;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_c,FEVAL_UBSPLINE_1D_C)
  (UBspline_1d_c **spline, double *x, complex_float *val)
{
  UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_c (handle, px, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_c_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_c_vg,FEVAL_UBSPLINE_1D_C_VG)
  (UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad)
{
  UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_c_vg (handle, px, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_c_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_c_vgl,FEVAL_UBSPLINE_1D_C_VGL)
  (UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_c_vgl (handle, px, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_c_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_c_vgh,FEVAL_UBSPLINE_1D_C_VGH)
  (UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_c_vgh (handle, px, val, grad, hess);
}
/////////////////////////////////
// 1D double-precision complex //
/////////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_z;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_z,FEVAL_UBSPLINE_1D_Z)
  (UBspline_1d_z **spline, double *x, complex_double *val)
{
  UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_z (handle, px, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_z_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_z_vg,FEVAL_UBSPLINE_1D_Z_VG)
  (UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad)
{
  UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_z_vg (handle, px, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_z_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_z_vgl,FEVAL_UBSPLINE_1D_Z_VGL)
  (UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_z_vgl (handle, px, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle and x and forwards to eval_UBspline_1d_z_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_1d_z_vgh,FEVAL_UBSPLINE_1D_Z_VGH)
  (UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_UBspline_1d_z_vgh (handle, px, val, grad, hess);
}
//////////////////////////////
// 2D single-precision real //
//////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_s;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_s,FEVAL_UBSPLINE_2D_S)
  (UBspline_2d_s **spline, double *x, double *y, float *val)
{
  UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_s (handle, px, py, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_s_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_s_vg,FEVAL_UBSPLINE_2D_S_VG)
  (UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad)
{
  UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_s_vg (handle, px, py, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_s_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_s_vgl,FEVAL_UBSPLINE_2D_S_VGL)
  (UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float *lapl)
{
  UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_s_vgl (handle, px, py, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_s_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_s_vgh,FEVAL_UBSPLINE_2D_S_VGH)
  (UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float *hess)
{
  UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_s_vgh (handle, px, py, val, grad, hess);
}
//////////////////////////////
// 2D double-precision real //
//////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_d;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_d,FEVAL_UBSPLINE_2D_D)
  (UBspline_2d_d **spline, double *x, double *y, double *val)
{
  UBspline_2d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_d (handle, px, py, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_d_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_d_vg,FEVAL_UBSPLINE_2D_D_VG)
  (UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad)
{
  UBspline_2d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_d_vg (handle, px, py, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_d_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_d_vgl,FEVAL_UBSPLINE_2D_D_VGL)
  (UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *lapl)
{
  UBspline_2d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_d_vgl (handle, px, py, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_d_vgh;
// val, grad and hess are forwarded unchanged.
// The callee must be the _vgh evaluator (as in the s/c/z 2D wrappers):
// forwarding hess to the _vgl evaluator would hand it to a routine that
// treats that argument as its lapl output.
CFUNC void
F77_FUNC_(feval_ubspline_2d_d_vgh,FEVAL_UBSPLINE_2D_D_VGH)
  (UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *hess)
{
  eval_UBspline_2d_d_vgh (*spline, *x, *y, val, grad, hess);
}
/////////////////////////////////
// 2D single-precision complex //
/////////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_c;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_c,FEVAL_UBSPLINE_2D_C)
  (UBspline_2d_c **spline, double *x, double *y, complex_float *val)
{
  UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_c (handle, px, py, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_c_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_c_vg,FEVAL_UBSPLINE_2D_C_VG)
  (UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad)
{
  UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_c_vg (handle, px, py, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_c_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_c_vgl,FEVAL_UBSPLINE_2D_C_VGL)
  (UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_c_vgl (handle, px, py, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_c_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_c_vgh,FEVAL_UBSPLINE_2D_C_VGH)
  (UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_c_vgh (handle, px, py, val, grad, hess);
}
/////////////////////////////////
// 2D double-precision complex //
/////////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_z;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_z,FEVAL_UBSPLINE_2D_Z)
  (UBspline_2d_z **spline, double *x, double *y, complex_double *val)
{
  UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_z (handle, px, py, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_z_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_z_vg,FEVAL_UBSPLINE_2D_Z_VG)
  (UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad)
{
  UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_z_vg (handle, px, py, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_z_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_z_vgl,FEVAL_UBSPLINE_2D_Z_VGL)
  (UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_z_vgl (handle, px, py, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x and y and forwards to eval_UBspline_2d_z_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_2d_z_vgh,FEVAL_UBSPLINE_2D_Z_VGH)
  (UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_UBspline_2d_z_vgh (handle, px, py, val, grad, hess);
}
//////////////////////////////
// 3D single-precision real //
//////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_s;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_s,FEVAL_UBSPLINE_3D_S)
  (UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val)
{
  UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_s (handle, px, py, pz, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_s_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_s_vg,FEVAL_UBSPLINE_3D_S_VG)
  (UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad)
{
  UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_s_vg (handle, px, py, pz, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_s_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_s_vgl,FEVAL_UBSPLINE_3D_S_VGL)
  (UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad, float *lapl)
{
  UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_s_vgl (handle, px, py, pz, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_s_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_s_vgh,FEVAL_UBSPLINE_3D_S_VGH)
  (UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad, float *hess)
{
  UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_s_vgh (handle, px, py, pz, val, grad, hess);
}
//////////////////////////////
// 3D double-precision real //
//////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_d;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_d,FEVAL_UBSPLINE_3D_D)
  (UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val)
{
  UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_d (handle, px, py, pz, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_d_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_d_vg,FEVAL_UBSPLINE_3D_D_VG)
  (UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad)
{
  UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_d_vg (handle, px, py, pz, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_d_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_d_vgl,FEVAL_UBSPLINE_3D_D_VGL)
  (UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad, double *lapl)
{
  UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_d_vgl (handle, px, py, pz, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_d_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_d_vgh,FEVAL_UBSPLINE_3D_D_VGH)
  (UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad, double *hess)
{
  UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_d_vgh (handle, px, py, pz, val, grad, hess);
}
/////////////////////////////////
// 3D single-precision complex //
/////////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_c;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_c,FEVAL_UBSPLINE_3D_C)
  (UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val)
{
  UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_c (handle, px, py, pz, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_c_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_c_vg,FEVAL_UBSPLINE_3D_C_VG)
  (UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad)
{
  UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_c_vg (handle, px, py, pz, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_c_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_c_vgl,FEVAL_UBSPLINE_3D_C_VGL)
  (UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_c_vgl (handle, px, py, pz, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_c_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_c_vgh,FEVAL_UBSPLINE_3D_C_VGH)
  (UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_c_vgh (handle, px, py, pz, val, grad, hess);
}
/////////////////////////////////
// 3D double-precision complex //
/////////////////////////////////
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_z;
// val is forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_z,FEVAL_UBSPLINE_3D_Z)
  (UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val)
{
  UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_z (handle, px, py, pz, val);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_z_vg;
// val and grad are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_z_vg,FEVAL_UBSPLINE_3D_Z_VG)
  (UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad)
{
  UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_z_vg (handle, px, py, pz, val, grad);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_z_vgl;
// val, grad and lapl are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_z_vgl,FEVAL_UBSPLINE_3D_Z_VGL)
  (UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_z_vgl (handle, px, py, pz, val, grad, lapl);
}
// Exported under the F77_FUNC_ name; every argument is a pointer.
// Dereferences the handle, x, y and z and forwards to eval_UBspline_3d_z_vgh;
// val, grad and hess are forwarded unchanged.
CFUNC void
F77_FUNC_(feval_ubspline_3d_z_vgh,FEVAL_UBSPLINE_3D_Z_VGH)
  (UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_UBspline_3d_z_vgh (handle, px, py, pz, val, grad, hess);
}

440
src/einspline/fbspline.h Normal file
View File

@ -0,0 +1,440 @@
#ifndef F_BSPLINE_H
#define F_BSPLINE_H
#include "config.h"
#include "bspline_base.h"
#include "bspline_create.h"
// CFUNC prefixes every prototype in this header.  Under a C++ compiler it
// expands to extern "C"; under a C compiler it expands to nothing.  The macro
// is #undef'd again at the end of the header.
#ifdef __cplusplus
#define CFUNC extern "C" /* Avoid name mangling in C++ */
#else
#define CFUNC
#endif
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Creation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
////////
// 1D //
////////
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_s,FCREATE_UBSPLINE_1D_S)
(double *x0, double *x1, int *num_x,
int *x0_code, float *x0_val, int *x1_code, float *x1_val,
float *data, UBspline_1d_s **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_d,FCREATE_UBSPLINE_1D_D)
(double *x0, double *x1, int *num_x,
int *x0_code, double *x0_val, int *x1_code, double *x1_val,
double *data, UBspline_1d_d **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_c,FCREATE_UBSPLINE_1D_C)
(double *x0, double *x1, int *num_x,
int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
complex_float *data, UBspline_1d_c **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_1d_z,FCREATE_UBSPLINE_1D_Z)
(double *x0, double *x1, int *num_x,
int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
complex_double *data, UBspline_1d_z **spline);
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_s,FRECOMPUTE_UBSPLINE_1D_S)
(UBspline_1d_s **spline, float *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_d,FRECOMPUTE_UBSPLINE_1D_D)
(UBspline_1d_d **spline, double *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_c,FRECOMPUTE_UBSPLINE_1D_C)
(UBspline_1d_c **spline, complex_float *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_1d_z,FRECOMPUTE_UBSPLINE_1D_Z)
(UBspline_1d_z **spline, complex_double *data);
////////
// 2D //
////////
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_s,FCREATE_UBSPLINE_2D_S)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, float *x0_val, int *x1_code, float *x1_val,
int *y0_code, float *y0_val, int *y1_code, float *y1_val,
float *data, UBspline_2d_s **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_d,FCREATE_UBSPLINE_2D_D)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, double *x0_val, int *x1_code, double *x1_val,
int *y0_code, double *y0_val, int *y1_code, double *y1_val,
double *data, UBspline_2d_d **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_c,FCREATE_UBSPLINE_2D_C)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
complex_float *data, UBspline_2d_c **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_2d_z,FCREATE_UBSPLINE_2D_Z)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
complex_double *data, UBspline_2d_z **spline);
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_s,FRECOMPUTE_UBSPLINE_2D_S)
(UBspline_2d_s **spline, float *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_d,FRECOMPUTE_UBSPLINE_2D_D)
(UBspline_2d_d **spline, double *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_c,FRECOMPUTE_UBSPLINE_2D_C)
(UBspline_2d_c **spline, complex_float *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_2d_z,FRECOMPUTE_UBSPLINE_2D_Z)
(UBspline_2d_z **spline, complex_double *data);
////////
// 3D //
////////
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_s,FCREATE_UBSPLINE_3D_S)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, float *x0_val, int *x1_code, float *x1_val,
int *y0_code, float *y0_val, int *y1_code, float *y1_val,
int *z0_code, float *z0_val, int *z1_code, float *z1_val,
float *data, UBspline_3d_s **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_d,FCREATE_UBSPLINE_3D_D)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, double *x0_val, int *x1_code, double *x1_val,
int *y0_code, double *y0_val, int *y1_code, double *y1_val,
int *z0_code, double *z0_val, int *z1_code, double *z1_val,
double *data, UBspline_3d_d **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_c,FCREATE_UBSPLINE_3D_C)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
complex_float *data, UBspline_3d_c **spline);
CFUNC void
F77_FUNC_(fcreate_ubspline_3d_z,FCREATE_UBSPLINE_3D_Z)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
int *z0_code, complex_double *z0_val, int *z1_code, complex_double *z1_val,
complex_double *data, UBspline_3d_z **spline);
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_s,FRECOMPUTE_UBSPLINE_3D_S)
(UBspline_3d_s **spline, float *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_d,FRECOMPUTE_UBSPLINE_3D_D)
(UBspline_3d_d **spline, double *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_c,FRECOMPUTE_UBSPLINE_3D_C)
(UBspline_3d_c **spline, complex_float *data);
CFUNC void
F77_FUNC_(frecompute_ubspline_3d_z,FRECOMPUTE_UBSPLINE_3D_Z)
(UBspline_3d_z **spline, complex_double *data);
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Destruction routine ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
CFUNC void
F77_FUNC_(fdestroy_bspline,FDESTROY_BSPLINE)
(Bspline **spline);
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Evaluation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//////////////////////////////
// 1D single-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_1d_s,FEVAL_UBSPLINE_1D_S)
(UBspline_1d_s **spline, double *x, float *val);
CFUNC void
F77_FUNC_(feval_ubspline_1d_s_vg,FEVAL_UBSPLINE_1D_S_VG)
(UBspline_1d_s **spline, double *x, float *val, float *grad);
CFUNC void
F77_FUNC_(feval_ubspline_1d_s_vgl,FEVAL_UBSPLINE_1D_S_VGL)
(UBspline_1d_s **spline, double *x,
float *val, float *grad, float *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_1d_s_vgh,FEVAL_UBSPLINE_1D_S_VGH)
(UBspline_1d_s **spline, double *x,
float *val, float *grad, float *hess);
//////////////////////////////
// 1D double-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_1d_d,FEVAL_UBSPLINE_1D_D)
(UBspline_1d_d **spline, double *x, double *val);
CFUNC void
F77_FUNC_(feval_ubspline_1d_d_vg,FEVAL_UBSPLINE_1D_D_VG)
(UBspline_1d_d **spline, double *x,
double *val, double *grad);
CFUNC void
F77_FUNC_(feval_ubspline_1d_d_vgl,FEVAL_UBSPLINE_1D_D_VGL)
(UBspline_1d_d **spline, double *x,
double *val, double *grad, double *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_1d_d_vgh,FEVAL_UBSPLINE_1D_D_VGH)
(UBspline_1d_d **spline, double *x,
double *val, double *grad, double *hess);
/////////////////////////////////
// 1D single-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_1d_c,FEVAL_UBSPLINE_1D_C)
(UBspline_1d_c **spline, double *x, complex_float *val);
CFUNC void
F77_FUNC_(feval_ubspline_1d_c_vg,FEVAL_UBSPLINE_1D_C_VG)
(UBspline_1d_c **spline, double *x,
complex_float *val, complex_float *grad);
CFUNC void
F77_FUNC_(feval_ubspline_1d_c_vgl,FEVAL_UBSPLINE_1D_C_VGL)
(UBspline_1d_c **spline, double *x,
complex_float *val, complex_float *grad, complex_float *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_1d_c_vgh,FEVAL_UBSPLINE_1D_C_VGH)
(UBspline_1d_c **spline, double *x,
complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 1D double-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_1d_z,FEVAL_UBSPLINE_1D_Z)
(UBspline_1d_z **spline, double *x, complex_double *val);
CFUNC void
F77_FUNC_(feval_ubspline_1d_z_vg,FEVAL_UBSPLINE_1D_Z_VG)
(UBspline_1d_z **spline, double *x,
complex_double *val, complex_double *grad);
CFUNC void
F77_FUNC_(feval_ubspline_1d_z_vgl,FEVAL_UBSPLINE_1D_Z_VGL)
(UBspline_1d_z **spline, double *x,
complex_double *val, complex_double *grad, complex_double *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_1d_z_vgh,FEVAL_UBSPLINE_1D_Z_VGH)
(UBspline_1d_z **spline, double *x,
complex_double *val, complex_double *grad, complex_double *hess);
//////////////////////////////
// 2D single-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_2d_s,FEVAL_UBSPLINE_2D_S)
(UBspline_2d_s **spline, double *x, double *y, float *val);
CFUNC void
F77_FUNC_(feval_ubspline_2d_s_vg,FEVAL_UBSPLINE_2D_S_VG)
(UBspline_2d_s **spline, double *x, double *y,
float *val, float *grad);
CFUNC void
F77_FUNC_(feval_ubspline_2d_s_vgl,FEVAL_UBSPLINE_2D_S_VGL)
(UBspline_2d_s **spline, double *x, double *y,
float *val, float *grad, float* lapl);
CFUNC void
F77_FUNC_(feval_ubspline_2d_s_vgh,FEVAL_UBSPLINE_2D_S_VGH)
(UBspline_2d_s **spline, double *x, double *y,
float *val, float *grad, float *hess);
//////////////////////////////
// 2D double-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_2d_d,FEVAL_UBSPLINE_2D_D)
(UBspline_2d_d **spline, double *x, double *y, double *val);
CFUNC void
F77_FUNC_(feval_ubspline_2d_d_vg,FEVAL_UBSPLINE_2D_D_VG)
(UBspline_2d_d **spline, double *x, double *y,
double *val, double *grad);
CFUNC void
F77_FUNC_(feval_ubspline_2d_d_vgl,FEVAL_UBSPLINE_2D_D_VGL)
(UBspline_2d_d **spline, double *x, double *y,
double *val, double *grad, double *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_2d_d_vgh,FEVAL_UBSPLINE_2D_D_VGH)
(UBspline_2d_d **spline, double *x, double *y,
double *val, double *grad, double *hess);
/////////////////////////////////
// 2D single-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_2d_c,FEVAL_UBSPLINE_2D_C)
(UBspline_2d_c **spline, double *x, double *y, complex_float *val);
CFUNC void
F77_FUNC_(feval_ubspline_2d_c_vg,FEVAL_UBSPLINE_2D_C_VG)
(UBspline_2d_c **spline, double *x, double *y,
complex_float *val, complex_float *grad);
CFUNC void
F77_FUNC_(feval_ubspline_2d_c_vgl,FEVAL_UBSPLINE_2D_C_VGL)
(UBspline_2d_c **spline, double *x, double *y,
complex_float *val, complex_float *grad, complex_float *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_2d_c_vgh,FEVAL_UBSPLINE_2D_C_VGH)
(UBspline_2d_c **spline, double *x, double *y,
complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 2D double-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_2d_z,FEVAL_UBSPLINE_2D_Z)
(UBspline_2d_z **spline, double *x, double *y, complex_double *val);
CFUNC void
F77_FUNC_(feval_ubspline_2d_z_vg,FEVAL_UBSPLINE_2D_Z_VG)
(UBspline_2d_z **spline, double *x, double *y,
complex_double *val, complex_double *grad);
CFUNC void
F77_FUNC_(feval_ubspline_2d_z_vgl,FEVAL_UBSPLINE_2D_Z_VGL)
(UBspline_2d_z **spline, double *x, double *y,
complex_double *val, complex_double *grad, complex_double *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_2d_z_vgh,FEVAL_UBSPLINE_2D_Z_VGH)
(UBspline_2d_z **spline, double *x, double *y,
complex_double *val, complex_double *grad, complex_double *hess);
//////////////////////////////
// 3D single-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_3d_s,FEVAL_UBSPLINE_3D_S)
(UBspline_3d_s **spline, double *x, double *y, double *z,
float *val);
CFUNC void
F77_FUNC_(feval_ubspline_3d_s_vg,FEVAL_UBSPLINE_3D_S_VG)
(UBspline_3d_s **spline, double *x, double *y, double *z,
float *val, float *grad);
CFUNC void
F77_FUNC_(feval_ubspline_3d_s_vgl,FEVAL_UBSPLINE_3D_S_VGL)
(UBspline_3d_s **spline, double *x, double *y, double *z,
float *val, float *grad, float* lapl);
CFUNC void
F77_FUNC_(feval_ubspline_3d_s_vgh,FEVAL_UBSPLINE_3D_S_VGH)
(UBspline_3d_s **spline, double *x, double *y, double *z,
float *val, float *grad, float *hess);
//////////////////////////////
// 3D double-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_3d_d,FEVAL_UBSPLINE_3D_D)
(UBspline_3d_d **spline, double *x, double *y, double *z,
double *val);
CFUNC void
F77_FUNC_(feval_ubspline_3d_d_vg,FEVAL_UBSPLINE_3D_D_VG)
(UBspline_3d_d **spline, double *x, double *y, double *z,
double *val, double *grad);
CFUNC void
F77_FUNC_(feval_ubspline_3d_d_vgl,FEVAL_UBSPLINE_3D_D_VGL)
(UBspline_3d_d **spline, double *x, double *y, double *z,
double *val, double *grad, double *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_3d_d_vgh,FEVAL_UBSPLINE_3D_D_VGH)
(UBspline_3d_d **spline, double *x, double *y, double *z,
double *val, double *grad, double *hess);
/////////////////////////////////
// 3D single-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_3d_c,FEVAL_UBSPLINE_3D_C)
(UBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val);
CFUNC void
F77_FUNC_(feval_ubspline_3d_c_vg,FEVAL_UBSPLINE_3D_C_VG)
(UBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val, complex_float *grad);
CFUNC void
F77_FUNC_(feval_ubspline_3d_c_vgl,FEVAL_UBSPLINE_3D_C_VGL)
(UBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val, complex_float *grad, complex_float *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_3d_c_vgh,FEVAL_UBSPLINE_3D_C_VGH)
(UBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 3D double-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_ubspline_3d_z,FEVAL_UBSPLINE_3D_Z)
(UBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val);
CFUNC void
F77_FUNC_(feval_ubspline_3d_z_vg,FEVAL_UBSPLINE_3D_Z_VG)
(UBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val, complex_double *grad);
CFUNC void
F77_FUNC_(feval_ubspline_3d_z_vgl,FEVAL_UBSPLINE_3D_Z_VGL)
(UBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val, complex_double *grad, complex_double *lapl);
CFUNC void
F77_FUNC_(feval_ubspline_3d_z_vgh,FEVAL_UBSPLINE_3D_Z_VGH)
(UBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val, complex_double *grad, complex_double *hess);
#undef CFUNC
#endif

View File

@ -0,0 +1,908 @@
#include "multi_bspline_create.h"
#include "multi_bspline.h"
#include "fmulti_bspline.h"
#include "config.h"
// CFUNC prefixes every function definition in this file.  Under a C++
// compiler it must expand to the full linkage specification extern "C";
// a bare "C" string literal in front of a declaration is not valid C++.
// Under a C compiler it expands to nothing.
#ifdef __cplusplus
#define CFUNC extern "C" /* Avoid name mangling in C++ */
#else
#define CFUNC
#endif
///////////////////////
// Creation routines //
///////////////////////
////////
// 1D //
////////
// Exported under the F77_FUNC_ name as the by-reference entry point for
// create_multi_UBspline_1d_s.  Every scalar argument is a pointer:
//   x0/x1/num_x are copied into a Ugrid's start/end/num,
//   x0_code/x1_code are cast to bc_code and stored as lCode/rCode,
//   x0_val/x1_val are stored as lVal/rVal,
//   num_splines is dereferenced and passed by value.
// The handle returned by create_multi_UBspline_1d_s is written to *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_s,FCREATE_MULTI_UBSPLINE_1D_S)
  (double *x0, double *x1, int *num_x,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   int *num_splines, multi_UBspline_1d_s **spline)
{
  Ugrid gx;
  BCtype_s bcx;

  // Boundary condition.
  bcx.lCode = (bc_code) *x0_code;
  bcx.lVal  = *x0_val;
  bcx.rCode = (bc_code) *x1_code;
  bcx.rVal  = *x1_val;

  // Grid extent and point count.
  gx.start = *x0;  gx.end = *x1;  gx.num = *num_x;

  *spline = create_multi_UBspline_1d_s (gx, bcx, *num_splines);
}
// Exported under the F77_FUNC_ name as the by-reference entry point for
// create_multi_UBspline_1d_d.  Every scalar argument is a pointer:
//   x0/x1/num_x are copied into a Ugrid's start/end/num,
//   x0_code/x1_code are cast to bc_code and stored as lCode/rCode,
//   x0_val/x1_val are stored as lVal/rVal,
//   num_splines is dereferenced and passed by value.
// The handle returned by create_multi_UBspline_1d_d is written to *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_d,FCREATE_MULTI_UBSPLINE_1D_D)
  (double *x0, double *x1, int *num_x,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   int *num_splines, multi_UBspline_1d_d **spline)
{
  Ugrid gx;
  BCtype_d bcx;

  // Boundary condition.
  bcx.lCode = (bc_code) *x0_code;
  bcx.lVal  = *x0_val;
  bcx.rCode = (bc_code) *x1_code;
  bcx.rVal  = *x1_val;

  // Grid extent and point count.
  gx.start = *x0;  gx.end = *x1;  gx.num = *num_x;

  *spline = create_multi_UBspline_1d_d (gx, bcx, *num_splines);
}
// Exported under the F77_FUNC_ name as the by-reference entry point for
// create_multi_UBspline_1d_c.  Every scalar argument is a pointer:
//   x0/x1/num_x are copied into a Ugrid's start/end/num,
//   x0_code/x1_code are cast to bc_code and stored as lCode/rCode,
//   x0_val/x1_val are split with crealf/cimagf into lVal_r/lVal_i and
//   rVal_r/rVal_i,
//   num_splines is dereferenced and passed by value.
// The handle returned by create_multi_UBspline_1d_c is written to *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_c,FCREATE_MULTI_UBSPLINE_1D_C)
  (double *x0, double *x1, int *num_x,
   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
   int *num_splines, multi_UBspline_1d_c **spline)
{
  Ugrid gx;
  BCtype_c bcx;

  // Boundary condition.
  bcx.lCode  = (bc_code) *x0_code;
  bcx.lVal_r = crealf(*x0_val);
  bcx.lVal_i = cimagf(*x0_val);
  bcx.rCode  = (bc_code) *x1_code;
  bcx.rVal_r = crealf(*x1_val);
  bcx.rVal_i = cimagf(*x1_val);

  // Grid extent and point count.
  gx.start = *x0;  gx.end = *x1;  gx.num = *num_x;

  *spline = create_multi_UBspline_1d_c (gx, bcx, *num_splines);
}
// Fortran-callable creation wrapper (1D, complex_double boundary values).
// Each complex boundary value is split into its real and imaginary parts
// with creal/cimag before being stored in the BCtype_z; the result of
// create_multi_UBspline_1d_z is stored in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_z,FCREATE_MULTI_UBSPLINE_1D_Z)
  (double *x0, double *x1, int *num_x,
   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
   int *num_splines, multi_UBspline_1d_z **spline)
{
  Ugrid grid_x;
  BCtype_z bc_x;

  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = creal(*x0_val);
  bc_x.lVal_i = cimag(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = creal(*x1_val);
  bc_x.rVal_i = cimag(*x1_val);

  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;

  *spline = create_multi_UBspline_1d_z (grid_x, bc_x, *num_splines);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the float data pointer, to
// set_multi_UBspline_1d_s.
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_s,FSET_MULTI_UBSPLINE_1D_S)
  (multi_UBspline_1d_s **spline, int *spline_num, float *data)
{
  multi_UBspline_1d_s *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_1d_s (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the double data pointer, to
// set_multi_UBspline_1d_d.
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_d,FSET_MULTI_UBSPLINE_1D_D)
  (multi_UBspline_1d_d **spline, int *spline_num, double *data)
{
  multi_UBspline_1d_d *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_1d_d (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the complex_float data pointer, to
// set_multi_UBspline_1d_c.
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_c,FSET_MULTI_UBSPLINE_1D_C)
  (multi_UBspline_1d_c **spline, int *spline_num, complex_float *data)
{
  multi_UBspline_1d_c *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_1d_c (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the complex_double data pointer, to
// set_multi_UBspline_1d_z.
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_z,FSET_MULTI_UBSPLINE_1D_Z)
  (multi_UBspline_1d_z **spline, int *spline_num, complex_double *data)
{
  multi_UBspline_1d_z *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_1d_z (handle, idx, data);
}
////////
// 2D //
////////
// Fortran-callable creation wrapper (2D, float boundary values).
// Fills one Ugrid and one BCtype_s per axis from the by-reference
// arguments, calls create_multi_UBspline_2d_s and stores the returned
// pointer in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_s,FCREATE_MULTI_UBSPLINE_2D_S)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
   int *num_splines, multi_UBspline_2d_s **spline)
{
  Ugrid grid_x, grid_y;
  BCtype_s bc_x, bc_y;

  // x axis: grid, then boundary condition
  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis: grid, then boundary condition
  grid_y.start = *y0;
  grid_y.end   = *y1;
  grid_y.num   = *num_y;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  *spline = create_multi_UBspline_2d_s (grid_x, grid_y, bc_x, bc_y,
                                        *num_splines);
}
// Fortran-callable creation wrapper (2D, double boundary values).
// Fills one Ugrid and one BCtype_d per axis from the by-reference
// arguments, calls create_multi_UBspline_2d_d and stores the returned
// pointer in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_d,FCREATE_MULTI_UBSPLINE_2D_D)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
   int *num_splines, multi_UBspline_2d_d **spline)
{
  Ugrid grid_x, grid_y;
  BCtype_d bc_x, bc_y;

  // x axis: grid, then boundary condition
  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis: grid, then boundary condition
  grid_y.start = *y0;
  grid_y.end   = *y1;
  grid_y.num   = *num_y;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  *spline = create_multi_UBspline_2d_d (grid_x, grid_y, bc_x, bc_y,
                                        *num_splines);
}
// Fortran-callable creation wrapper (2D, complex_float boundary values).
// Fills one Ugrid and one BCtype_c per axis; complex boundary values are
// split with crealf/cimagf.  The result of create_multi_UBspline_2d_c is
// stored in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_c,FCREATE_MULTI_UBSPLINE_2D_C)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
   int *num_splines, multi_UBspline_2d_c **spline)
{
  Ugrid grid_x, grid_y;
  BCtype_c bc_x, bc_y;

  // x axis: grid, then boundary condition
  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = crealf(*x0_val);
  bc_x.lVal_i = cimagf(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = crealf(*x1_val);
  bc_x.rVal_i = cimagf(*x1_val);

  // y axis: grid, then boundary condition
  grid_y.start = *y0;
  grid_y.end   = *y1;
  grid_y.num   = *num_y;
  bc_y.lCode  = (bc_code) *y0_code;
  bc_y.lVal_r = crealf(*y0_val);
  bc_y.lVal_i = cimagf(*y0_val);
  bc_y.rCode  = (bc_code) *y1_code;
  bc_y.rVal_r = crealf(*y1_val);
  bc_y.rVal_i = cimagf(*y1_val);

  *spline = create_multi_UBspline_2d_c (grid_x, grid_y, bc_x, bc_y,
                                        *num_splines);
}
// Fortran-callable creation wrapper (2D, complex_double boundary values).
//
// Arguments (all passed by reference):
//   x0, x1, num_x / y0, y1, num_y  -> start, end, num of the x / y Ugrid
//   x0_code, x1_code, y0_code, y1_code -> left/right boundary codes
//   x0_val, x1_val, y0_val, y1_val     -> left/right complex boundary values
//   num_splines                        -> forwarded to the create routine
//   spline                             -> receives the pointer returned by
//                                         create_multi_UBspline_2d_z
//
// The boundary values are complex_double, so their parts are extracted
// with the double-precision creal/cimag on BOTH axes.  Using the float
// variants crealf/cimagf here would round the x-axis values to single
// precision before they are stored.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_z,FCREATE_MULTI_UBSPLINE_2D_Z)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
   int *num_splines, multi_UBspline_2d_z **spline)
{
  Ugrid xgrid, ygrid;
  BCtype_z xBC, yBC;

  xgrid.start = *x0;
  xgrid.end   = *x1;
  xgrid.num   = *num_x;
  ygrid.start = *y0;
  ygrid.end   = *y1;
  ygrid.num   = *num_y;

  xBC.lCode  = (bc_code) *x0_code;
  xBC.rCode  = (bc_code) *x1_code;
  xBC.lVal_r = creal(*x0_val);
  xBC.lVal_i = cimag(*x0_val);
  xBC.rVal_r = creal(*x1_val);
  xBC.rVal_i = cimag(*x1_val);

  yBC.lCode  = (bc_code) *y0_code;
  yBC.rCode  = (bc_code) *y1_code;
  yBC.lVal_r = creal(*y0_val);
  yBC.lVal_i = cimag(*y0_val);
  yBC.rVal_r = creal(*y1_val);
  yBC.rVal_i = cimag(*y1_val);

  *spline = create_multi_UBspline_2d_z (xgrid, ygrid, xBC, yBC, *num_splines);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the float data pointer, to
// set_multi_UBspline_2d_s.
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_s,FSET_MULTI_UBSPLINE_2D_S)
  (multi_UBspline_2d_s **spline, int *spline_num, float *data)
{
  multi_UBspline_2d_s *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_2d_s (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the double data pointer, to
// set_multi_UBspline_2d_d.
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_d,FSET_MULTI_UBSPLINE_2D_D)
  (multi_UBspline_2d_d **spline, int *spline_num, double *data)
{
  multi_UBspline_2d_d *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_2d_d (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the complex_float data pointer, to
// set_multi_UBspline_2d_c.
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_c,FSET_MULTI_UBSPLINE_2D_C)
  (multi_UBspline_2d_c **spline, int *spline_num, complex_float *data)
{
  multi_UBspline_2d_c *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_2d_c (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the complex_double data pointer, to
// set_multi_UBspline_2d_z.
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_z,FSET_MULTI_UBSPLINE_2D_Z)
  (multi_UBspline_2d_z **spline, int *spline_num, complex_double *data)
{
  multi_UBspline_2d_z *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_2d_z (handle, idx, data);
}
////////
// 3D //
////////
// Fortran-callable creation wrapper (3D, float boundary values).
// Fills one Ugrid and one BCtype_s per axis from the by-reference
// arguments, calls create_multi_UBspline_3d_s and stores the returned
// pointer in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_s,FCREATE_MULTI_UBSPLINE_3D_S)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
   int *z0_code, float *z0_val, int *z1_code, float *z1_val,
   int *num_splines, multi_UBspline_3d_s **spline)
{
  Ugrid grid_x, grid_y, grid_z;
  BCtype_s bc_x, bc_y, bc_z;

  // x axis: grid, then boundary condition
  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis: grid, then boundary condition
  grid_y.start = *y0;
  grid_y.end   = *y1;
  grid_y.num   = *num_y;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  // z axis: grid, then boundary condition
  grid_z.start = *z0;
  grid_z.end   = *z1;
  grid_z.num   = *num_z;
  bc_z.lCode = (bc_code) *z0_code;
  bc_z.lVal  = *z0_val;
  bc_z.rCode = (bc_code) *z1_code;
  bc_z.rVal  = *z1_val;

  *spline = create_multi_UBspline_3d_s (grid_x, grid_y, grid_z,
                                        bc_x, bc_y, bc_z, *num_splines);
}
// Fortran-callable creation wrapper (3D, double boundary values).
// Fills one Ugrid and one BCtype_d per axis from the by-reference
// arguments, calls create_multi_UBspline_3d_d and stores the returned
// pointer in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_d,FCREATE_MULTI_UBSPLINE_3D_D)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
   int *z0_code, double *z0_val, int *z1_code, double *z1_val,
   int *num_splines, multi_UBspline_3d_d **spline)
{
  Ugrid grid_x, grid_y, grid_z;
  BCtype_d bc_x, bc_y, bc_z;

  // x axis: grid, then boundary condition
  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis: grid, then boundary condition
  grid_y.start = *y0;
  grid_y.end   = *y1;
  grid_y.num   = *num_y;
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  // z axis: grid, then boundary condition
  grid_z.start = *z0;
  grid_z.end   = *z1;
  grid_z.num   = *num_z;
  bc_z.lCode = (bc_code) *z0_code;
  bc_z.lVal  = *z0_val;
  bc_z.rCode = (bc_code) *z1_code;
  bc_z.rVal  = *z1_val;

  *spline = create_multi_UBspline_3d_d (grid_x, grid_y, grid_z,
                                        bc_x, bc_y, bc_z, *num_splines);
}
// Fortran-callable creation wrapper (3D, complex_float boundary values).
// Fills one Ugrid and one BCtype_c per axis; complex boundary values are
// split with crealf/cimagf.  The result of create_multi_UBspline_3d_c is
// stored in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_c,FCREATE_MULTI_UBSPLINE_3D_C)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
   int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
   int *num_splines, multi_UBspline_3d_c **spline)
{
  Ugrid grid_x, grid_y, grid_z;
  BCtype_c bc_x, bc_y, bc_z;

  // x axis: grid, then boundary condition
  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = crealf(*x0_val);
  bc_x.lVal_i = cimagf(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = crealf(*x1_val);
  bc_x.rVal_i = cimagf(*x1_val);

  // y axis: grid, then boundary condition
  grid_y.start = *y0;
  grid_y.end   = *y1;
  grid_y.num   = *num_y;
  bc_y.lCode  = (bc_code) *y0_code;
  bc_y.lVal_r = crealf(*y0_val);
  bc_y.lVal_i = cimagf(*y0_val);
  bc_y.rCode  = (bc_code) *y1_code;
  bc_y.rVal_r = crealf(*y1_val);
  bc_y.rVal_i = cimagf(*y1_val);

  // z axis: grid, then boundary condition
  grid_z.start = *z0;
  grid_z.end   = *z1;
  grid_z.num   = *num_z;
  bc_z.lCode  = (bc_code) *z0_code;
  bc_z.lVal_r = crealf(*z0_val);
  bc_z.lVal_i = cimagf(*z0_val);
  bc_z.rCode  = (bc_code) *z1_code;
  bc_z.rVal_r = crealf(*z1_val);
  bc_z.rVal_i = cimagf(*z1_val);

  *spline = create_multi_UBspline_3d_c (grid_x, grid_y, grid_z,
                                        bc_x, bc_y, bc_z, *num_splines);
}
// Fortran-callable creation wrapper (3D, complex_double boundary values).
// Fills one Ugrid and one BCtype_z per axis; complex boundary values are
// split with creal/cimag.  The result of create_multi_UBspline_3d_z is
// stored in *spline.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_z,FCREATE_MULTI_UBSPLINE_3D_Z)
  (double *x0, double *x1, int *num_x,
   double *y0, double *y1, int *num_y,
   double *z0, double *z1, int *num_z,
   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
   int *z0_code, complex_double *z0_val, int *z1_code, complex_double *z1_val,
   int *num_splines, multi_UBspline_3d_z **spline)
{
  Ugrid grid_x, grid_y, grid_z;
  BCtype_z bc_x, bc_y, bc_z;

  // x axis: grid, then boundary condition
  grid_x.start = *x0;
  grid_x.end   = *x1;
  grid_x.num   = *num_x;
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = creal(*x0_val);
  bc_x.lVal_i = cimag(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = creal(*x1_val);
  bc_x.rVal_i = cimag(*x1_val);

  // y axis: grid, then boundary condition
  grid_y.start = *y0;
  grid_y.end   = *y1;
  grid_y.num   = *num_y;
  bc_y.lCode  = (bc_code) *y0_code;
  bc_y.lVal_r = creal(*y0_val);
  bc_y.lVal_i = cimag(*y0_val);
  bc_y.rCode  = (bc_code) *y1_code;
  bc_y.rVal_r = creal(*y1_val);
  bc_y.rVal_i = cimag(*y1_val);

  // z axis: grid, then boundary condition
  grid_z.start = *z0;
  grid_z.end   = *z1;
  grid_z.num   = *num_z;
  bc_z.lCode  = (bc_code) *z0_code;
  bc_z.lVal_r = creal(*z0_val);
  bc_z.lVal_i = cimag(*z0_val);
  bc_z.rCode  = (bc_code) *z1_code;
  bc_z.rVal_r = creal(*z1_val);
  bc_z.rVal_i = cimag(*z1_val);

  *spline = create_multi_UBspline_3d_z (grid_x, grid_y, grid_z,
                                        bc_x, bc_y, bc_z, *num_splines);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the float data pointer, to
// set_multi_UBspline_3d_s.
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_s,FSET_MULTI_UBSPLINE_3D_S)
  (multi_UBspline_3d_s **spline, int *spline_num, float *data)
{
  multi_UBspline_3d_s *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_3d_s (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the double data pointer, to
// set_multi_UBspline_3d_d.
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_d,FSET_MULTI_UBSPLINE_3D_D)
  (multi_UBspline_3d_d **spline, int *spline_num, double *data)
{
  multi_UBspline_3d_d *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_3d_d (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the complex_float data pointer, to
// set_multi_UBspline_3d_c.
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_c,FSET_MULTI_UBSPLINE_3D_C)
  (multi_UBspline_3d_c **spline, int *spline_num, complex_float *data)
{
  multi_UBspline_3d_c *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_3d_c (handle, idx, data);
}
// Fortran-callable setter: dereferences the handle and the spline index
// and forwards them, with the complex_double data pointer, to
// set_multi_UBspline_3d_z.
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_z,FSET_MULTI_UBSPLINE_3D_Z)
  (multi_UBspline_3d_z **spline, int *spline_num, complex_double *data)
{
  multi_UBspline_3d_z *handle = *spline;
  const int idx = *spline_num;
  set_multi_UBspline_3d_z (handle, idx, data);
}
/////////////////////////
// Evaluation routines //
/////////////////////////
//////////////////////////////
// 1D single-precision real //
//////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, to eval_multi_UBspline_1d_s.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s,FEVAL_MULTI_UBSPLINE_1D_S)
  (multi_UBspline_1d_s **spline, double *x, float *val)
{
  multi_UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_s (handle, px, val);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_1d_s_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s_vg,FEVAL_MULTI_UBSPLINE_1D_S_VG)
  (multi_UBspline_1d_s **spline, double *x, float *val, float *grad)
{
  multi_UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_s_vg (handle, px, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_1d_s_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s_vgl,FEVAL_MULTI_UBSPLINE_1D_S_VGL)
  (multi_UBspline_1d_s **spline, double *x,
   float *val, float *grad, float *lapl)
{
  multi_UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_s_vgl (handle, px, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_1d_s_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s_vgh,FEVAL_MULTI_UBSPLINE_1D_S_VGH)
  (multi_UBspline_1d_s **spline, double *x,
   float *val, float *grad, float *hess)
{
  multi_UBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_s_vgh (handle, px, val, grad, hess);
}
//////////////////////////////
// 1D double-precision real //
//////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, to eval_multi_UBspline_1d_d.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d,FEVAL_MULTI_UBSPLINE_1D_D)
  (multi_UBspline_1d_d **spline, double *x, double *val)
{
  multi_UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_d (handle, px, val);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_1d_d_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d_vg,FEVAL_MULTI_UBSPLINE_1D_D_VG)
  (multi_UBspline_1d_d **spline, double *x,
   double *val, double *grad)
{
  multi_UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_d_vg (handle, px, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_1d_d_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d_vgl,FEVAL_MULTI_UBSPLINE_1D_D_VGL)
  (multi_UBspline_1d_d **spline, double *x,
   double *val, double *grad, double *lapl)
{
  multi_UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_d_vgl (handle, px, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_1d_d_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d_vgh,FEVAL_MULTI_UBSPLINE_1D_D_VGH)
  (multi_UBspline_1d_d **spline, double *x,
   double *val, double *grad, double *hess)
{
  multi_UBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_d_vgh (handle, px, val, grad, hess);
}
/////////////////////////////////
// 1D single-precision complex //
/////////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, to eval_multi_UBspline_1d_c.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c,FEVAL_MULTI_UBSPLINE_1D_C)
  (multi_UBspline_1d_c **spline, double *x, complex_float *val)
{
  multi_UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_c (handle, px, val);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_1d_c_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c_vg,FEVAL_MULTI_UBSPLINE_1D_C_VG)
  (multi_UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad)
{
  multi_UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_c_vg (handle, px, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_1d_c_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c_vgl,FEVAL_MULTI_UBSPLINE_1D_C_VGL)
  (multi_UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  multi_UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_c_vgl (handle, px, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_1d_c_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c_vgh,FEVAL_MULTI_UBSPLINE_1D_C_VGH)
  (multi_UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  multi_UBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_c_vgh (handle, px, val, grad, hess);
}
/////////////////////////////////
// 1D double-precision complex //
/////////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, to eval_multi_UBspline_1d_z.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z,FEVAL_MULTI_UBSPLINE_1D_Z)
  (multi_UBspline_1d_z **spline, double *x, complex_double *val)
{
  multi_UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_z (handle, px, val);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_1d_z_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z_vg,FEVAL_MULTI_UBSPLINE_1D_Z_VG)
  (multi_UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad)
{
  multi_UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_z_vg (handle, px, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_1d_z_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z_vgl,FEVAL_MULTI_UBSPLINE_1D_Z_VGL)
  (multi_UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  multi_UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_z_vgl (handle, px, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the coordinate
// and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_1d_z_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z_vgh,FEVAL_MULTI_UBSPLINE_1D_Z_VGH)
  (multi_UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  multi_UBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_multi_UBspline_1d_z_vgh (handle, px, val, grad, hess);
}
//////////////////////////////
// 2D single-precision real //
//////////////////////////////
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, to eval_multi_UBspline_2d_s.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s,FEVAL_MULTI_UBSPLINE_2D_S)
  (multi_UBspline_2d_s **spline, double *x, double *y, float *val)
{
  multi_UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_s (handle, px, py, val);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_2d_s_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s_vg,FEVAL_MULTI_UBSPLINE_2D_S_VG)
  (multi_UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad)
{
  multi_UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_s_vg (handle, px, py, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_2d_s_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s_vgl,FEVAL_MULTI_UBSPLINE_2D_S_VGL)
  (multi_UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float* lapl)
{
  multi_UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_s_vgl (handle, px, py, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_2d_s_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s_vgh,FEVAL_MULTI_UBSPLINE_2D_S_VGH)
  (multi_UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float *hess)
{
  multi_UBspline_2d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_s_vgh (handle, px, py, val, grad, hess);
}
//////////////////////////////
// 2D double-precision real //
//////////////////////////////
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, to eval_multi_UBspline_2d_d.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d,FEVAL_MULTI_UBSPLINE_2D_D)
  (multi_UBspline_2d_d **spline, double *x, double *y, double *val)
{
  multi_UBspline_2d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_d (handle, px, py, val);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_2d_d_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d_vg,FEVAL_MULTI_UBSPLINE_2D_D_VG)
  (multi_UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad)
{
  multi_UBspline_2d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_d_vg (handle, px, py, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_2d_d_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d_vgl,FEVAL_MULTI_UBSPLINE_2D_D_VGL)
  (multi_UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *lapl)
{
  multi_UBspline_2d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_d_vgl (handle, px, py, val, grad, lapl);
}
// Fortran-callable wrapper for the 2D double-precision real
// value/gradient/Hessian evaluation.
//
// All arguments are passed by reference.  The handle and the x/y
// coordinates are dereferenced and forwarded together with the val, grad
// and hess pointers to eval_multi_UBspline_2d_d_vgh.
//
// This must call the _vgh routine, like every other *_vgh wrapper in
// this file; calling the _vgl variant would hand the `hess` buffer to a
// routine that takes a `lapl` argument in that position.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d_vgh,FEVAL_MULTI_UBSPLINE_2D_D_VGH)
  (multi_UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *hess)
{
  eval_multi_UBspline_2d_d_vgh (*spline, *x, *y, val, grad, hess);
}
/////////////////////////////////
// 2D single-precision complex //
/////////////////////////////////
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, to eval_multi_UBspline_2d_c.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c,FEVAL_MULTI_UBSPLINE_2D_C)
  (multi_UBspline_2d_c **spline, double *x, double *y, complex_float *val)
{
  multi_UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_c (handle, px, py, val);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_2d_c_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c_vg,FEVAL_MULTI_UBSPLINE_2D_C_VG)
  (multi_UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad)
{
  multi_UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_c_vg (handle, px, py, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_2d_c_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c_vgl,FEVAL_MULTI_UBSPLINE_2D_C_VGL)
  (multi_UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  multi_UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_c_vgl (handle, px, py, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_2d_c_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c_vgh,FEVAL_MULTI_UBSPLINE_2D_C_VGH)
  (multi_UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  multi_UBspline_2d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_c_vgh (handle, px, py, val, grad, hess);
}
/////////////////////////////////
// 2D double-precision complex //
/////////////////////////////////
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, to eval_multi_UBspline_2d_z.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z,FEVAL_MULTI_UBSPLINE_2D_Z)
  (multi_UBspline_2d_z **spline, double *x, double *y, complex_double *val)
{
  multi_UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_z (handle, px, py, val);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_2d_z_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z_vg,FEVAL_MULTI_UBSPLINE_2D_Z_VG)
  (multi_UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad)
{
  multi_UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_z_vg (handle, px, py, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_2d_z_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z_vgl,FEVAL_MULTI_UBSPLINE_2D_Z_VGL)
  (multi_UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  multi_UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_z_vgl (handle, px, py, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and both coordinates
// and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_2d_z_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z_vgh,FEVAL_MULTI_UBSPLINE_2D_Z_VGH)
  (multi_UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  multi_UBspline_2d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  eval_multi_UBspline_2d_z_vgh (handle, px, py, val, grad, hess);
}
//////////////////////////////
// 3D single-precision real //
//////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, to
// eval_multi_UBspline_3d_s.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s,FEVAL_MULTI_UBSPLINE_3D_S)
  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val)
{
  multi_UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_s (handle, px, py, pz, val);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_3d_s_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s_vg,FEVAL_MULTI_UBSPLINE_3D_S_VG)
  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad)
{
  multi_UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_s_vg (handle, px, py, pz, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_3d_s_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s_vgl,FEVAL_MULTI_UBSPLINE_3D_S_VGL)
  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad, float* lapl)
{
  multi_UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_s_vgl (handle, px, py, pz, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_3d_s_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s_vgh,FEVAL_MULTI_UBSPLINE_3D_S_VGH)
  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad, float *hess)
{
  multi_UBspline_3d_s *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_s_vgh (handle, px, py, pz, val, grad, hess);
}
//////////////////////////////
// 3D double-precision real //
//////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, to
// eval_multi_UBspline_3d_d.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d,FEVAL_MULTI_UBSPLINE_3D_D)
  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val)
{
  multi_UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_d (handle, px, py, pz, val);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_3d_d_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d_vg,FEVAL_MULTI_UBSPLINE_3D_D_VG)
  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad)
{
  multi_UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_d_vg (handle, px, py, pz, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_3d_d_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d_vgl,FEVAL_MULTI_UBSPLINE_3D_D_VGL)
  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad, double *lapl)
{
  multi_UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_d_vgl (handle, px, py, pz, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_3d_d_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d_vgh,FEVAL_MULTI_UBSPLINE_3D_D_VGH)
  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad, double *hess)
{
  multi_UBspline_3d_d *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_d_vgh (handle, px, py, pz, val, grad, hess);
}
/////////////////////////////////
// 3D single-precision complex //
/////////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, to
// eval_multi_UBspline_3d_c.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c,FEVAL_MULTI_UBSPLINE_3D_C)
  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val)
{
  multi_UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_c (handle, px, py, pz, val);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_3d_c_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c_vg,FEVAL_MULTI_UBSPLINE_3D_C_VG)
  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad)
{
  multi_UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_c_vg (handle, px, py, pz, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_3d_c_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c_vgl,FEVAL_MULTI_UBSPLINE_3D_C_VGL)
  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  multi_UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_c_vgl (handle, px, py, pz, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_3d_c_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c_vgh,FEVAL_MULTI_UBSPLINE_3D_C_VGH)
  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  multi_UBspline_3d_c *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_c_vgh (handle, px, py, pz, val, grad, hess);
}
/////////////////////////////////
// 3D double-precision complex //
/////////////////////////////////
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, to
// eval_multi_UBspline_3d_z.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z,FEVAL_MULTI_UBSPLINE_3D_Z)
  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val)
{
  multi_UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_z (handle, px, py, pz, val);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val` and `grad`, to
// eval_multi_UBspline_3d_z_vg.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z_vg,FEVAL_MULTI_UBSPLINE_3D_Z_VG)
  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad)
{
  multi_UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_z_vg (handle, px, py, pz, val, grad);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `lapl`, to
// eval_multi_UBspline_3d_z_vgl.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z_vgl,FEVAL_MULTI_UBSPLINE_3D_Z_VGL)
  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  multi_UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_z_vgl (handle, px, py, pz, val, grad, lapl);
}
// Fortran-callable wrapper: dereferences the handle and the three
// coordinates and forwards them, with `val`, `grad` and `hess`, to
// eval_multi_UBspline_3d_z_vgh.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z_vgh,FEVAL_MULTI_UBSPLINE_3D_Z_VGH)
  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  multi_UBspline_3d_z *handle = *spline;
  const double px = *x;
  const double py = *y;
  const double pz = *z;
  eval_multi_UBspline_3d_z_vgh (handle, px, py, pz, val, grad, hess);
}

View File

@ -0,0 +1,440 @@
#ifndef FMULTI_BSPLINE_H
#define FMULTI_BSPLINE_H
#include "config.h"
#include "bspline_base.h"
#include "bspline_create.h"
// CFUNC is the prefix placed in front of every prototype in this header:
// it expands to `extern "C"` when the header is compiled as C++ and to
// nothing when it is compiled as C.
#ifdef __cplusplus
#define CFUNC extern "C" /* Avoid name mangling in C++ */
#else
#define CFUNC
#endif
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Creation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
////////
// 1D //
////////
// Prototypes of the Fortran-callable 1D creation routines, one per data
// type (s: float, d: double, c: complex_float, z: complex_double).
// Every argument is passed by reference:
//   x0, x1, num_x      grid start, end and num
//   x0_code, x1_code   left / right boundary codes (cast to bc_code by
//                      the definitions)
//   x0_val, x1_val     left / right boundary values
//   num_splines        forwarded to create_multi_UBspline_1d_*
//   spline             receives the pointer returned by the create call
// Parameter names agree with the definitions in fmulti_bspline.c.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_s,FCREATE_MULTI_UBSPLINE_1D_S)
(double *x0, double *x1, int *num_x,
int *x0_code, float *x0_val, int *x1_code, float *x1_val,
int *num_splines, multi_UBspline_1d_s **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_d,FCREATE_MULTI_UBSPLINE_1D_D)
(double *x0, double *x1, int *num_x,
int *x0_code, double *x0_val, int *x1_code, double *x1_val,
int *num_splines, multi_UBspline_1d_d **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_c,FCREATE_MULTI_UBSPLINE_1D_C)
(double *x0, double *x1, int *num_x,
int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
int *num_splines, multi_UBspline_1d_c **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_1d_z,FCREATE_MULTI_UBSPLINE_1D_Z)
(double *x0, double *x1, int *num_x,
int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
int *num_splines, multi_UBspline_1d_z **spline);
// Prototypes of the Fortran-callable 1D setters, one per data type
// (s: float, d: double, c: complex_float, z: complex_double).
// `spline` points to the multi-spline handle and `spline_num` to the
// spline index; the definitions dereference both and forward them with
// `data` to set_multi_UBspline_1d_*.
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_s,FSET_MULTI_UBSPLINE_1D_S)
(multi_UBspline_1d_s **spline, int *spline_num, float *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_d,FSET_MULTI_UBSPLINE_1D_D)
(multi_UBspline_1d_d **spline, int *spline_num, double *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_c,FSET_MULTI_UBSPLINE_1D_C)
(multi_UBspline_1d_c **spline, int *spline_num, complex_float *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_1d_z,FSET_MULTI_UBSPLINE_1D_Z)
(multi_UBspline_1d_z **spline, int *spline_num, complex_double *data);
////////
// 2D //
////////
// Prototypes of the Fortran-callable 2D creation routines, one per data
// type (s: float, d: double, c: complex_float, z: complex_double).
// Every argument is passed by reference.  For each axis (x, y) the
// routine takes the grid start, end and num, followed by the left/right
// boundary code and value pairs; `num_splines` is forwarded to
// create_multi_UBspline_2d_* and `spline` receives the returned pointer.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_s,FCREATE_MULTI_UBSPLINE_2D_S)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, float *x0_val, int *x1_code, float *x1_val,
int *y0_code, float *y0_val, int *y1_code, float *y1_val,
int *num_splines, multi_UBspline_2d_s **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_d,FCREATE_MULTI_UBSPLINE_2D_D)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, double *x0_val, int *x1_code, double *x1_val,
int *y0_code, double *y0_val, int *y1_code, double *y1_val,
int *num_splines, multi_UBspline_2d_d **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_c,FCREATE_MULTI_UBSPLINE_2D_C)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
int *num_splines, multi_UBspline_2d_c **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_2d_z,FCREATE_MULTI_UBSPLINE_2D_Z)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
int *num_splines, multi_UBspline_2d_z **spline);
// Prototypes of the Fortran-callable 2D setters, one per data type
// (s: float, d: double, c: complex_float, z: complex_double).
// `spline` points to the multi-spline handle and `spline_num` to the
// spline index; the definitions dereference both and forward them with
// `data` to set_multi_UBspline_2d_*.
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_s,FSET_MULTI_UBSPLINE_2D_S)
(multi_UBspline_2d_s **spline, int *spline_num, float *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_d,FSET_MULTI_UBSPLINE_2D_D)
(multi_UBspline_2d_d **spline, int *spline_num, double *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_c,FSET_MULTI_UBSPLINE_2D_C)
(multi_UBspline_2d_c **spline, int *spline_num, complex_float *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_2d_z,FSET_MULTI_UBSPLINE_2D_Z)
(multi_UBspline_2d_z **spline, int *spline_num, complex_double *data);
////////
// 3D //
////////
// Prototypes of the Fortran-callable 3D creation routines, one per data
// type (s: float, d: double, c: complex_float, z: complex_double).
// Every argument is passed by reference.  For each axis (x, y, z) the
// routine takes the grid start, end and num, followed by the left/right
// boundary code and value pairs; `num_splines` is forwarded to
// create_multi_UBspline_3d_* and `spline` receives the returned pointer.
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_s,FCREATE_MULTI_UBSPLINE_3D_S)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, float *x0_val, int *x1_code, float *x1_val,
int *y0_code, float *y0_val, int *y1_code, float *y1_val,
int *z0_code, float *z0_val, int *z1_code, float *z1_val,
int *num_splines, multi_UBspline_3d_s **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_d,FCREATE_MULTI_UBSPLINE_3D_D)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, double *x0_val, int *x1_code, double *x1_val,
int *y0_code, double *y0_val, int *y1_code, double *y1_val,
int *z0_code, double *z0_val, int *z1_code, double *z1_val,
int *num_splines, multi_UBspline_3d_d **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_c,FCREATE_MULTI_UBSPLINE_3D_C)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
int *num_splines, multi_UBspline_3d_c **spline);
CFUNC void
F77_FUNC_(fcreate_multi_ubspline_3d_z,FCREATE_MULTI_UBSPLINE_3D_Z)
(double *x0, double *x1, int *num_x,
double *y0, double *y1, int *num_y,
double *z0, double *z1, int *num_z,
int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
int *z0_code, complex_double *z0_val, int *z1_code, complex_double *z1_val,
int *num_splines, multi_UBspline_3d_z **spline);
// Prototypes of the Fortran-callable 3D setters, one per data type
// (s: float, d: double, c: complex_float, z: complex_double).
// `spline` points to the multi-spline handle and `spline_num` to the
// spline index; the definitions dereference both and forward them with
// `data` to set_multi_UBspline_3d_*.
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_s,FSET_MULTI_UBSPLINE_3D_S)
(multi_UBspline_3d_s **spline, int *spline_num, float *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_d,FSET_MULTI_UBSPLINE_3D_D)
(multi_UBspline_3d_d **spline, int *spline_num, double *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_c,FSET_MULTI_UBSPLINE_3D_C)
(multi_UBspline_3d_c **spline, int *spline_num, complex_float *data);
CFUNC void
F77_FUNC_(fset_multi_ubspline_3d_z,FSET_MULTI_UBSPLINE_3D_Z)
(multi_UBspline_3d_z **spline, int *spline_num, complex_double *data);
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Destruction routine ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
// Prototype of the Fortran-callable destruction routine; it takes the
// address of a Bspline pointer.
// NOTE(review): the definition is not in this part of the source --
// confirm which translation unit provides it and whether it is meant to
// be used with multi_UBspline_* handles as well.
CFUNC void
F77_FUNC_(fdestroy_bspline,FDESTROY_BSPLINE)
(Bspline **spline);
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Evaluation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//////////////////////////////
// 1D single-precision real //
//////////////////////////////
// Prototypes of the Fortran-callable 1D single-precision real evaluation
// routines.  The names carry the `multi_` infix so that they declare the
// functions actually defined in fmulti_bspline.c, which take
// multi_UBspline_1d_s handles.
// All arguments are passed by reference: `spline` points to the handle,
// `x` to the coordinate; val, grad, lapl and hess are forwarded unchanged
// to eval_multi_UBspline_1d_s{,_vg,_vgl,_vgh}.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s,FEVAL_MULTI_UBSPLINE_1D_S)
(multi_UBspline_1d_s **spline, double *x, float *val);
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s_vg,FEVAL_MULTI_UBSPLINE_1D_S_VG)
(multi_UBspline_1d_s **spline, double *x, float *val, float *grad);
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s_vgl,FEVAL_MULTI_UBSPLINE_1D_S_VGL)
(multi_UBspline_1d_s **spline, double *x,
float *val, float *grad, float *lapl);
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_s_vgh,FEVAL_MULTI_UBSPLINE_1D_S_VGH)
(multi_UBspline_1d_s **spline, double *x,
float *val, float *grad, float *hess);
//////////////////////////////
// 1D double-precision real //
//////////////////////////////
// Prototypes of the Fortran-callable 1D double-precision real evaluation
// routines.  All arguments are passed by reference: `spline` points to
// the handle, `x` to the coordinate; val, grad, lapl and hess are
// forwarded unchanged to eval_multi_UBspline_1d_d{,_vg,_vgl,_vgh}.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d,FEVAL_MULTI_UBSPLINE_1D_D)
(multi_UBspline_1d_d **spline, double *x, double *val);
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d_vg,FEVAL_MULTI_UBSPLINE_1D_D_VG)
(multi_UBspline_1d_d **spline, double *x,
double *val, double *grad);
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d_vgl,FEVAL_MULTI_UBSPLINE_1D_D_VGL)
(multi_UBspline_1d_d **spline, double *x,
double *val, double *grad, double *lapl);
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_d_vgh,FEVAL_MULTI_UBSPLINE_1D_D_VGH)
(multi_UBspline_1d_d **spline, double *x,
double *val, double *grad, double *hess);
/////////////////////////////////
// 1D single-precision complex //
/////////////////////////////////
// 1D single-precision complex evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c,FEVAL_MULTI_UBSPLINE_1D_C)
  (multi_UBspline_1d_c **spline, double *x,
   complex_float *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c_vg,FEVAL_MULTI_UBSPLINE_1D_C_VG)
  (multi_UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c_vgl,FEVAL_MULTI_UBSPLINE_1D_C_VGL)
  (multi_UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_c_vgh,FEVAL_MULTI_UBSPLINE_1D_C_VGH)
  (multi_UBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 1D double-precision complex //
/////////////////////////////////
// 1D double-precision complex evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z,FEVAL_MULTI_UBSPLINE_1D_Z)
  (multi_UBspline_1d_z **spline, double *x,
   complex_double *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z_vg,FEVAL_MULTI_UBSPLINE_1D_Z_VG)
  (multi_UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z_vgl,FEVAL_MULTI_UBSPLINE_1D_Z_VGL)
  (multi_UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_1d_z_vgh,FEVAL_MULTI_UBSPLINE_1D_Z_VGH)
  (multi_UBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *hess);
//////////////////////////////
// 2D single-precision real //
//////////////////////////////
// 2D single-precision real evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s,FEVAL_MULTI_UBSPLINE_2D_S)
  (multi_UBspline_2d_s **spline, double *x, double *y,
   float *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s_vg,FEVAL_MULTI_UBSPLINE_2D_S_VG)
  (multi_UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s_vgl,FEVAL_MULTI_UBSPLINE_2D_S_VGL)
  (multi_UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_s_vgh,FEVAL_MULTI_UBSPLINE_2D_S_VGH)
  (multi_UBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float *hess);
//////////////////////////////
// 2D double-precision real //
//////////////////////////////
// 2D double-precision real evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d,FEVAL_MULTI_UBSPLINE_2D_D)
  (multi_UBspline_2d_d **spline, double *x, double *y,
   double *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d_vg,FEVAL_MULTI_UBSPLINE_2D_D_VG)
  (multi_UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d_vgl,FEVAL_MULTI_UBSPLINE_2D_D_VGL)
  (multi_UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_d_vgh,FEVAL_MULTI_UBSPLINE_2D_D_VGH)
  (multi_UBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *hess);
/////////////////////////////////
// 2D single-precision complex //
/////////////////////////////////
// 2D single-precision complex evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c,FEVAL_MULTI_UBSPLINE_2D_C)
  (multi_UBspline_2d_c **spline, double *x, double *y,
   complex_float *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c_vg,FEVAL_MULTI_UBSPLINE_2D_C_VG)
  (multi_UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c_vgl,FEVAL_MULTI_UBSPLINE_2D_C_VGL)
  (multi_UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_c_vgh,FEVAL_MULTI_UBSPLINE_2D_C_VGH)
  (multi_UBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 2D double-precision complex //
/////////////////////////////////
// 2D double-precision complex evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z,FEVAL_MULTI_UBSPLINE_2D_Z)
  (multi_UBspline_2d_z **spline, double *x, double *y,
   complex_double *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z_vg,FEVAL_MULTI_UBSPLINE_2D_Z_VG)
  (multi_UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z_vgl,FEVAL_MULTI_UBSPLINE_2D_Z_VGL)
  (multi_UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_2d_z_vgh,FEVAL_MULTI_UBSPLINE_2D_Z_VGH)
  (multi_UBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *hess);
//////////////////////////////
// 3D single-precision real //
//////////////////////////////
// 3D single-precision real evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s,FEVAL_MULTI_UBSPLINE_3D_S)
  (multi_UBspline_3d_s **spline,
   double *x, double *y, double *z,
   float *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s_vg,FEVAL_MULTI_UBSPLINE_3D_S_VG)
  (multi_UBspline_3d_s **spline,
   double *x, double *y, double *z,
   float *val, float *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s_vgl,FEVAL_MULTI_UBSPLINE_3D_S_VGL)
  (multi_UBspline_3d_s **spline,
   double *x, double *y, double *z,
   float *val, float *grad, float *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_s_vgh,FEVAL_MULTI_UBSPLINE_3D_S_VGH)
  (multi_UBspline_3d_s **spline,
   double *x, double *y, double *z,
   float *val, float *grad, float *hess);
//////////////////////////////
// 3D double-precision real //
//////////////////////////////
// 3D double-precision real evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d,FEVAL_MULTI_UBSPLINE_3D_D)
  (multi_UBspline_3d_d **spline,
   double *x, double *y, double *z,
   double *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d_vg,FEVAL_MULTI_UBSPLINE_3D_D_VG)
  (multi_UBspline_3d_d **spline,
   double *x, double *y, double *z,
   double *val, double *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d_vgl,FEVAL_MULTI_UBSPLINE_3D_D_VGL)
  (multi_UBspline_3d_d **spline,
   double *x, double *y, double *z,
   double *val, double *grad, double *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_d_vgh,FEVAL_MULTI_UBSPLINE_3D_D_VGH)
  (multi_UBspline_3d_d **spline,
   double *x, double *y, double *z,
   double *val, double *grad, double *hess);
/////////////////////////////////
// 3D single-precision complex //
/////////////////////////////////
// 3D single-precision complex evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c,FEVAL_MULTI_UBSPLINE_3D_C)
  (multi_UBspline_3d_c **spline,
   double *x, double *y, double *z,
   complex_float *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c_vg,FEVAL_MULTI_UBSPLINE_3D_C_VG)
  (multi_UBspline_3d_c **spline,
   double *x, double *y, double *z,
   complex_float *val, complex_float *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c_vgl,FEVAL_MULTI_UBSPLINE_3D_C_VGL)
  (multi_UBspline_3d_c **spline,
   double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_c_vgh,FEVAL_MULTI_UBSPLINE_3D_C_VGH)
  (multi_UBspline_3d_c **spline,
   double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 3D double-precision complex //
/////////////////////////////////
// 3D double-precision complex evaluation prototypes. Suffix -> outputs:
// (none) = val, _vg = val + grad, _vgl = val + grad + lapl,
// _vgh = val + grad + hess.
CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z,FEVAL_MULTI_UBSPLINE_3D_Z)
  (multi_UBspline_3d_z **spline,
   double *x, double *y, double *z,
   complex_double *val);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z_vg,FEVAL_MULTI_UBSPLINE_3D_Z_VG)
  (multi_UBspline_3d_z **spline,
   double *x, double *y, double *z,
   complex_double *val, complex_double *grad);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z_vgl,FEVAL_MULTI_UBSPLINE_3D_Z_VGL)
  (multi_UBspline_3d_z **spline,
   double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *lapl);

CFUNC void
F77_FUNC_(feval_multi_ubspline_3d_z_vgh,FEVAL_MULTI_UBSPLINE_3D_Z_VGH)
  (multi_UBspline_3d_z **spline,
   double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *hess);
#undef CFUNC
#endif

763
src/einspline/fnubspline.c Normal file
View File

@ -0,0 +1,763 @@
#include "fnubspline.h"
#include "config.h"
#include "nubspline_create.h"
#include "nubspline.h"
#ifdef __cplusplus
#define CFUNC extern "C" /* Avoid name mangling in C++ */
#else
#define CFUNC
#endif
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Grid Creation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
// By-reference wrapper around create_general_grid: the point count is
// dereferenced and the resulting grid pointer is stored through grid.
CFUNC void
F77_FUNC_(fcreate_general_grid,FCREATE_GENERAL_GRID)
  (double *points, int *num_points, NUgrid **grid)
{
  const int count = *num_points;
  NUgrid *created = create_general_grid (points, count);
  *grid = created;
}
// By-reference wrapper around create_center_grid: start, end, ratio and the
// point count are dereferenced, and the resulting grid pointer is stored
// through grid.
CFUNC void
F77_FUNC_(fcreate_center_grid,FCREATE_CENTER_GRID)
  (double *start, double *end, double *ratio,
   int *num_points, NUgrid **grid)
{
  const double first = *start;
  const double last  = *end;
  const double r     = *ratio;
  const int    count = *num_points;
  *grid = create_center_grid (first, last, r, count);
}
// By-reference wrapper around destroy_grid. The caller's handle itself is
// left untouched after the call.
CFUNC void
F77_FUNC_(fdestroy_grid,FDESTROY_GRID)
  (NUgrid **grid)
{
  NUgrid *handle = *grid;
  destroy_grid (handle);
}
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Nonuniform spline creation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
////////
// 1D //
////////
// Builds a BCtype_s from the by-reference boundary codes/values and stores
// the result of create_NUBspline_1d_s through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_1d_s,FCREATE_NUBSPLINE_1D_S)
  (NUgrid **x_grid,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   float *data, NUBspline_1d_s **spline)
{
  BCtype_s bc_x;
  NUgrid *gx = *x_grid;

  // x0 side -> l* fields, x1 side -> r* fields.
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  *spline = create_NUBspline_1d_s (gx, bc_x, data);
}
// Builds a BCtype_d from the by-reference boundary codes/values and stores
// the result of create_NUBspline_1d_d through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_1d_d,FCREATE_NUBSPLINE_1D_D)
  (NUgrid **x_grid,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   double *data, NUBspline_1d_d **spline)
{
  BCtype_d bc_x;
  NUgrid *gx = *x_grid;

  // x0 side -> l* fields, x1 side -> r* fields.
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  *spline = create_NUBspline_1d_d (gx, bc_x, data);
}
// Builds a BCtype_c by splitting each complex boundary value into its real
// and imaginary parts, then stores the result of create_NUBspline_1d_c
// through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_1d_c,FCREATE_NUBSPLINE_1D_C)
  (NUgrid **x_grid,
   int *x0_code, complex_float *x0_val,
   int *x1_code, complex_float *x1_val,
   complex_float *data, NUBspline_1d_c **spline)
{
  BCtype_c bc_x;
  NUgrid *gx = *x_grid;

  // x0 side -> l* fields.
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = crealf(*x0_val);
  bc_x.lVal_i = cimagf(*x0_val);
  // x1 side -> r* fields.
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = crealf(*x1_val);
  bc_x.rVal_i = cimagf(*x1_val);

  *spline = create_NUBspline_1d_c (gx, bc_x, data);
}
// Builds a BCtype_z by splitting each complex boundary value into its real
// and imaginary parts, then stores the result of create_NUBspline_1d_z
// through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_1d_z,FCREATE_NUBSPLINE_1D_Z)
  (NUgrid **x_grid,
   int *x0_code, complex_double *x0_val,
   int *x1_code, complex_double *x1_val,
   complex_double *data, NUBspline_1d_z **spline)
{
  BCtype_z bc_x;
  NUgrid *gx = *x_grid;

  // x0 side -> l* fields.
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = creal(*x0_val);
  bc_x.lVal_i = cimag(*x0_val);
  // x1 side -> r* fields.
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = creal(*x1_val);
  bc_x.rVal_i = cimag(*x1_val);

  *spline = create_NUBspline_1d_z (gx, bc_x, data);
}
////////
// 2D //
////////
// Builds one BCtype_s per axis from the by-reference boundary codes/values
// and stores the result of create_NUBspline_2d_s through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_2d_s,FCREATE_NUBSPLINE_2D_S)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
   float *data, NUBspline_2d_s **spline)
{
  BCtype_s bc_x, bc_y;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis.
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  *spline = create_NUBspline_2d_s (*x_grid, *y_grid, bc_x, bc_y, data);
}
// Builds one BCtype_d per axis from the by-reference boundary codes/values
// and stores the result of create_NUBspline_2d_d through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_2d_d,FCREATE_NUBSPLINE_2D_D)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
   double *data, NUBspline_2d_d **spline)
{
  BCtype_d bc_x, bc_y;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis.
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  *spline = create_NUBspline_2d_d (*x_grid, *y_grid, bc_x, bc_y, data);
}
// Builds one BCtype_c per axis, splitting each complex boundary value into
// real and imaginary parts, and stores the result of create_NUBspline_2d_c
// through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_2d_c,FCREATE_NUBSPLINE_2D_C)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, complex_float *x0_val,
   int *x1_code, complex_float *x1_val,
   int *y0_code, complex_float *y0_val,
   int *y1_code, complex_float *y1_val,
   complex_float *data, NUBspline_2d_c **spline)
{
  BCtype_c bc_x, bc_y;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = crealf(*x0_val);
  bc_x.lVal_i = cimagf(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = crealf(*x1_val);
  bc_x.rVal_i = cimagf(*x1_val);

  // y axis.
  bc_y.lCode  = (bc_code) *y0_code;
  bc_y.lVal_r = crealf(*y0_val);
  bc_y.lVal_i = cimagf(*y0_val);
  bc_y.rCode  = (bc_code) *y1_code;
  bc_y.rVal_r = crealf(*y1_val);
  bc_y.rVal_i = cimagf(*y1_val);

  *spline = create_NUBspline_2d_c (*x_grid, *y_grid, bc_x, bc_y, data);
}
// Builds one BCtype_z per axis, splitting each complex boundary value into
// real and imaginary parts, and stores the result of create_NUBspline_2d_z
// through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_2d_z,FCREATE_NUBSPLINE_2D_Z)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, complex_double *x0_val,
   int *x1_code, complex_double *x1_val,
   int *y0_code, complex_double *y0_val,
   int *y1_code, complex_double *y1_val,
   complex_double *data, NUBspline_2d_z **spline)
{
  BCtype_z bc_x, bc_y;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = creal(*x0_val);
  bc_x.lVal_i = cimag(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = creal(*x1_val);
  bc_x.rVal_i = cimag(*x1_val);

  // y axis.
  bc_y.lCode  = (bc_code) *y0_code;
  bc_y.lVal_r = creal(*y0_val);
  bc_y.lVal_i = cimag(*y0_val);
  bc_y.rCode  = (bc_code) *y1_code;
  bc_y.rVal_r = creal(*y1_val);
  bc_y.rVal_i = cimag(*y1_val);

  *spline = create_NUBspline_2d_z (*x_grid, *y_grid, bc_x, bc_y, data);
}
////////
// 3D //
////////
// Builds one BCtype_s per axis from the by-reference boundary codes/values
// and stores the result of create_NUBspline_3d_s through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_s,FCREATE_NUBSPLINE_3D_S)
  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
   int *z0_code, float *z0_val, int *z1_code, float *z1_val,
   float *data, NUBspline_3d_s **spline)
{
  BCtype_s bc_x, bc_y, bc_z;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  bc_x.lCode = (bc_code) *x0_code;
  bc_x.lVal  = *x0_val;
  bc_x.rCode = (bc_code) *x1_code;
  bc_x.rVal  = *x1_val;

  // y axis.
  bc_y.lCode = (bc_code) *y0_code;
  bc_y.lVal  = *y0_val;
  bc_y.rCode = (bc_code) *y1_code;
  bc_y.rVal  = *y1_val;

  // z axis.
  bc_z.lCode = (bc_code) *z0_code;
  bc_z.lVal  = *z0_val;
  bc_z.rCode = (bc_code) *z1_code;
  bc_z.rVal  = *z1_val;

  *spline = create_NUBspline_3d_s (*x_grid, *y_grid, *z_grid,
                                   bc_x, bc_y, bc_z, data);
}
// Builds one BCtype_d per axis from the by-reference boundary codes/values
// and stores the result of create_NUBspline_3d_d through spline.
//
// z0_val / z1_val are declared float* (kept so the definition keeps matching
// the existing prototype), but this is the double-precision routine and the
// x/y boundary values are double*. Dereferencing them as float would read
// only part of a double-precision argument, so they are read as double here.
// NOTE(review): assumes callers pass double-precision z boundary values like
// they do for x and y -- confirm, then change the parameter types to double*
// in both the prototype and this definition and drop the casts.
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_d,FCREATE_NUBSPLINE_3D_D)
  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
   int* z0_code, float *z0_val, int *z1_code, float *z1_val,
   double *data, NUBspline_3d_d **spline)
{
  BCtype_d xBC, yBC, zBC;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  xBC.lCode = (bc_code) *x0_code;
  xBC.rCode = (bc_code) *x1_code;
  xBC.lVal  = *x0_val;
  xBC.rVal  = *x1_val;

  // y axis.
  yBC.lCode = (bc_code) *y0_code;
  yBC.rCode = (bc_code) *y1_code;
  yBC.lVal  = *y0_val;
  yBC.rVal  = *y1_val;

  // z axis: read the boundary values with double width (see header comment).
  zBC.lCode = (bc_code) *z0_code;
  zBC.rCode = (bc_code) *z1_code;
  zBC.lVal  = *(const double *) z0_val;
  zBC.rVal  = *(const double *) z1_val;

  *spline = create_NUBspline_3d_d (*x_grid, *y_grid, *z_grid,
                                   xBC, yBC, zBC, data);
}
// Builds one BCtype_c per axis, splitting each complex boundary value into
// real and imaginary parts, and stores the result of create_NUBspline_3d_c
// through spline.
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_c,FCREATE_NUBSPLINE_3D_C)
  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
   int *x0_code, complex_float *x0_val,
   int *x1_code, complex_float *x1_val,
   int *y0_code, complex_float *y0_val,
   int *y1_code, complex_float *y1_val,
   int *z0_code, complex_float *z0_val,
   int *z1_code, complex_float *z1_val,
   complex_float *data, NUBspline_3d_c **spline)
{
  BCtype_c bc_x, bc_y, bc_z;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  bc_x.lCode  = (bc_code) *x0_code;
  bc_x.lVal_r = crealf(*x0_val);
  bc_x.lVal_i = cimagf(*x0_val);
  bc_x.rCode  = (bc_code) *x1_code;
  bc_x.rVal_r = crealf(*x1_val);
  bc_x.rVal_i = cimagf(*x1_val);

  // y axis.
  bc_y.lCode  = (bc_code) *y0_code;
  bc_y.lVal_r = crealf(*y0_val);
  bc_y.lVal_i = cimagf(*y0_val);
  bc_y.rCode  = (bc_code) *y1_code;
  bc_y.rVal_r = crealf(*y1_val);
  bc_y.rVal_i = cimagf(*y1_val);

  // z axis.
  bc_z.lCode  = (bc_code) *z0_code;
  bc_z.lVal_r = crealf(*z0_val);
  bc_z.lVal_i = cimagf(*z0_val);
  bc_z.rCode  = (bc_code) *z1_code;
  bc_z.rVal_r = crealf(*z1_val);
  bc_z.rVal_i = cimagf(*z1_val);

  *spline = create_NUBspline_3d_c (*x_grid, *y_grid, *z_grid,
                                   bc_x, bc_y, bc_z, data);
}
// Builds one BCtype_z per axis, splitting each complex boundary value into
// real and imaginary parts, and stores the result of create_NUBspline_3d_z
// through spline.
//
// z0_val / z1_val are declared complex_float* (kept so the exported
// signature is unchanged), but this is the double-precision complex routine
// and the x/y boundary values are complex_double*. Dereferencing them as
// complex_float would read only part of a double-complex argument, so they
// are read as complex_double here.
// NOTE(review): assumes callers pass double-complex z boundary values like
// they do for x and y -- confirm, then change the parameter types to
// complex_double* in both the prototype and this definition and drop the
// casts.
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_z,FCREATE_NUBSPLINE_3D_Z)
  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
   int *x0_code, complex_double *x0_val,
   int *x1_code, complex_double *x1_val,
   int *y0_code, complex_double *y0_val,
   int *y1_code, complex_double *y1_val,
   int *z0_code, complex_float *z0_val,
   int *z1_code, complex_float *z1_val,
   complex_double *data, NUBspline_3d_z **spline)
{
  BCtype_z xBC, yBC, zBC;
  // Views of the z boundary values with the width this routine needs.
  const complex_double *z0 = (const complex_double *) z0_val;
  const complex_double *z1 = (const complex_double *) z1_val;

  // x axis: *0 arguments -> l* fields, *1 arguments -> r* fields.
  xBC.lCode  = (bc_code) *x0_code;
  xBC.rCode  = (bc_code) *x1_code;
  xBC.lVal_r = creal(*x0_val);
  xBC.lVal_i = cimag(*x0_val);
  xBC.rVal_r = creal(*x1_val);
  xBC.rVal_i = cimag(*x1_val);

  // y axis.
  yBC.lCode  = (bc_code) *y0_code;
  yBC.rCode  = (bc_code) *y1_code;
  yBC.lVal_r = creal(*y0_val);
  yBC.lVal_i = cimag(*y0_val);
  yBC.rVal_r = creal(*y1_val);
  yBC.rVal_i = cimag(*y1_val);

  // z axis.
  zBC.lCode  = (bc_code) *z0_code;
  zBC.rCode  = (bc_code) *z1_code;
  zBC.lVal_r = creal(*z0);
  zBC.lVal_i = cimag(*z0);
  zBC.rVal_r = creal(*z1);
  zBC.rVal_i = cimag(*z1);

  *spline = create_NUBspline_3d_z (*x_grid, *y_grid, *z_grid,
                                   xBC, yBC, zBC, data);
}
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Nonuniform spline evaluation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//////////////////////////////
// 1D single-precision real //
//////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_1d_s (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_1d_s,FEVAL_NUBSPLINE_1D_S)
  (NUBspline_1d_s **spline, double *x, float *val)
{
  NUBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_s (handle, px, val);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_s_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_1d_s_vg,FEVAL_NUBSPLINE_1D_S_VG)
  (NUBspline_1d_s **spline, double *x, float *val, float *grad)
{
  NUBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_s_vg (handle, px, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_s_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_1d_s_vgl,FEVAL_NUBSPLINE_1D_S_VGL)
  (NUBspline_1d_s **spline, double *x,
   float *val, float *grad, float *lapl)
{
  NUBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_s_vgl (handle, px, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_s_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_1d_s_vgh,FEVAL_NUBSPLINE_1D_S_VGH)
  (NUBspline_1d_s **spline, double *x,
   float *val, float *grad, float *hess)
{
  NUBspline_1d_s *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_s_vgh (handle, px, val, grad, hess);
}
//////////////////////////////
// 1D double-precision real //
//////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_1d_d (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_1d_d,FEVAL_NUBSPLINE_1D_D)
  (NUBspline_1d_d **spline, double *x, double *val)
{
  NUBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_d (handle, px, val);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_d_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_1d_d_vg,FEVAL_NUBSPLINE_1D_D_VG)
  (NUBspline_1d_d **spline, double *x,
   double *val, double *grad)
{
  NUBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_d_vg (handle, px, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_d_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_1d_d_vgl,FEVAL_NUBSPLINE_1D_D_VGL)
  (NUBspline_1d_d **spline, double *x,
   double *val, double *grad, double *lapl)
{
  NUBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_d_vgl (handle, px, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_d_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_1d_d_vgh,FEVAL_NUBSPLINE_1D_D_VGH)
  (NUBspline_1d_d **spline, double *x,
   double *val, double *grad, double *hess)
{
  NUBspline_1d_d *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_d_vgh (handle, px, val, grad, hess);
}
/////////////////////////////////
// 1D single-precision complex //
/////////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_1d_c (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_1d_c,FEVAL_NUBSPLINE_1D_C)
  (NUBspline_1d_c **spline, double *x, complex_float *val)
{
  NUBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_c (handle, px, val);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_c_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_1d_c_vg,FEVAL_NUBSPLINE_1D_C_VG)
  (NUBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad)
{
  NUBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_c_vg (handle, px, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_c_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_1d_c_vgl,FEVAL_NUBSPLINE_1D_C_VGL)
  (NUBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  NUBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_c_vgl (handle, px, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_c_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_1d_c_vgh,FEVAL_NUBSPLINE_1D_C_VGH)
  (NUBspline_1d_c **spline, double *x,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  NUBspline_1d_c *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_c_vgh (handle, px, val, grad, hess);
}
/////////////////////////////////
// 1D double-precision complex //
/////////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_1d_z (output: val).
// NOTE(review): the exported name is spelled "feval_nnubspline_1d_z" (double
// "n"), unlike the other feval_nubspline_* wrappers in this file. This looks
// like a typo, but renaming changes the exported symbol -- confirm against
// the header and existing callers before fixing.
CFUNC void
F77_FUNC_(feval_nnubspline_1d_z,FEVAL_NNUBSPLINE_1D_Z)
  (NUBspline_1d_z **spline, double *x, complex_double *val)
{
  NUBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_z (handle, px, val);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_z_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_1d_z_vg,FEVAL_NUBSPLINE_1D_Z_VG)
  (NUBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad)
{
  NUBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_z_vg (handle, px, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_z_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_1d_z_vgl,FEVAL_NUBSPLINE_1D_Z_VGL)
  (NUBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  NUBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_z_vgl (handle, px, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_1d_z_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_1d_z_vgh,FEVAL_NUBSPLINE_1D_Z_VGH)
  (NUBspline_1d_z **spline, double *x,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  NUBspline_1d_z *handle = *spline;
  const double px = *x;
  eval_NUBspline_1d_z_vgh (handle, px, val, grad, hess);
}
//////////////////////////////
// 2D single-precision real //
//////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_2d_s (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_2d_s,FEVAL_NUBSPLINE_2D_S)
  (NUBspline_2d_s **spline, double *x, double *y, float *val)
{
  NUBspline_2d_s *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_s (handle, px, py, val);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_s_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_2d_s_vg,FEVAL_NUBSPLINE_2D_S_VG)
  (NUBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad)
{
  NUBspline_2d_s *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_s_vg (handle, px, py, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_s_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_2d_s_vgl,FEVAL_NUBSPLINE_2D_S_VGL)
  (NUBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float *lapl)
{
  NUBspline_2d_s *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_s_vgl (handle, px, py, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_s_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_2d_s_vgh,FEVAL_NUBSPLINE_2D_S_VGH)
  (NUBspline_2d_s **spline, double *x, double *y,
   float *val, float *grad, float *hess)
{
  NUBspline_2d_s *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_s_vgh (handle, px, py, val, grad, hess);
}
//////////////////////////////
// 2D double-precision real //
//////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_2d_d (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_2d_d,FEVAL_NUBSPLINE_2D_D)
  (NUBspline_2d_d **spline, double *x, double *y, double *val)
{
  NUBspline_2d_d *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_d (handle, px, py, val);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_d_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_2d_d_vg,FEVAL_NUBSPLINE_2D_D_VG)
  (NUBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad)
{
  NUBspline_2d_d *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_d_vg (handle, px, py, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_d_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_2d_d_vgl,FEVAL_NUBSPLINE_2D_D_VGL)
  (NUBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *lapl)
{
  NUBspline_2d_d *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_d_vgl (handle, px, py, val, grad, lapl);
}
// By-reference wrapper for the value + gradient + Hessian evaluation of a
// 2D double-precision real nonuniform spline (outputs: val, grad, hess).
//
// Forwards to eval_NUBspline_2d_d_vgh. This wrapper previously called the
// _vgl routine with the hess buffer, unlike every other *_vgh wrapper in
// this file, which forwards to its matching *_vgh evaluator.
CFUNC void
F77_FUNC_(feval_nubspline_2d_d_vgh,FEVAL_NUBSPLINE_2D_D_VGH)
  (NUBspline_2d_d **spline, double *x, double *y,
   double *val, double *grad, double *hess)
{
  eval_NUBspline_2d_d_vgh (*spline, *x, *y, val, grad, hess);
}
/////////////////////////////////
// 2D single-precision complex //
/////////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_2d_c (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_2d_c,FEVAL_NUBSPLINE_2D_C)
  (NUBspline_2d_c **spline, double *x, double *y, complex_float *val)
{
  NUBspline_2d_c *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_c (handle, px, py, val);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_c_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_2d_c_vg,FEVAL_NUBSPLINE_2D_C_VG)
  (NUBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad)
{
  NUBspline_2d_c *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_c_vg (handle, px, py, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_c_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_2d_c_vgl,FEVAL_NUBSPLINE_2D_C_VGL)
  (NUBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  NUBspline_2d_c *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_c_vgl (handle, px, py, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_c_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_2d_c_vgh,FEVAL_NUBSPLINE_2D_C_VGH)
  (NUBspline_2d_c **spline, double *x, double *y,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  NUBspline_2d_c *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_c_vgh (handle, px, py, val, grad, hess);
}
/////////////////////////////////
// 2D double-precision complex //
/////////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_2d_z (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_2d_z,FEVAL_NUBSPLINE_2D_Z)
  (NUBspline_2d_z **spline, double *x, double *y, complex_double *val)
{
  NUBspline_2d_z *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_z (handle, px, py, val);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_z_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_2d_z_vg,FEVAL_NUBSPLINE_2D_Z_VG)
  (NUBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad)
{
  NUBspline_2d_z *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_z_vg (handle, px, py, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_z_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_2d_z_vgl,FEVAL_NUBSPLINE_2D_Z_VGL)
  (NUBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  NUBspline_2d_z *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_z_vgl (handle, px, py, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_2d_z_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_2d_z_vgh,FEVAL_NUBSPLINE_2D_Z_VGH)
  (NUBspline_2d_z **spline, double *x, double *y,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  NUBspline_2d_z *handle = *spline;
  const double px = *x, py = *y;
  eval_NUBspline_2d_z_vgh (handle, px, py, val, grad, hess);
}
//////////////////////////////
// 3D single-precision real //
//////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_3d_s (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_3d_s,FEVAL_NUBSPLINE_3D_S)
  (NUBspline_3d_s **spline, double *x, double *y, double *z,
   float *val)
{
  NUBspline_3d_s *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_s (handle, px, py, pz, val);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_s_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_3d_s_vg,FEVAL_NUBSPLINE_3D_S_VG)
  (NUBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad)
{
  NUBspline_3d_s *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_s_vg (handle, px, py, pz, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_s_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_3d_s_vgl,FEVAL_NUBSPLINE_3D_S_VGL)
  (NUBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad, float *lapl)
{
  NUBspline_3d_s *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_s_vgl (handle, px, py, pz, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_s_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_3d_s_vgh,FEVAL_NUBSPLINE_3D_S_VGH)
  (NUBspline_3d_s **spline, double *x, double *y, double *z,
   float *val, float *grad, float *hess)
{
  NUBspline_3d_s *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_s_vgh (handle, px, py, pz, val, grad, hess);
}
//////////////////////////////
// 3D double-precision real //
//////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_3d_d (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_3d_d,FEVAL_NUBSPLINE_3D_D)
  (NUBspline_3d_d **spline, double *x, double *y, double *z,
   double *val)
{
  NUBspline_3d_d *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_d (handle, px, py, pz, val);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_d_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_3d_d_vg,FEVAL_NUBSPLINE_3D_D_VG)
  (NUBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad)
{
  NUBspline_3d_d *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_d_vg (handle, px, py, pz, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_d_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_3d_d_vgl,FEVAL_NUBSPLINE_3D_D_VGL)
  (NUBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad, double *lapl)
{
  NUBspline_3d_d *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_d_vgl (handle, px, py, pz, val, grad, lapl);
}
// By-reference wrapper for the value + gradient + Hessian evaluation of a
// 3D double-precision real nonuniform spline (outputs: val, grad, hess).
//
// Forwards to eval_NUBspline_3d_d_vgh. This wrapper previously called the
// _vgl routine with the hess buffer, unlike every other *_vgh wrapper in
// this file, which forwards to its matching *_vgh evaluator.
CFUNC void
F77_FUNC_(feval_nubspline_3d_d_vgh,FEVAL_NUBSPLINE_3D_D_VGH)
  (NUBspline_3d_d **spline, double *x, double *y, double *z,
   double *val, double *grad, double *hess)
{
  eval_NUBspline_3d_d_vgh (*spline, *x, *y, *z, val, grad, hess);
}
/////////////////////////////////
// 3D single-precision complex //
/////////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_3d_c (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_3d_c,FEVAL_NUBSPLINE_3D_C)
  (NUBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val)
{
  NUBspline_3d_c *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_c (handle, px, py, pz, val);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_c_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_3d_c_vg,FEVAL_NUBSPLINE_3D_C_VG)
  (NUBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad)
{
  NUBspline_3d_c *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_c_vg (handle, px, py, pz, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_c_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_3d_c_vgl,FEVAL_NUBSPLINE_3D_C_VGL)
  (NUBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *lapl)
{
  NUBspline_3d_c *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_c_vgl (handle, px, py, pz, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_c_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_3d_c_vgh,FEVAL_NUBSPLINE_3D_C_VGH)
  (NUBspline_3d_c **spline, double *x, double *y, double *z,
   complex_float *val, complex_float *grad, complex_float *hess)
{
  NUBspline_3d_c *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_c_vgh (handle, px, py, pz, val, grad, hess);
}
/////////////////////////////////
// 3D double-precision complex //
/////////////////////////////////
// By-reference wrapper: forwards to eval_NUBspline_3d_z (output: val).
CFUNC void
F77_FUNC_(feval_nubspline_3d_z,FEVAL_NUBSPLINE_3D_Z)
  (NUBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val)
{
  NUBspline_3d_z *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_z (handle, px, py, pz, val);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_z_vg
// (outputs: val, grad).
CFUNC void
F77_FUNC_(feval_nubspline_3d_z_vg,FEVAL_NUBSPLINE_3D_Z_VG)
  (NUBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad)
{
  NUBspline_3d_z *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_z_vg (handle, px, py, pz, val, grad);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_z_vgl
// (outputs: val, grad, lapl).
CFUNC void
F77_FUNC_(feval_nubspline_3d_z_vgl,FEVAL_NUBSPLINE_3D_Z_VGL)
  (NUBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *lapl)
{
  NUBspline_3d_z *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_z_vgl (handle, px, py, pz, val, grad, lapl);
}
// By-reference wrapper: forwards to eval_NUBspline_3d_z_vgh
// (outputs: val, grad, hess).
CFUNC void
F77_FUNC_(feval_nubspline_3d_z_vgh,FEVAL_NUBSPLINE_3D_Z_VGH)
  (NUBspline_3d_z **spline, double *x, double *y, double *z,
   complex_double *val, complex_double *grad, complex_double *hess)
{
  NUBspline_3d_z *handle = *spline;
  const double px = *x, py = *y, pz = *z;
  eval_NUBspline_3d_z_vgh (handle, px, py, pz, val, grad, hess);
}

418
src/einspline/fnubspline.h Normal file
View File

@ -0,0 +1,418 @@
#ifndef F_NUBSPLINE_H
#define F_NUBSPLINE_H
#include "config.h"
#include "nugrid.h"
#include "nubspline_structs.h"
#ifdef __cplusplus
#define CFUNC extern "C" /* Avoid name mangling in C++ */
#else
#define CFUNC
#endif
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Grid Creation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
// Grid prototypes. Every argument is a pointer; the two creation routines
// return the new grid through their last argument (NUgrid **grid).
CFUNC void
F77_FUNC_(fcreate_general_grid,FCREATE_GENERAL_GRID)
  (double *points,
   int *num_points,
   NUgrid **grid);

CFUNC void
F77_FUNC_(fcreate_center_grid,FCREATE_CENTER_GRID)
  (double *start, double *end, double *ratio,
   int *num_points,
   NUgrid **grid);

CFUNC void
F77_FUNC_(fdestroy_grid,FDESTROY_GRID)
  (NUgrid **grid);
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Nonuniform spline creation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
////////
// 1D //
////////
// 1D nonuniform spline creation prototypes, one per data type
// (_s float, _d double, _c complex_float, _z complex_double).
// Argument order: grid handle, (code, value) pair for x0 and for x1,
// data, then the output spline handle.
CFUNC void
F77_FUNC_(fcreate_nubspline_1d_s,FCREATE_NUBSPLINE_1D_S)
  (NUgrid **x_grid,
   int *x0_code, float *x0_val,
   int *x1_code, float *x1_val,
   float *data, NUBspline_1d_s **spline);

CFUNC void
F77_FUNC_(fcreate_nubspline_1d_d,FCREATE_NUBSPLINE_1D_D)
  (NUgrid **x_grid,
   int *x0_code, double *x0_val,
   int *x1_code, double *x1_val,
   double *data, NUBspline_1d_d **spline);

CFUNC void
F77_FUNC_(fcreate_nubspline_1d_c,FCREATE_NUBSPLINE_1D_C)
  (NUgrid **x_grid,
   int *x0_code, complex_float *x0_val,
   int *x1_code, complex_float *x1_val,
   complex_float *data, NUBspline_1d_c **spline);

CFUNC void
F77_FUNC_(fcreate_nubspline_1d_z,FCREATE_NUBSPLINE_1D_Z)
  (NUgrid **x_grid,
   int *x0_code, complex_double *x0_val,
   int *x1_code, complex_double *x1_val,
   complex_double *data, NUBspline_1d_z **spline);
////////
// 2D //
////////
// 2D nonuniform spline creation prototypes, one per data type
// (_s float, _d double, _c complex_float, _z complex_double).
// Argument order: x and y grid handles, (code, value) pairs for
// x0, x1, y0, y1, data, then the output spline handle.
CFUNC void
F77_FUNC_(fcreate_nubspline_2d_s,FCREATE_NUBSPLINE_2D_S)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, float *x0_val,
   int *x1_code, float *x1_val,
   int *y0_code, float *y0_val,
   int *y1_code, float *y1_val,
   float *data, NUBspline_2d_s **spline);

CFUNC void
F77_FUNC_(fcreate_nubspline_2d_d,FCREATE_NUBSPLINE_2D_D)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, double *x0_val,
   int *x1_code, double *x1_val,
   int *y0_code, double *y0_val,
   int *y1_code, double *y1_val,
   double *data, NUBspline_2d_d **spline);

CFUNC void
F77_FUNC_(fcreate_nubspline_2d_c,FCREATE_NUBSPLINE_2D_C)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, complex_float *x0_val,
   int *x1_code, complex_float *x1_val,
   int *y0_code, complex_float *y0_val,
   int *y1_code, complex_float *y1_val,
   complex_float *data, NUBspline_2d_c **spline);

CFUNC void
F77_FUNC_(fcreate_nubspline_2d_z,FCREATE_NUBSPLINE_2D_Z)
  (NUgrid **x_grid, NUgrid **y_grid,
   int *x0_code, complex_double *x0_val,
   int *x1_code, complex_double *x1_val,
   int *y0_code, complex_double *y0_val,
   int *y1_code, complex_double *y1_val,
   complex_double *data, NUBspline_2d_z **spline);
////////
// 3D //
////////
// 3D nonuniform spline creation, one entry point per data type
// (suffix _s: float, _d: double, _c: complex_float, _z: complex_double).
// Arguments, all passed by pointer:
//   x_grid, y_grid, z_grid  handles of the NUgrids for each direction
//   {x,y,z}0_code/_val      boundary-condition code and value, first end
//   {x,y,z}1_code/_val      boundary-condition code and value, second end
//   data                    sample values, in the element type of the
//                           variant (storage order is not visible here)
//   spline                  handle through which the spline is exchanged
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_s,FCREATE_NUBSPLINE_3D_S)
(NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
int* x0_code, float *x0_val, int *x1_code, float *x1_val,
int* y0_code, float *y0_val, int *y1_code, float *y1_val,
int* z0_code, float *z0_val, int *z1_code, float *z1_val,
float *data, NUBspline_3d_s **spline);
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_d,FCREATE_NUBSPLINE_3D_D)
(NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
int *x0_code, double *x0_val, int *x1_code, double *x1_val,
int *y0_code, double *y0_val, int *y1_code, double *y1_val,
// NOTE(review): the z boundary values are declared float* while the x/y
// values and the data of this double-precision variant are double*.  This
// looks like a copy/paste slip from the _s variant -- verify against the
// definition before relying on this prototype.
int* z0_code, float *z0_val, int *z1_code, float *z1_val,
double *data, NUBspline_3d_d **spline);
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_c,FCREATE_NUBSPLINE_3D_C)
(NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
int *x0_code, complex_float *x0_val,
int *x1_code, complex_float *x1_val,
int *y0_code, complex_float *y0_val,
int *y1_code, complex_float *y1_val,
int *z0_code, complex_float *z0_val,
int *z1_code, complex_float *z1_val,
complex_float *data, NUBspline_3d_c **spline);
CFUNC void
F77_FUNC_(fcreate_nubspline_3d_z,FCREATE_NUBSPLINE_3D_Z)
(NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
int *x0_code, complex_double *x0_val,
int *x1_code, complex_double *x1_val,
int *y0_code, complex_double *y0_val,
int *y1_code, complex_double *y1_val,
// NOTE(review): the z boundary values are declared complex_float* while
// the x/y values and the data of this variant are complex_double*.  This
// looks like a copy/paste slip from the _c variant -- verify against the
// definition before relying on this prototype.
int *z0_code, complex_float *z0_val,
int *z1_code, complex_float *z1_val,
complex_double *data, NUBspline_3d_z **spline);
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//// Nonuniform spline evaluation routines ////
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//////////////////////////////
// 1D single-precision real //
//////////////////////////////
// 1D evaluation entry points.  For every data type there are four variants,
// distinguished by the outputs they take:
//   (no suffix)  val
//   _vg          val, grad
//   _vgl         val, grad, lapl
//   _vgh         val, grad, hess
// The evaluation coordinate `x` is always double*, independent of the
// spline's element type; outputs use the element type of the variant.
// `spline` is the handle filled in by the matching creation routine.
CFUNC void
F77_FUNC_(feval_nubspline_1d_s,FEVAL_NUBSPLINE_1D_S)
(NUBspline_1d_s **spline, double *x, float *val);
CFUNC void
F77_FUNC_(feval_nubspline_1d_s_vg,FEVAL_NUBSPLINE_1D_S_VG)
(NUBspline_1d_s **spline, double *x, float *val, float *grad);
CFUNC void
F77_FUNC_(feval_nubspline_1d_s_vgl,FEVAL_NUBSPLINE_1D_S_VGL)
(NUBspline_1d_s **spline, double *x,
float *val, float *grad, float *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_1d_s_vgh,FEVAL_NUBSPLINE_1D_S_VGH)
(NUBspline_1d_s **spline, double *x,
float *val, float *grad, float *hess);
//////////////////////////////
// 1D double-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_1d_d,FEVAL_NUBSPLINE_1D_D)
(NUBspline_1d_d **spline, double *x, double *val);
CFUNC void
F77_FUNC_(feval_nubspline_1d_d_vg,FEVAL_NUBSPLINE_1D_D_VG)
(NUBspline_1d_d **spline, double *x,
double *val, double *grad);
CFUNC void
F77_FUNC_(feval_nubspline_1d_d_vgl,FEVAL_NUBSPLINE_1D_D_VGL)
(NUBspline_1d_d **spline, double *x,
double *val, double *grad, double *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_1d_d_vgh,FEVAL_NUBSPLINE_1D_D_VGH)
(NUBspline_1d_d **spline, double *x,
double *val, double *grad, double *hess);
/////////////////////////////////
// 1D single-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_1d_c,FEVAL_NUBSPLINE_1D_C)
(NUBspline_1d_c **spline, double *x, complex_float *val);
CFUNC void
F77_FUNC_(feval_nubspline_1d_c_vg,FEVAL_NUBSPLINE_1D_C_VG)
(NUBspline_1d_c **spline, double *x,
complex_float *val, complex_float *grad);
CFUNC void
F77_FUNC_(feval_nubspline_1d_c_vgl,FEVAL_NUBSPLINE_1D_C_VGL)
(NUBspline_1d_c **spline, double *x,
complex_float *val, complex_float *grad, complex_float *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_1d_c_vgh,FEVAL_NUBSPLINE_1D_C_VGH)
(NUBspline_1d_c **spline, double *x,
complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 1D double-precision complex //
/////////////////////////////////
// NOTE(review): this is the only symbol spelled "nnubspline" (double n);
// every sibling uses "nubspline".  Probably a typo -- check that it matches
// the name used by the definition and by Fortran callers before changing it.
CFUNC void
F77_FUNC_(feval_nnubspline_1d_z,FEVAL_NNUBSPLINE_1D_Z)
(NUBspline_1d_z **spline, double *x, complex_double *val);
CFUNC void
F77_FUNC_(feval_nubspline_1d_z_vg,FEVAL_NUBSPLINE_1D_Z_VG)
(NUBspline_1d_z **spline, double *x,
complex_double *val, complex_double *grad);
CFUNC void
F77_FUNC_(feval_nubspline_1d_z_vgl,FEVAL_NUBSPLINE_1D_Z_VGL)
(NUBspline_1d_z **spline, double *x,
complex_double *val, complex_double *grad, complex_double *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_1d_z_vgh,FEVAL_NUBSPLINE_1D_Z_VGH)
(NUBspline_1d_z **spline, double *x,
complex_double *val, complex_double *grad, complex_double *hess);
//////////////////////////////
// 2D single-precision real //
//////////////////////////////
// 2D evaluation entry points.  Same scheme as the 1D routines: for each data
// type the plain, _vg, _vgl and _vgh variants take val; val+grad;
// val+grad+lapl; val+grad+hess output pointers respectively.  The
// coordinates `x` and `y` are always double*.  The number of elements the
// callee stores through `grad` and `hess` is not visible in this header.
CFUNC void
F77_FUNC_(feval_nubspline_2d_s,FEVAL_NUBSPLINE_2D_S)
(NUBspline_2d_s **spline, double *x, double *y, float *val);
CFUNC void
F77_FUNC_(feval_nubspline_2d_s_vg,FEVAL_NUBSPLINE_2D_S_VG)
(NUBspline_2d_s **spline, double *x, double *y,
float *val, float *grad);
CFUNC void
F77_FUNC_(feval_nubspline_2d_s_vgl,FEVAL_NUBSPLINE_2D_S_VGL)
(NUBspline_2d_s **spline, double *x, double *y,
float *val, float *grad, float* lapl);
CFUNC void
F77_FUNC_(feval_nubspline_2d_s_vgh,FEVAL_NUBSPLINE_2D_S_VGH)
(NUBspline_2d_s **spline, double *x, double *y,
float *val, float *grad, float *hess);
//////////////////////////////
// 2D double-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_2d_d,FEVAL_NUBSPLINE_2D_D)
(NUBspline_2d_d **spline, double *x, double *y, double *val);
CFUNC void
F77_FUNC_(feval_nubspline_2d_d_vg,FEVAL_NUBSPLINE_2D_D_VG)
(NUBspline_2d_d **spline, double *x, double *y,
double *val, double *grad);
CFUNC void
F77_FUNC_(feval_nubspline_2d_d_vgl,FEVAL_NUBSPLINE_2D_D_VGL)
(NUBspline_2d_d **spline, double *x, double *y,
double *val, double *grad, double *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_2d_d_vgh,FEVAL_NUBSPLINE_2D_D_VGH)
(NUBspline_2d_d **spline, double *x, double *y,
double *val, double *grad, double *hess);
/////////////////////////////////
// 2D single-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_2d_c,FEVAL_NUBSPLINE_2D_C)
(NUBspline_2d_c **spline, double *x, double *y, complex_float *val);
CFUNC void
F77_FUNC_(feval_nubspline_2d_c_vg,FEVAL_NUBSPLINE_2D_C_VG)
(NUBspline_2d_c **spline, double *x, double *y,
complex_float *val, complex_float *grad);
CFUNC void
F77_FUNC_(feval_nubspline_2d_c_vgl,FEVAL_NUBSPLINE_2D_C_VGL)
(NUBspline_2d_c **spline, double *x, double *y,
complex_float *val, complex_float *grad, complex_float *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_2d_c_vgh,FEVAL_NUBSPLINE_2D_C_VGH)
(NUBspline_2d_c **spline, double *x, double *y,
complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 2D double-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_2d_z,FEVAL_NUBSPLINE_2D_Z)
(NUBspline_2d_z **spline, double *x, double *y, complex_double *val);
CFUNC void
F77_FUNC_(feval_nubspline_2d_z_vg,FEVAL_NUBSPLINE_2D_Z_VG)
(NUBspline_2d_z **spline, double *x, double *y,
complex_double *val, complex_double *grad);
CFUNC void
F77_FUNC_(feval_nubspline_2d_z_vgl,FEVAL_NUBSPLINE_2D_Z_VGL)
(NUBspline_2d_z **spline, double *x, double *y,
complex_double *val, complex_double *grad, complex_double *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_2d_z_vgh,FEVAL_NUBSPLINE_2D_Z_VGH)
(NUBspline_2d_z **spline, double *x, double *y,
complex_double *val, complex_double *grad, complex_double *hess);
//////////////////////////////
// 3D single-precision real //
//////////////////////////////
// 3D evaluation entry points.  Same scheme as the 1D and 2D routines: for
// each data type the plain, _vg, _vgl and _vgh variants take val; val+grad;
// val+grad+lapl; val+grad+hess output pointers respectively.  The
// coordinates `x`, `y` and `z` are always double*.  The number of elements
// the callee stores through `grad` and `hess` is not visible in this header.
CFUNC void
F77_FUNC_(feval_nubspline_3d_s,FEVAL_NUBSPLINE_3D_S)
(NUBspline_3d_s **spline, double *x, double *y, double *z,
float *val);
CFUNC void
F77_FUNC_(feval_nubspline_3d_s_vg,FEVAL_NUBSPLINE_3D_S_VG)
(NUBspline_3d_s **spline, double *x, double *y, double *z,
float *val, float *grad);
CFUNC void
F77_FUNC_(feval_nubspline_3d_s_vgl,FEVAL_NUBSPLINE_3D_S_VGL)
(NUBspline_3d_s **spline, double *x, double *y, double *z,
float *val, float *grad, float* lapl);
CFUNC void
F77_FUNC_(feval_nubspline_3d_s_vgh,FEVAL_NUBSPLINE_3D_S_VGH)
(NUBspline_3d_s **spline, double *x, double *y, double *z,
float *val, float *grad, float *hess);
//////////////////////////////
// 3D double-precision real //
//////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_3d_d,FEVAL_NUBSPLINE_3D_D)
(NUBspline_3d_d **spline, double *x, double *y, double *z,
double *val);
CFUNC void
F77_FUNC_(feval_nubspline_3d_d_vg,FEVAL_NUBSPLINE_3D_D_VG)
(NUBspline_3d_d **spline, double *x, double *y, double *z,
double *val, double *grad);
CFUNC void
F77_FUNC_(feval_nubspline_3d_d_vgl,FEVAL_NUBSPLINE_3D_D_VGL)
(NUBspline_3d_d **spline, double *x, double *y, double *z,
double *val, double *grad, double *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_3d_d_vgh,FEVAL_NUBSPLINE_3D_D_VGH)
(NUBspline_3d_d **spline, double *x, double *y, double *z,
double *val, double *grad, double *hess);
/////////////////////////////////
// 3D single-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_3d_c,FEVAL_NUBSPLINE_3D_C)
(NUBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val);
CFUNC void
F77_FUNC_(feval_nubspline_3d_c_vg,FEVAL_NUBSPLINE_3D_C_VG)
(NUBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val, complex_float *grad);
CFUNC void
F77_FUNC_(feval_nubspline_3d_c_vgl,FEVAL_NUBSPLINE_3D_C_VGL)
(NUBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val, complex_float *grad, complex_float *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_3d_c_vgh,FEVAL_NUBSPLINE_3D_C_VGH)
(NUBspline_3d_c **spline, double *x, double *y, double *z,
complex_float *val, complex_float *grad, complex_float *hess);
/////////////////////////////////
// 3D double-precision complex //
/////////////////////////////////
CFUNC void
F77_FUNC_(feval_nubspline_3d_z,FEVAL_NUBSPLINE_3D_Z)
(NUBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val);
CFUNC void
F77_FUNC_(feval_nubspline_3d_z_vg,FEVAL_NUBSPLINE_3D_Z_VG)
(NUBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val, complex_double *grad);
CFUNC void
F77_FUNC_(feval_nubspline_3d_z_vgl,FEVAL_NUBSPLINE_3D_Z_VGL)
(NUBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val, complex_double *grad, complex_double *lapl);
CFUNC void
F77_FUNC_(feval_nubspline_3d_z_vgh,FEVAL_NUBSPLINE_3D_Z_VGH)
(NUBspline_3d_z **spline, double *x, double *y, double *z,
complex_double *val, complex_double *grad, complex_double *hess);
#endif

View File

@ -0,0 +1,40 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef MULTI_BSPLINE_H
#define MULTI_BSPLINE_H
#include "bspline_base.h"
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
//// Bspline structure definitions ////
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
#include "multi_bspline_structs.h"
// Currently, some of the single-precision routines use SSE2 instructions
#include "multi_bspline_eval_s.h"
#include "multi_bspline_eval_c.h"
#include "multi_bspline_eval_d.h"
#include "multi_bspline_eval_z.h"
#include "bspline_create.h"
#include "multi_bspline_create.h"
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,177 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef MULTI_BSPLINE_CREATE_H
#define MULTI_BSPLINE_CREATE_H
#include "bspline_base.h"
#include "multi_bspline_structs.h"
#ifdef __cplusplus
extern "C" {
#endif
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
//// Spline creation functions ////
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
/////////////////////////////////////
// Uniform, single precision, real //
/////////////////////////////////////
// Create a 1D uniform single-precision, real multi-spline.
//   x_grid       uniform grid description, passed by value
//   xBC          boundary conditions for the x direction
//   num_splines  number of splines the returned object holds
// The create_* routines take no sample data; data is supplied per spline
// through the set_* routines below.
multi_UBspline_1d_s *
create_multi_UBspline_1d_s (Ugrid x_grid, BCtype_s xBC, int num_splines);
// Create a 2D uniform single-precision, real multi-spline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_2d_s *
create_multi_UBspline_2d_s (Ugrid x_grid, Ugrid y_grid,
BCtype_s xBC, BCtype_s yBC,
int num_splines);
// Create a 3D uniform single-precision, real multi-spline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_3d_s *
create_multi_UBspline_3d_s (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_s xBC, BCtype_s yBC, BCtype_s zBC,
int num_splines);
// Set the data for the splines, and compute spline coefficients.
//   spline      object returned by the matching create_* routine
//   spline_num  index of the spline inside the multi-spline to fill
//               (presumably 0 <= spline_num < num_splines -- confirm)
//   data        sample values for that spline
void
set_multi_UBspline_1d_s (multi_UBspline_1d_s *spline,
int spline_num, float *data);
void
set_multi_UBspline_2d_s (multi_UBspline_2d_s *spline,
int spline_num, float *data);
void
set_multi_UBspline_3d_s (multi_UBspline_3d_s *spline,
int spline_num, float *data);
/////////////////////////////////////
// Uniform, double precision, real //
/////////////////////////////////////
// Create a 1D uniform double-precision, real multi-spline.
//   x_grid       uniform grid description, passed by value
//   xBC          boundary conditions for the x direction
//   num_splines  number of splines the returned object holds
multi_UBspline_1d_d *
create_multi_UBspline_1d_d (Ugrid x_grid, BCtype_d xBC, int num_splines);
// Create a 2D uniform double-precision, real multi-spline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_2d_d *
create_multi_UBspline_2d_d (Ugrid x_grid, Ugrid y_grid,
BCtype_d xBC, BCtype_d yBC,
int num_splines);
// Create a 3D uniform double-precision, real multi-spline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_3d_d *
create_multi_UBspline_3d_d (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_d xBC, BCtype_d yBC, BCtype_d zBC,
int num_splines);
// Set the data for the splines, and compute spline coefficients.
//   spline      object returned by the matching create_* routine
//   spline_num  index of the spline inside the multi-spline to fill
//   data        sample values for that spline
void
set_multi_UBspline_1d_d (multi_UBspline_1d_d *spline,
int spline_num, double *data);
// 1D variant that additionally takes a boundary-condition descriptor;
// presumably it is used for this spline instead of the one given at
// creation time -- confirm in the implementation.
void
set_multi_UBspline_1d_d_BC (multi_UBspline_1d_d *spline,
int spline_num, double *data, BCtype_d xBC);
void
set_multi_UBspline_2d_d (multi_UBspline_2d_d *spline,
int spline_num, double *data);
void
set_multi_UBspline_3d_d (multi_UBspline_3d_d *spline,
int spline_num, double *data);
///////////////////////////////////////
// Uniform, single precision, complex//
///////////////////////////////////////
// Create a 1D uniform single-precision, complex multi-spline.
//   x_grid       uniform grid description, passed by value
//   xBC          boundary conditions for the x direction
//   num_splines  number of splines the returned object holds
multi_UBspline_1d_c *
create_multi_UBspline_1d_c (Ugrid x_grid, BCtype_c xBC, int num_splines);
// Create a 2D uniform single-precision, complex multi-spline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_2d_c *
create_multi_UBspline_2d_c (Ugrid x_grid, Ugrid y_grid,
BCtype_c xBC, BCtype_c yBC,
int num_splines);
// Create a 3D uniform single-precision, complex multi-spline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_3d_c *
create_multi_UBspline_3d_c (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_c xBC, BCtype_c yBC, BCtype_c zBC,
int num_splines);
// Set the data for the splines, and compute spline coefficients.
//   spline      object returned by the matching create_* routine
//   spline_num  index of the spline inside the multi-spline to fill
//   data        complex sample values for that spline
void
set_multi_UBspline_1d_c (multi_UBspline_1d_c *spline, int spline_num,
complex_float *data);
void
set_multi_UBspline_2d_c (multi_UBspline_2d_c *spline, int spline_num,
complex_float *data);
void
set_multi_UBspline_3d_c (multi_UBspline_3d_c *spline, int spline_num,
complex_float *data);
///////////////////////////////////////
// Uniform, double precision, complex//
///////////////////////////////////////
// Create 1D uniform double-precision, complex Bspline
//   x_grid       uniform grid description, passed by value
//   xBC          boundary conditions for the x direction
//   num_splines  number of splines the returned object holds
multi_UBspline_1d_z *
create_multi_UBspline_1d_z (Ugrid x_grid, BCtype_z xBC, int num_splines);
// Create 2D uniform double-precision, complex Bspline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_2d_z *
create_multi_UBspline_2d_z (Ugrid x_grid, Ugrid y_grid,
BCtype_z xBC, BCtype_z yBC,
int num_splines);
// Create 3D uniform double-precision, complex Bspline
// (one grid and one boundary-condition descriptor per direction).
multi_UBspline_3d_z *
create_multi_UBspline_3d_z (Ugrid x_grid, Ugrid y_grid, Ugrid z_grid,
BCtype_z xBC, BCtype_z yBC, BCtype_z zBC,
int num_splines);
// Set the data for the splines, and compute spline coefficients
//   spline      object returned by the matching create_* routine
//   spline_num  index of the spline inside the multi-spline to fill
//   data        complex sample values for that spline
void
set_multi_UBspline_1d_z (multi_UBspline_1d_z *spline, int spline_num,
complex_double *data);
// 1D variant that additionally takes a boundary-condition descriptor;
// presumably it is used for this spline instead of the one given at
// creation time -- confirm in the implementation.
void
set_multi_UBspline_1d_z_BC (multi_UBspline_1d_z *spline, int spline_num,
complex_double *data, BCtype_z xBC);
void
set_multi_UBspline_2d_z (multi_UBspline_2d_z *spline, int spline_num,
complex_double *data);
void
set_multi_UBspline_3d_z (multi_UBspline_3d_z *spline, int spline_num,
complex_double *data);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,735 @@
#include <stdio.h>
#include "multi_bspline.h"
#include "multi_bspline_structs_cuda.h"
// Device-side 48-entry double table.  It is not written by any of the
// create_* routines visible in this part of the file; presumably the
// double-precision implementation headers below fill and read it -- confirm.
__device__ double Bcuda[48];
// Constant-memory 48-entry float table.  Every create_*_cuda routine in this
// file uploads its local A_h[48] array into this symbol with
// cudaMemcpyToSymbol before building the device spline.
__constant__ float Acuda[48];
// Implementation headers, one per element type (s, c, d, z).  They are
// included after the two symbols above, so they can refer to them.
#include "multi_bspline_cuda_s_impl.h"
#include "multi_bspline_cuda_c_impl.h"
#include "multi_bspline_cuda_d_impl.h"
#include "multi_bspline_cuda_z_impl.h"
// The per-grid-point spline count is rounded up to a multiple of this value
// when the device stride is computed in the create_*_cuda routines.  The
// identifier is misspelled ("coalesced") but is kept as is because it may be
// referenced from the included implementation headers.
#define COALLESCED_SIZE 16
// Build a device-side mirror of a 1D single-precision real multi-spline.
//
// Steps: upload the 48-entry basis table to the Acuda constant symbol,
// allocate a host descriptor, allocate device storage for the coefficients
// and copy the host coefficients into it.  On the device each grid point owns
// `stride` consecutive floats, where stride is num_splines rounded up to a
// multiple of COALLESCED_SIZE; the padding entries are zero-filled so that no
// uninitialized host memory is sent to the GPU.
//
// spline : host spline to mirror (only read).
// Returns a malloc'ed descriptor whose `coefs` member is device memory from
// cudaMalloc.  Any host or device allocation/copy failure prints a message
// to stderr and aborts, as the *_conv siblings in this file already do.
extern "C" multi_UBspline_1d_s_cuda*
create_multi_UBspline_1d_s_cuda (multi_UBspline_1d_s* spline)
{
  // Rows 0-3 hold the cubic basis polynomials (coefficients of t^3..t^0),
  // rows 4-7 their first derivatives and rows 8-11 their second derivatives.
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaError_t err =
    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy A matrix to GPU constant memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }

  multi_UBspline_1d_s_cuda *cuda_spline =
    (multi_UBspline_1d_s_cuda*) malloc (sizeof (multi_UBspline_1d_s_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
  cuda_spline->stride = N;
  cuda_spline->gridInv = spline->x_grid.delta_inv;
  cuda_spline->dim = spline->x_grid.num;

  // Widen before multiplying so the byte count is never computed in int.
  size_t size = (size_t)Nx*N*sizeof(float);
  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients. Error: %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  float *spline_buff = (float*)malloc(size);
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }
  for (int ix=0; ix<Nx; ix++) {
    float *row = spline_buff + (size_t)ix*N;
    for (int isp=0; isp<spline->num_splines; isp++)
      row[isp] = spline->coefs[ix*spline->x_stride + isp];
    // Zero the padding lanes added by the rounding above.
    for (int isp=spline->num_splines; isp<N; isp++)
      row[isp] = 0.0f;
  }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Build a single-precision device mirror of a 1D double-precision real
// multi-spline, converting every coefficient to float on the way.
//
// Steps: upload the 48-entry basis table to the Acuda constant symbol,
// allocate a host descriptor, allocate device storage for the coefficients
// and copy the converted coefficients into it.  On the device each grid point
// owns `stride` consecutive floats, where stride is num_splines rounded up to
// a multiple of COALLESCED_SIZE; the padding entries are zero-filled so that
// no uninitialized host memory is sent to the GPU.
//
// spline : host double-precision spline to mirror (only read).
// Returns a malloc'ed descriptor whose `coefs` member is device memory from
// cudaMalloc.  Any host or device allocation/copy failure prints a message
// to stderr and aborts.
extern "C" multi_UBspline_1d_s_cuda*
create_multi_UBspline_1d_s_cuda_conv (multi_UBspline_1d_d* spline)
{
  // Rows 0-3 hold the cubic basis polynomials (coefficients of t^3..t^0),
  // rows 4-7 their first derivatives and rows 8-11 their second derivatives.
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaError_t err =
    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy A matrix to GPU constant memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }

  multi_UBspline_1d_s_cuda *cuda_spline =
    (multi_UBspline_1d_s_cuda*) malloc (sizeof (multi_UBspline_1d_s_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
  cuda_spline->stride = N;
  cuda_spline->gridInv = spline->x_grid.delta_inv;
  cuda_spline->dim = spline->x_grid.num;

  // Widen before multiplying so the byte count is never computed in int.
  size_t size = (size_t)Nx*N*sizeof(float);
  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients. Error: %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  float *spline_buff = (float*)malloc(size);
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }
  for (int ix=0; ix<Nx; ix++) {
    float *row = spline_buff + (size_t)ix*N;
    // Narrow each double coefficient to float.
    for (int isp=0; isp<spline->num_splines; isp++)
      row[isp] = (float)spline->coefs[ix*spline->x_stride + isp];
    // Zero the padding lanes added by the rounding above.
    for (int isp=spline->num_splines; isp<N; isp++)
      row[isp] = 0.0f;
  }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Build a device-side mirror of a 1D single-precision complex multi-spline.
//
// Steps: upload the 48-entry basis table to the Acuda constant symbol,
// allocate a host descriptor, allocate device storage for the coefficients
// and copy the host coefficients into it.  On the device each grid point owns
// `stride` consecutive complex_float values, where stride is num_splines
// rounded up to a multiple of COALLESCED_SIZE; the padding entries are
// zero-filled so that no uninitialized host memory is sent to the GPU.
//
// spline : host spline to mirror (only read).
// Returns a malloc'ed descriptor whose `coefs` member is device memory from
// cudaMalloc.  Any host or device allocation/copy failure prints a message
// to stderr and aborts.
extern "C" multi_UBspline_1d_c_cuda*
create_multi_UBspline_1d_c_cuda (multi_UBspline_1d_c* spline)
{
  // Rows 0-3 hold the cubic basis polynomials (coefficients of t^3..t^0),
  // rows 4-7 their first derivatives and rows 8-11 their second derivatives.
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaError_t err =
    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy A matrix to GPU constant memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }

  multi_UBspline_1d_c_cuda *cuda_spline =
    (multi_UBspline_1d_c_cuda*) malloc (sizeof (multi_UBspline_1d_c_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
  cuda_spline->stride = N;
  cuda_spline->gridInv = spline->x_grid.delta_inv;
  cuda_spline->dim = spline->x_grid.num;

  // Widen before multiplying so the byte count is never computed in int.
  size_t size = (size_t)Nx*N*sizeof(complex_float);
  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients. Error: %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  complex_float *spline_buff = (complex_float*)malloc(size);
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }
  for (int ix=0; ix<Nx; ix++) {
    complex_float *row = spline_buff + (size_t)ix*N;
    for (int isp=0; isp<spline->num_splines; isp++)
      row[isp] = spline->coefs[ix*spline->x_stride + isp];
    // Zero the padding lanes added by the rounding above.
    for (int isp=spline->num_splines; isp<N; isp++)
      row[isp] = 0.0f;
  }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Build a single-precision complex device mirror of a 1D double-precision
// complex multi-spline; each coefficient is narrowed by assignment from
// complex_double to complex_float.
//
// Steps: upload the 48-entry basis table to the Acuda constant symbol,
// allocate a host descriptor, allocate device storage for the coefficients
// and copy the converted coefficients into it.  On the device each grid point
// owns `stride` consecutive complex_float values, where stride is num_splines
// rounded up to a multiple of COALLESCED_SIZE; the padding entries are
// zero-filled so that no uninitialized host memory is sent to the GPU.
//
// spline : host double-precision complex spline to mirror (only read).
// Returns a malloc'ed descriptor whose `coefs` member is device memory from
// cudaMalloc.  Any host or device allocation/copy failure prints a message
// to stderr and aborts.
extern "C" multi_UBspline_1d_c_cuda*
create_multi_UBspline_1d_c_cuda_conv (multi_UBspline_1d_z* spline)
{
  // Rows 0-3 hold the cubic basis polynomials (coefficients of t^3..t^0),
  // rows 4-7 their first derivatives and rows 8-11 their second derivatives.
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf (stderr, "Error copying A matrix to GPU constant memory: Erorr = %s\n",
             cudaGetErrorString(err));
    abort();
  }

  multi_UBspline_1d_c_cuda *cuda_spline =
    (multi_UBspline_1d_c_cuda*) malloc (sizeof (multi_UBspline_1d_c_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
  cuda_spline->stride = N;
  cuda_spline->gridInv = spline->x_grid.delta_inv;
  cuda_spline->dim = spline->x_grid.num;

  // Widen before multiplying so the byte count is never computed in int.
  size_t size = (size_t)Nx*N*sizeof(complex_float);
  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients. Error: %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  complex_float *spline_buff = (complex_float*)malloc(size);
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }
  for (int ix=0; ix<Nx; ix++) {
    complex_float *row = spline_buff + (size_t)ix*N;
    for (int isp=0; isp<spline->num_splines; isp++)
      row[isp] = spline->coefs[ix*spline->x_stride + isp];
    // Zero the padding lanes added by the rounding above.
    for (int isp=spline->num_splines; isp<N; isp++)
      row[isp] = 0.0f;
  }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Build a device-side mirror of a 3D single-precision complex multi-spline.
//
// Steps: upload the 48-entry basis table to the Acuda constant symbol,
// allocate a host descriptor, allocate device storage for the coefficients
// and copy the host coefficients into it.  The device layout is x-major,
// then y, then z, with N complex values per grid point, where N is
// num_splines rounded up to a multiple of COALLESCED_SIZE; padding entries
// are zero-filled.
//
// The strides stored in the returned descriptor are twice the complex-element
// strides used while packing -- presumably because device code addresses the
// coefficients as individual floats (two per complex value); confirm against
// the kernels.
//
// spline : host spline to mirror (only read).
// Returns a malloc'ed descriptor whose `coefs` member is device memory from
// cudaMalloc.  Any host or device allocation/copy failure prints a message
// to stderr and aborts, matching create_multi_UBspline_3d_c_cuda_conv.
extern "C" multi_UBspline_3d_c_cuda*
create_multi_UBspline_3d_c_cuda (multi_UBspline_3d_c* spline)
{
  // Rows 0-3 hold the cubic basis polynomials (coefficients of t^3..t^0),
  // rows 4-7 their first derivatives and rows 8-11 their second derivatives.
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaError_t err =
    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy A matrix to GPU constant memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }

  multi_UBspline_3d_c_cuda *cuda_spline =
    (multi_UBspline_3d_c_cuda*) malloc (sizeof (multi_UBspline_3d_c_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  // Widen before multiplying: Nx*Ny*Nz*N can exceed INT_MAX for large grids.
  size_t size = (size_t)Nx*Ny*Nz*N*sizeof(std::complex<float>);
  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients. Error: %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  std::complex<float> *spline_buff = (std::complex<float>*)malloc(size);
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++) {
        // Same offset as ix*(Ny*Nz*N) + iy*(Nz*N) + iz*N, computed in size_t.
        std::complex<float> *cell =
          spline_buff + (((size_t)ix*Ny + iy)*Nz + iz)*N;
        for (int isp=0; isp<spline->num_splines; isp++)
          cell[isp] = spline->coefs[ix*spline->x_stride +
                                    iy*spline->y_stride +
                                    iz*spline->z_stride + isp];
        // Zero the padding lanes added by the rounding above.
        for (int isp=spline->num_splines; isp < N; isp++)
          cell[isp] = 0.0;
      }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);

  // Published strides are doubled relative to the complex-element layout.
  cuda_spline->stride.x = 2*Ny*Nz*N;
  cuda_spline->stride.y = 2*Nz*N;
  cuda_spline->stride.z = 2*N;
  return cuda_spline;
}
// Build a single-precision complex device mirror of a 3D double-precision
// complex multi-spline, narrowing the real and imaginary part of every
// coefficient to float.
//
// Steps: upload the 48-entry basis table to the Acuda constant symbol,
// allocate a host descriptor, allocate device storage for the coefficients
// and copy the converted coefficients into it.  The device layout is x-major,
// then y, then z, with N complex values per grid point, where N is
// num_splines rounded up to a multiple of COALLESCED_SIZE; padding entries
// are zero-filled.
//
// The strides stored in the returned descriptor are twice the complex-element
// strides used while packing -- presumably because device code addresses the
// coefficients as individual floats (two per complex value); confirm against
// the kernels.
//
// spline : host double-precision complex spline to mirror (only read).
// Returns a malloc'ed descriptor whose `coefs` member is device memory from
// cudaMalloc.  Any host or device allocation/copy failure prints a message
// to stderr and aborts.
extern "C" multi_UBspline_3d_c_cuda*
create_multi_UBspline_3d_c_cuda_conv (multi_UBspline_3d_z* spline)
{
  // Rows 0-3 hold the cubic basis polynomials (coefficients of t^3..t^0),
  // rows 4-7 their first derivatives and rows 8-11 their second derivatives.
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  multi_UBspline_3d_c_cuda *cuda_spline =
    (multi_UBspline_3d_c_cuda*) malloc (sizeof (multi_UBspline_3d_c_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  // Widen before multiplying: Nx*Ny*Nz*N can exceed INT_MAX for large grids.
  size_t size = (size_t)Nx*Ny*Nz*N*sizeof(std::complex<float>);
  cudaMalloc((void**)&(cuda_spline->coefs), size);
  // cudaGetLastError also reports a failure of the symbol upload above.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    // size_t is printed through an explicit unsigned long cast; the previous
    // "%ld" with a size_t argument was a format/argument mismatch.
    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients. Error %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  std::complex<float> *spline_buff = (std::complex<float>*)malloc(size);
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++) {
        // Same offset as ix*(Ny*Nz*N) + iy*(Nz*N) + iz*N, computed in size_t.
        std::complex<float> *cell =
          spline_buff + (((size_t)ix*Ny + iy)*Nz + iz)*N;
        for (int isp=0; isp<spline->num_splines; isp++) {
          std::complex<double> z = spline->coefs[ix*spline->x_stride +
                                                 iy*spline->y_stride +
                                                 iz*spline->z_stride + isp];
          cell[isp] = std::complex<float>(z.real(), z.imag());
        }
        // Zero the padding lanes added by the rounding above.
        for (int isp=spline->num_splines; isp < N; isp++)
          cell[isp] = 0.0;
      }

  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  cudaThreadSynchronize();
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);

  // Published strides are doubled relative to the complex-element layout.
  cuda_spline->stride.x = 2*Ny*Nz*N;
  cuda_spline->stride.y = 2*Nz*N;
  cuda_spline->stride.z = 2*N;
  return cuda_spline;
}
// Build a device-side mirror of a 3D single-precision real multi-spline.
//
// Steps: upload the 48-entry basis table to the Acuda constant symbol,
// allocate a host descriptor, allocate device storage for the coefficients
// and copy the host coefficients into it.  The device layout is x-major,
// then y, then z, with N floats per grid point, where N is num_splines
// rounded up to a multiple of COALLESCED_SIZE.  Padding entries are
// zero-filled, as the complex 3D routines in this file already do, so that
// no uninitialized host memory is sent to the GPU.
//
// spline : host spline to mirror (only read).
// Returns a malloc'ed descriptor whose `coefs` member is device memory from
// cudaMalloc.  Any host or device allocation/copy failure prints a message
// to stderr and aborts.
extern "C" multi_UBspline_3d_s_cuda*
create_multi_UBspline_3d_s_cuda (multi_UBspline_3d_s* spline)
{
  // Rows 0-3 hold the cubic basis polynomials (coefficients of t^3..t^0),
  // rows 4-7 their first derivatives and rows 8-11 their second derivatives.
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaError_t err =
    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy A matrix to GPU constant memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }

  multi_UBspline_3d_s_cuda *cuda_spline =
    (multi_UBspline_3d_s_cuda*) malloc (sizeof (multi_UBspline_3d_s_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);

  cuda_spline->stride.x = Ny*Nz*N;
  cuda_spline->stride.y = Nz*N;
  cuda_spline->stride.z = N;
  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  // Widen before multiplying: Nx*Ny*Nz*N can exceed INT_MAX for large grids.
  size_t size = (size_t)Nx*Ny*Nz*N*sizeof(float);
  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients. Error: %s\n",
             (unsigned long)size, cudaGetErrorString(err));
    abort();
  }

  float *spline_buff = (float*)malloc(size);
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++) {
        // Same offset as ix*stride.x + iy*stride.y + iz*stride.z, in size_t.
        float *cell = spline_buff + (((size_t)ix*Ny + iy)*Nz + iz)*N;
        for (int isp=0; isp<spline->num_splines; isp++)
          cell[isp] = spline->coefs[ix*spline->x_stride +
                                    iy*spline->y_stride +
                                    iz*spline->z_stride + isp];
        // Zero the padding lanes added by the rounding above.
        for (int isp=spline->num_splines; isp < N; isp++)
          cell[isp] = 0.0f;
      }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory. Error: %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Build a single-precision device copy of a double-precision 3D multi-spline.
//
// Uploads the 48-entry basis table to the Acuda symbol, allocates the
// returned descriptor with malloc() and its coefficient array with
// cudaMalloc(), and copies the host coefficients (narrowed to float) into a
// layout whose innermost extent is padded up to a multiple of
// COALLESCED_SIZE.  Padding lanes are zero-filled.
//
// spline: host-side spline; read only.
// Returns the malloc'ed descriptor.  Aborts the process on any allocation or
// copy failure.
extern "C" multi_UBspline_3d_s_cuda*
create_multi_UBspline_3d_s_cuda_conv (multi_UBspline_3d_d* spline)
{
  fprintf (stderr, "In create_multi_UBspline_3d_s_cuda_conv.\n");
  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                     0.0,     -0.5,      1.0,    -0.5,
                     0.0,      1.5,     -2.0,     0.0,
                     0.0,     -1.5,      1.0,     0.5,
                     0.0,      0.5,      0.0,     0.0,
                     0.0,      0.0,     -1.0,     1.0,
                     0.0,      0.0,      3.0,    -2.0,
                     0.0,      0.0,     -3.0,     1.0,
                     0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  multi_UBspline_3d_s_cuda *cuda_spline =
    (multi_UBspline_3d_s_cuda*) malloc (sizeof (multi_UBspline_3d_s_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);

  cuda_spline->stride.x = Ny*Nz*N;
  cuda_spline->stride.y = Nz*N;
  cuda_spline->stride.z = N;
  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  // Widen before multiplying: the product of four ints can exceed INT_MAX
  // for large orbital sets long before the byte count exceeds size_t.
  size_t num_elems = (size_t)Nx*(size_t)Ny*(size_t)Nz*(size_t)N;
  size_t size = num_elems*sizeof(float);

  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %zu memory for GPU spline coefficients.  Error %s\n",
             size, cudaGetErrorString(err));
    abort();
  }

  // calloc so that the padding lanes uploaded to the device are zero rather
  // than whatever the heap happened to contain.
  float *spline_buff = (float*)calloc(num_elems, sizeof(float));
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }

  const size_t dst_xs = (size_t)Ny*(size_t)Nz*(size_t)N;
  const size_t dst_ys = (size_t)Nz*(size_t)N;
  const size_t dst_zs = (size_t)N;
  for (int ix=0; ix<Nx; ix++) {
    for (int iy=0; iy<Ny; iy++) {
      for (int iz=0; iz<Nz; iz++) {
        for (int isp=0; isp<spline->num_splines; isp++) {
          spline_buff[ix*dst_xs + iy*dst_ys + iz*dst_zs + isp] =
            (float)spline->coefs[ix*spline->x_stride +
                                 iy*spline->y_stride +
                                 iz*spline->z_stride + isp];
        }
      }
    }
  }

  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  cudaThreadSynchronize();
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Build a double-precision device copy of a 3D multi-spline.
//
// Uploads the 48-entry basis table to the Bcuda symbol, allocates the
// returned descriptor with malloc() and its coefficient array with
// cudaMalloc(), and copies the host coefficients into a layout whose
// innermost extent is padded up to a multiple of COALLESCED_SIZE.  Padding
// lanes are zero-filled.
//
// spline: host-side spline; read only.
// Returns the malloc'ed descriptor.  Aborts the process on any allocation or
// copy failure (same policy as create_multi_UBspline_3d_s_cuda_conv).
extern "C" multi_UBspline_3d_d_cuda*
create_multi_UBspline_3d_d_cuda (multi_UBspline_3d_d* spline)
{
  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                      0.0,     -0.5,      1.0,    -0.5,
                      0.0,      1.5,     -2.0,     0.0,
                      0.0,     -1.5,      1.0,     0.5,
                      0.0,      0.5,      0.0,     0.0,
                      0.0,      0.0,     -1.0,     1.0,
                      0.0,      0.0,      3.0,    -2.0,
                      0.0,      0.0,     -3.0,     1.0,
                      0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);

  multi_UBspline_3d_d_cuda *cuda_spline =
    (multi_UBspline_3d_d_cuda*) malloc (sizeof (multi_UBspline_3d_d_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);

  cuda_spline->stride.x = Ny*Nz*N;
  cuda_spline->stride.y = Nz*N;
  cuda_spline->stride.z = N;
  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  // Widen before multiplying so the element count cannot overflow int.
  size_t num_elems = (size_t)Nx*(size_t)Ny*(size_t)Nz*(size_t)N;
  size_t size = num_elems*sizeof(double);

  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %zu memory for GPU spline coefficients.  Error %s\n",
             size, cudaGetErrorString(err));
    abort();
  }

  // Zero-initialised so the padding lanes are well defined on the device.
  double *spline_buff = (double*)calloc(num_elems, sizeof(double));
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }

  const size_t dst_xs = (size_t)Ny*(size_t)Nz*(size_t)N;
  const size_t dst_ys = (size_t)Nz*(size_t)N;
  const size_t dst_zs = (size_t)N;
  for (int ix=0; ix<Nx; ix++) {
    for (int iy=0; iy<Ny; iy++) {
      for (int iz=0; iz<Nz; iz++) {
        for (int isp=0; isp<spline->num_splines; isp++) {
          spline_buff[ix*dst_xs + iy*dst_ys + iz*dst_zs + isp] =
            spline->coefs[ix*spline->x_stride +
                          iy*spline->y_stride +
                          iz*spline->z_stride + isp];
        }
      }
    }
  }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
             cudaGetErrorString(err));
    abort();
  }
  free(spline_buff);
  return cuda_spline;
}
// Build a complex double-precision device copy of a 3D multi-spline.
//
// Uploads the 48-entry basis table to the Bcuda symbol, allocates the
// returned descriptor with malloc() and its coefficient array with
// cudaMalloc(), and copies the host coefficients into a layout whose
// innermost extent is padded up to a multiple of COALLESCED_SIZE.  Padding
// lanes are zero-filled.
//
// The strides stored in the returned descriptor are doubled after the copy,
// i.e. they are expressed in units of double rather than complex elements.
//
// spline: host-side spline; read only.
// Returns the malloc'ed descriptor.  Aborts the process on any allocation or
// copy failure (same policy as create_multi_UBspline_3d_s_cuda_conv).
extern "C" multi_UBspline_3d_z_cuda*
create_multi_UBspline_3d_z_cuda (multi_UBspline_3d_z* spline)
{
  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                      0.0,     -0.5,      1.0,    -0.5,
                      0.0,      1.5,     -2.0,     0.0,
                      0.0,     -1.5,      1.0,     0.5,
                      0.0,      0.5,      0.0,     0.0,
                      0.0,      0.0,     -1.0,     1.0,
                      0.0,      0.0,      3.0,    -2.0,
                      0.0,      0.0,     -3.0,     1.0,
                      0.0,      0.0,      1.0,     0.0 };
  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);

  multi_UBspline_3d_z_cuda *cuda_spline =
    (multi_UBspline_3d_z_cuda*) malloc (sizeof (multi_UBspline_3d_z_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // Round the spline count up to the next multiple of COALLESCED_SIZE.
  int N = spline->num_splines;
  if ((N%COALLESCED_SIZE) != 0)
    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);

  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
  cuda_spline->dim.x = spline->x_grid.num;
  cuda_spline->dim.y = spline->y_grid.num;
  cuda_spline->dim.z = spline->z_grid.num;

  // Widen before multiplying so the element count cannot overflow int.
  size_t num_elems = (size_t)Nx*(size_t)Ny*(size_t)Nz*(size_t)N;
  size_t size = num_elems*sizeof(std::complex<double>);

  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to allocate %zu memory for GPU spline coefficients.  Error %s\n",
             size, cudaGetErrorString(err));
    abort();
  }

  // All-zero bytes represent 0+0i, so calloc gives well-defined padding.
  std::complex<double> *spline_buff =
    (std::complex<double>*)calloc(num_elems, sizeof(std::complex<double>));
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }

  // Strides used while packing are in complex elements.
  const size_t dst_xs = (size_t)Ny*(size_t)Nz*(size_t)N;
  const size_t dst_ys = (size_t)Nz*(size_t)N;
  const size_t dst_zs = (size_t)N;
  for (int ix=0; ix<Nx; ix++) {
    for (int iy=0; iy<Ny; iy++) {
      for (int iz=0; iz<Nz; iz++) {
        for (int isp=0; isp<spline->num_splines; isp++) {
          spline_buff[ix*dst_xs + iy*dst_ys + iz*dst_zs + isp] =
            spline->coefs[ix*spline->x_stride +
                          iy*spline->y_stride +
                          iz*spline->z_stride + isp];
        }
      }
    }
  }

  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
             cudaGetErrorString(err));
    abort();
  }

  // The descriptor stores strides counted in doubles (two per complex).
  cuda_spline->stride.x = 2*Ny*Nz*N;
  cuda_spline->stride.y = 2*Nz*N;
  cuda_spline->stride.z = 2*N;
  free(spline_buff);
  return cuda_spline;
}

View File

@ -0,0 +1,80 @@
#ifndef MULTI_BSPLINE_CREATE_CUDA_H
#define MULTI_BSPLINE_CREATE_CUDA_H
#include "multi_bspline_structs_cuda.h"
// Constructors that take a host multi-spline object and return a pointer to
// the matching *_cuda structure.  All are declared with C linkage.
//
// Naming, as far as the signatures show:
//   create_multi_UBspline_<dim>_<p>_cuda       takes a <p> host spline and
//                                              returns the <p> CUDA spline.
//   create_multi_UBspline_<dim>_<p>_cuda_conv  takes a d (resp. z) host
//                                              spline and returns the s
//                                              (resp. c) CUDA spline.
// The suffixes s/d/c/z presumably mean real-single, real-double,
// complex-single and complex-double -- confirm against
// multi_bspline_structs_cuda.h.
//
// NOTE(review): no matching destroy/free functions are declared in this
// header; check where the returned objects are released.
////////
// 1D //
////////
extern "C" multi_UBspline_1d_s_cuda*
create_multi_UBspline_1d_s_cuda (multi_UBspline_1d_s* spline);
extern "C" multi_UBspline_1d_s_cuda*
create_multi_UBspline_1d_s_cuda_conv (multi_UBspline_1d_d* spline);
extern "C" multi_UBspline_1d_c_cuda*
create_multi_UBspline_1d_c_cuda (multi_UBspline_1d_c* spline);
extern "C" multi_UBspline_1d_c_cuda*
create_multi_UBspline_1d_c_cuda_conv (multi_UBspline_1d_z* spline);
extern "C" multi_UBspline_1d_d_cuda*
create_multi_UBspline_1d_d_cuda (multi_UBspline_1d_d* spline);
extern "C" multi_UBspline_1d_z_cuda*
create_multi_UBspline_1d_z_cuda (multi_UBspline_1d_z* spline);
////////
// 2D //
////////
extern "C" multi_UBspline_2d_s_cuda*
create_multi_UBspline_2d_s_cuda (multi_UBspline_2d_s* spline);
extern "C" multi_UBspline_2d_s_cuda*
create_multi_UBspline_2d_s_cuda_conv (multi_UBspline_2d_d* spline);
extern "C" multi_UBspline_2d_c_cuda*
create_multi_UBspline_2d_c_cuda (multi_UBspline_2d_c* spline);
extern "C" multi_UBspline_2d_c_cuda*
create_multi_UBspline_2d_c_cuda_conv (multi_UBspline_2d_z* spline);
extern "C" multi_UBspline_2d_d_cuda*
create_multi_UBspline_2d_d_cuda (multi_UBspline_2d_d* spline);
extern "C" multi_UBspline_2d_z_cuda*
create_multi_UBspline_2d_z_cuda (multi_UBspline_2d_z* spline);
////////
// 3D //
////////
extern "C" multi_UBspline_3d_s_cuda*
create_multi_UBspline_3d_s_cuda (multi_UBspline_3d_s* spline);
extern "C" multi_UBspline_3d_s_cuda*
create_multi_UBspline_3d_s_cuda_conv (multi_UBspline_3d_d* spline);
extern "C" multi_UBspline_3d_c_cuda*
create_multi_UBspline_3d_c_cuda (multi_UBspline_3d_c* spline);
extern "C" multi_UBspline_3d_c_cuda*
create_multi_UBspline_3d_c_cuda_conv (multi_UBspline_3d_z* spline);
extern "C" multi_UBspline_3d_d_cuda*
create_multi_UBspline_3d_d_cuda (multi_UBspline_3d_d* spline);
extern "C" multi_UBspline_3d_z_cuda*
create_multi_UBspline_3d_z_cuda (multi_UBspline_3d_z* spline);
#endif

View File

@ -0,0 +1,640 @@
#define BLOCK_SIZE 64
#include "multi_bspline.h"
#include "multi_bspline_create_cuda.h"
//__constant__ float A[48];
// typedef struct
// {
// float *coefs_real, *coefs_imag;
// uint3 stride;
// float3 gridInv;
// int num_splines;
// } multi_UBspline_3d_c_cuda;
#ifndef NO_CUDA_MAIN
// Build a device copy of a complex single-precision 3D multi-spline, storing
// real and imaginary parts in two separate float arrays (coefs_real and
// coefs_imag).
//
// Uploads the 48-entry basis table to the Acuda symbol (the table the
// kernels in this file read), allocates the returned descriptor with
// malloc() and both coefficient arrays with cudaMalloc(), and copies the
// host coefficients into a layout whose innermost extent is padded up to a
// multiple of BLOCK_SIZE.  Padding lanes are zero-filled.
//
// spline: host-side spline; read only.
// Returns the malloc'ed descriptor.  Aborts if a host allocation fails.
extern "C" multi_UBspline_3d_c_cuda*
create_multi_UBspline_3d_c_cuda (multi_UBspline_3d_c* spline)
{
  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
                     0.0,     -0.5,      1.0,    -0.5,
                     0.0,      1.5,     -2.0,     0.0,
                     0.0,     -1.5,      1.0,     0.5,
                     0.0,      0.5,      0.0,     0.0,
                     0.0,      0.0,     -1.0,     1.0,
                     0.0,      0.0,      3.0,    -2.0,
                     0.0,      0.0,     -3.0,     1.0,
                     0.0,      0.0,      1.0,     0.0 };
  // The kernels below read Acuda; the old symbol `A` is commented out at the
  // top of this file.
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  // Allocate the struct itself, not just room for a pointer to it.
  multi_UBspline_3d_c_cuda *cuda_spline =
    (multi_UBspline_3d_c_cuda*) malloc (sizeof (multi_UBspline_3d_c_cuda));
  if (!cuda_spline) {
    fprintf (stderr, "Failed to allocate memory for CUDA spline descriptor.\n");
    abort();
  }
  cuda_spline->num_splines = spline->num_splines;

  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;
  // Round the spline count up to the next multiple of BLOCK_SIZE.
  int N = spline->num_splines;
  if ((N%BLOCK_SIZE) != 0)
    N += BLOCK_SIZE - (N%BLOCK_SIZE);

  cuda_spline->stride.x = Ny*Nz*N;
  cuda_spline->stride.y = Nz*N;
  cuda_spline->stride.z = N;

  // Element count is the full product Nx*Ny*Nz*N, computed in size_t.
  size_t num_elems = (size_t)Nx*(size_t)Ny*(size_t)Nz*(size_t)N;
  size_t size = num_elems*sizeof(float);
  cudaMalloc((void**)&(cuda_spline->coefs_real), size);
  cudaMalloc((void**)&(cuda_spline->coefs_imag), size);

  // Zero-initialised so padding lanes are well defined; the real pass and
  // the imaginary pass write the same set of indices, so padding stays zero.
  float *spline_buff = (float*)calloc(num_elems, sizeof(float));
  if (!spline_buff) {
    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
    abort();
  }

  const size_t dst_xs = (size_t)Ny*(size_t)Nz*(size_t)N;
  const size_t dst_ys = (size_t)Nz*(size_t)N;
  const size_t dst_zs = (size_t)N;

  // Pass 1: real parts.
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++)
        for (int isp=0; isp<spline->num_splines; isp++) {
          spline_buff[ix*dst_xs + iy*dst_ys + iz*dst_zs + isp] =
            spline->coefs[ix*spline->x_stride +
                          iy*spline->y_stride +
                          iz*spline->z_stride + isp].real();
        }
  cudaMemcpy(cuda_spline->coefs_real, spline_buff, size, cudaMemcpyHostToDevice);

  // Pass 2: imaginary parts, reusing the same staging buffer.
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++)
        for (int isp=0; isp<spline->num_splines; isp++) {
          spline_buff[ix*dst_xs + iy*dst_ys + iz*dst_zs + isp] =
            spline->coefs[ix*spline->x_stride +
                          iy*spline->y_stride +
                          iz*spline->z_stride + isp].imag();
        }
  cudaMemcpy(cuda_spline->coefs_imag, spline_buff, size, cudaMemcpyHostToDevice);

  free(spline_buff);
  return cuda_spline;
}
#endif
// Kernel: evaluate complex 3D splines (separate real/imag coefficient
// arrays) at one position per blockIdx.y, for the BLOCK_SIZE splines selected
// by blockIdx.x.
//
// pos:        positions, read as pos[4*ir+0..2] for walker ir = blockIdx.y.
// drInv:      per-axis factor multiplied into the position to obtain the
//             grid coordinate.
// coefs_real, coefs_imag: coefficient arrays addressed with `strides`.
// vals:       vals[ir] receives interleaved (real, imag) pairs; this block
//             writes the 2*BLOCK_SIZE floats starting at 2*blockIdx.x*BLOCK_SIZE.
// strides:    element strides of the coefficient arrays along x, y, z.
//
// Must be launched with BLOCK_SIZE (64) threads per block: abc[] has 64
// entries and is indexed by threadIdx.x.
// NOTE(review): the grid indices are not clamped here, so positions outside
// the grid read out of bounds -- confirm callers keep positions in range.
__global__ static void
eval_multi_multi_UBspline_3d_c_cuda (float *pos, float3 drInv,
                                     const float *coefs_real, const float *coefs_imag,
                                     float *vals[], uint3 strides)
{
  int block = blockIdx.x;
  int thr = threadIdx.x;
  int ir = blockIdx.y;
  int off = block*BLOCK_SIZE+thr;
  __shared__ float *myval;
  __shared__ float abc[64];
  __shared__ float3 r;
  // One thread fetches the walker position and output pointer for the block.
  if (thr == 0) {
    r.x = pos[4*ir+0];
    r.y = pos[4*ir+1];
    r.z = pos[4*ir+2];
    myval = vals[ir];
  }
  __syncthreads();

  // Split each scaled coordinate into integer cell index and fraction.
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = (int)sf;
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = (int)sf;
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = (int)sf;
  t.z = s - sf;
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // Per-axis basis weights from the first four rows of Acuda.
  __shared__ float a[4], b[4], c[4];
  if (thr < 4) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Each of the 64 threads forms one entry of the 4x4x4 tensor-product weight.
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[thr] = a[i]*b[j]*c[k];
  __syncthreads();

  float val_real = 0.0;
  float val_imag = 0.0;
  for (int i=0; i<4; i++) {
    for (int j=0; j<4; j++) {
      // Pointers stay const: the coefficient arrays are read-only here.
      const float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
      const float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
      for (int k=0; k<4; k++) {
        val_real += abc[16*i+4*j+k] * base_real[off+k*strides.z];
        val_imag += abc[16*i+4*j+k] * base_imag[off+k*strides.z];
      }
    }
  }

  // Stage interleaved (real, imag) pairs in shared memory so the global
  // stores below are contiguous across the block.
  __shared__ float buff[2*BLOCK_SIZE];
  buff[2*thr+0] = val_real;
  buff[2*thr+1] = val_imag;
  __syncthreads();
  // This block owns output floats [2*block*BLOCK_SIZE, 2*(block+1)*BLOCK_SIZE),
  // i.e. element 2*off+{0,1} for each of its threads.
  float *out = myval + 2*block*BLOCK_SIZE;
  out[thr]            = buff[thr];
  out[thr+BLOCK_SIZE] = buff[thr+BLOCK_SIZE];
}
// Kernel: evaluate value, gradient and Hessian of complex 3D splines
// (separate real/imag coefficient arrays) at one position per blockIdx.y, for
// the BLOCK_SIZE splines selected by blockIdx.x.
//
// pos:        positions, read as pos[4*ir+0..2] for walker ir = blockIdx.y.
// drInv:      per-axis factor multiplied into the position to obtain the
//             grid coordinate; also used to scale the derivatives.
// coefs_real, coefs_imag: coefficient arrays addressed with `strides`.
// vals:       vals[ir]  receives  2 floats per spline (re, im).
// grads:      grads[ir] receives  6 floats per spline (3 components x re/im).
// hess:       hess[ir]  receives 12 floats per spline
//             (xx, xy, xz, yy, yz, zz, each re/im).
// strides:    element strides of the coefficient arrays along x, y, z.
//
// Must be launched with BLOCK_SIZE (64) threads per block: the shared-memory
// staging and the 4x4x4 weight table are sized for exactly 64 threads.
// NOTE(review): the grid indices are not clamped here, so positions outside
// the grid read out of bounds -- confirm callers keep positions in range.
__global__ static void
eval_multi_multi_UBspline_3d_c_vgh_cuda (float *pos, float3 drInv,
                                         const float *coefs_real, const float *coefs_imag,
                                         float *vals[], float *grads[], float *hess[],
                                         uint3 strides)
{
  int block = blockIdx.x;
  int thr = threadIdx.x;
  int ir = blockIdx.y;
  int off = block*BLOCK_SIZE+thr;
  __shared__ float *myval, *mygrad, *myhess;
  __shared__ float3 r;
  // One thread fetches the walker position and output pointers for the block.
  if (thr == 0) {
    r.x = pos[4*ir+0];
    r.y = pos[4*ir+1];
    r.z = pos[4*ir+2];
    myval = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  // Split each scaled coordinate into integer cell index and fraction.
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = (int)sf;
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = (int)sf;
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = (int)sf;
  t.z = s - sf;
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten weights (value, 3 first and 6 second derivatives) for each of the 64
  // (i,j,k) stencil points; thread thr fills the ten entries for its point.
  __shared__ float abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[10*(16*i+4*j+k)+0] = a[i+0]*b[j+0]*c[k+0]; // val
  abc[10*(16*i+4*j+k)+1] = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[10*(16*i+4*j+k)+2] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[10*(16*i+4*j+k)+3] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[10*(16*i+4*j+k)+4] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[10*(16*i+4*j+k)+5] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[10*(16*i+4*j+k)+6] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[10*(16*i+4*j+k)+7] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[10*(16*i+4*j+k)+8] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[10*(16*i+4*j+k)+9] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  float v_r = 0.0;
  float v_i = 0.0;
  float g0_r=0.0, g0_i=0.0, g1_r=0.0, g1_i=0.0, g2_r=0.0, g2_i=0.0,
    h00_r=0.0, h00_i=0.0, h01_r=0.0, h01_i=0.0, h02_r=0.0, h02_i=0.0,
    h11_r=0.0, h11_i=0.0, h12_r=0.0, h12_i=0.0, h22_r=0.0, h22_i=0.0;
  // n walks abc[] in the same (i,j,k) order in which it was filled.
  int n = 0;
  for (int i=0; i<4; i++) {
    for (int j=0; j<4; j++) {
      // Pointers stay const: the coefficient arrays are read-only here.
      const float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
      const float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
      for (int k=0; k<4; k++) {
        float cr = base_real[off+k*strides.z];
        float ci = base_imag[off+k*strides.z];
        v_r   += abc[n+0] * cr;  v_i   += abc[n+0] * ci;
        g0_r  += abc[n+1] * cr;  g0_i  += abc[n+1] * ci;
        g1_r  += abc[n+2] * cr;  g1_i  += abc[n+2] * ci;
        g2_r  += abc[n+3] * cr;  g2_i  += abc[n+3] * ci;
        h00_r += abc[n+4] * cr;  h00_i += abc[n+4] * ci;
        h01_r += abc[n+5] * cr;  h01_i += abc[n+5] * ci;
        h02_r += abc[n+6] * cr;  h02_i += abc[n+6] * ci;
        h11_r += abc[n+7] * cr;  h11_i += abc[n+7] * ci;
        h12_r += abc[n+8] * cr;  h12_i += abc[n+8] * ci;
        h22_r += abc[n+9] * cr;  h22_i += abc[n+9] * ci;
        n += 10;
      }
    }
  }

  // Convert derivatives from grid units to position units.
  g0_r *= drInv.x; g0_i *= drInv.x;
  g1_r *= drInv.y; g1_i *= drInv.y;
  g2_r *= drInv.z; g2_i *= drInv.z;
  h00_r *= drInv.x * drInv.x;  h00_i *= drInv.x * drInv.x;
  h01_r *= drInv.x * drInv.y;  h01_i *= drInv.x * drInv.y;
  h02_r *= drInv.x * drInv.z;  h02_i *= drInv.x * drInv.z;
  h11_r *= drInv.y * drInv.y;  h11_i *= drInv.y * drInv.y;
  h12_r *= drInv.y * drInv.z;  h12_i *= drInv.y * drInv.z;
  h22_r *= drInv.z * drInv.z;  h22_i *= drInv.z * drInv.z;

  // Results are staged through shared memory so that every global store is
  // contiguous across the block.  buff is reused for each output group, so a
  // barrier separates every read phase from the next write phase.
  __shared__ float buff[6*BLOCK_SIZE];
  // Note, we can reuse abc, by replacing buff with abc.

  // Values: 2 floats per thread; this block owns
  // [2*block*BLOCK_SIZE, 2*(block+1)*BLOCK_SIZE) of myval.
  buff[2*thr+0] = v_r;  buff[2*thr+1] = v_i;
  __syncthreads();
  myval[(2*block+0)*BLOCK_SIZE+thr] = buff[thr];
  myval[(2*block+1)*BLOCK_SIZE+thr] = buff[thr+BLOCK_SIZE];
  __syncthreads();

  // Gradients: 6 floats per thread.
  buff[6*thr+0] = g0_r;  buff[6*thr+1] = g0_i;
  buff[6*thr+2] = g1_r;  buff[6*thr+3] = g1_i;
  buff[6*thr+4] = g2_r;  buff[6*thr+5] = g2_i;
  __syncthreads();
  for (int i=0; i<6; i++)
    mygrad[(6*block+i)*BLOCK_SIZE+thr] = buff[i*BLOCK_SIZE+thr];
  __syncthreads();

  // Hessians: 12 floats per thread do not fit in buff at once, so they are
  // written in two halves of 32 threads each.  Only the owning half-block
  // fills buff, but ALL threads take part in copying the resulting
  // 6*BLOCK_SIZE floats out.

  // Write first half of Hessians
  if (thr < 32) {
    buff[12*thr+0]  = h00_r;  buff[12*thr+1]  = h00_i;
    buff[12*thr+2]  = h01_r;  buff[12*thr+3]  = h01_i;
    buff[12*thr+4]  = h02_r;  buff[12*thr+5]  = h02_i;
    buff[12*thr+6]  = h11_r;  buff[12*thr+7]  = h11_i;
    buff[12*thr+8]  = h12_r;  buff[12*thr+9]  = h12_i;
    buff[12*thr+10] = h22_r;  buff[12*thr+11] = h22_i;
  }
  __syncthreads();
  for (int i=0; i<6; i++)
    myhess[(12*block+i)*BLOCK_SIZE+thr] = buff[i*BLOCK_SIZE+thr];
  __syncthreads();

  // Write second half of Hessians
  int th2 = thr-32;
  if (thr >= 32) {
    buff[12*th2+0]  = h00_r;  buff[12*th2+1]  = h00_i;
    buff[12*th2+2]  = h01_r;  buff[12*th2+3]  = h01_i;
    buff[12*th2+4]  = h02_r;  buff[12*th2+5]  = h02_i;
    buff[12*th2+6]  = h11_r;  buff[12*th2+7]  = h11_i;
    buff[12*th2+8]  = h12_r;  buff[12*th2+9]  = h12_i;
    buff[12*th2+10] = h22_r;  buff[12*th2+11] = h22_i;
  }
  __syncthreads();
  for (int i=0; i<6; i++)
    myhess[(12*block+i+6)*BLOCK_SIZE+thr] = buff[i*BLOCK_SIZE+thr];
}
#ifndef NO_CUDA_MAIN
static void *
test_multi_cuda(void *thread)
{
// CUcontext ctx;
// CUdevice dev;
// cuDeviceGet (&dev, (int)(size_t)thread);
// cuCtxCreate(&ctx, CU_CTX_SCHED_YIELD, dev);
// int deviceCount;
// cudaGetDeviceCount(&deviceCount);
cudaSetDevice((int)(size_t)thread);
fprintf (stderr, "In thread %p\n", thread);
int numWalkers = 200;
float *coefs , __device__ *vals[numWalkers], *grads[numWalkers], *hess[numWalkers];
float *coefs_real_d, *coefs_imag_d, __device__ **vals_d, **grads_d, **hess_d;
float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
-3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
0.0, -0.5, 1.0, -0.5,
0.0, 1.5, -2.0, 0.0,
0.0, -1.5, 1.0, 0.5,
0.0, 0.5, 0.0, 0.0,
0.0, 0.0, -1.0, 1.0,
0.0, 0.0, 3.0, -2.0,
0.0, 0.0, -3.0, 1.0,
0.0, 0.0, 1.0, 0.0 };
// Copy A to host
cudaMemcpy(Acuda, A_h, 48*sizeof(float), cudaMemcpyHostToDevice);
float *r_d, *r_h;
int xs, ys, zs, N;
int Nx, Ny, Nz;
N = 128;
Nx = Ny = Nz = 16;
xs = Ny*Nz*N;
ys = Nz*N;
zs = N;
float3 drInv;
drInv.x = 1.0/float(Nx);
drInv.y = 1.0/float(Ny);
drInv.z = 1.0/float(Nz);
// Setup Bspline coefficients
int size = Nx*Ny*Nz*N*sizeof(float);
posix_memalign((void**)&coefs, 16, size);
for (int ix=0; ix<Nx; ix++)
for (int iy=0; iy<Ny; iy++)
for (int iz=0; iz<Nz; iz++)
for (int n=0; n<N; n++)
coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
fprintf (stderr, "Filled in coefs.\n");
// Setup values
//posix_memalign((void**)&vals, 16, N*sizeof(float));
// cudaMemcpy(r_d, r, numWalkers*sizeof(float3), cudaMemcpyHostToDevice);
fprintf (stderr, "size = %d\n", size);
// Setup CUDA coefficients
fprintf (stderr, "Before first CUDA mallocs.\n");
cudaMalloc((void**)&coefs_real_d, 2*size);
cudaMalloc((void**)&coefs_imag_d, 2*size);
fprintf (stderr, "Before Memcpy.\n");
cudaMemcpy(coefs_real_d, coefs, size, cudaMemcpyHostToDevice);
cudaMemcpy(coefs_imag_d, coefs, size, cudaMemcpyHostToDevice);
fprintf (stderr, "After Memcpy.\n");
// Setup device value storage
int numVals = 2*N*numWalkers*10;
float *valBlock_d, *valBlock_h;
cudaMalloc((void**)&(valBlock_d), numVals*sizeof(float));
cudaMallocHost((void**)&(valBlock_h), numVals*sizeof(float));
cudaMalloc((void**)&(vals_d), 2*numWalkers*sizeof(float*));
cudaMalloc((void**)&(grads_d), 2*numWalkers*sizeof(float*));
cudaMalloc((void**)&(hess_d), 2*numWalkers*sizeof(float*));
fprintf (stderr, "valBlock_d = %p\n", valBlock_d);
for (int i=0; i<numWalkers; i++) {
vals[i] = valBlock_d + 2*i*N;
grads[i] = valBlock_d + 2*N*numWalkers + 6*i*N;
hess[i] = valBlock_d + 8*N*numWalkers + 12*i*N;
}
cudaMemcpy(vals_d, vals, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(grads_d, grads, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(hess_d, hess, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
fprintf (stderr, "Finished cuda allocations.\n");
// Setup walker positions
cudaMalloc((void**)&(r_d), 4*numWalkers*sizeof(float));
cudaMallocHost((void**)&(r_h), 4*numWalkers*sizeof(float));
for (int ir=0; ir<numWalkers; ir++) {
r_h[4*ir+0] = 0.5*drand48();
r_h[4*ir+1] = 0.5*drand48();
r_h[4*ir+2] = 0.5*drand48();
}
uint3 strides;
strides.x = xs;
strides.y = ys;
strides.z = zs;
dim3 dimBlock(BLOCK_SIZE);
dim3 dimGrid(N/BLOCK_SIZE,numWalkers);
clock_t start, end;
start = clock();
for (int i=0; i<10000; i++) {
if ((i%1000) == 0)
fprintf (stderr, "i = %d\n", i);
cudaMemcpy(r_d, r_h, 4*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
eval_multi_multi_UBspline_3d_c_cuda<<<dimGrid,dimBlock>>>
(r_d, drInv, coefs_real_d, coefs_imag_d,
vals_d, strides);
// eval_multi_multi_UBspline_3d_cuda_c<<<dimGrid,dimBlock>>>
// (r_d, drInv, coefs_real_d, coefs_imag_d,
// valBlock_d, valBlock_d+numVals/2, strides);
//cudaMemcpy(valBlock_h, valBlock_d, numVals*sizeof(float), cudaMemcpyDeviceToHost);
}
end = clock();
double time = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
fprintf (stderr, "VGH evals per second = %1.8e\n", 1.0/time);
start = clock();
for (int i=0; i<10000; i++) {
if ((i%1000) == 0)
fprintf (stderr, "i = %d\n", i);
cudaMemcpy(r_d, r_h, 4*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
eval_multi_multi_UBspline_3d_c_vgh_cuda<<<dimGrid,dimBlock>>>
(r_d, drInv, coefs_real_d, coefs_imag_d,
vals_d, grads_d, hess_d, strides);
}
end = clock();
time = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
fprintf (stderr, "Evals per second = %1.8e\n", 1.0/time);
cudaFree (valBlock_d);
cudaFree (vals_d);
cudaFree (coefs_real_d);
cudaFree (coefs_imag_d);
cudaFree (r_d);
return NULL;
// cudaMemcpy (vals, vals_d, N*sizeof(float), cudaMemcpyDeviceToHost);
// float vals2[N];
// for (int n=0; n<N; n++) {
// vals2[n] = 0.0;
// int index=0;
// for(int i=0; i<4; i++)
// for (int j=0; j<4; j++)
// for (int k=0; k<4; k++) {
// vals2[n] += abc[index] * coefs[(ix+i)*xs+(iy+j)*ys+(iz+k)*zs+n];
// index++;
// }
// }
// for (int i=0; i<N/256; i++)
// fprintf (stderr, "%1.9f %1.9f\n", vals[i], vals2[i]);
// cudaFree(abc_d);
// cudaFree(coefs_d);
// cudaFree(vals_d);
}
#endif
#ifndef NO_CUDA_MAIN
// Benchmark driver: lists the properties of every CUDA device to stderr and
// then runs test_multi_cuda on device 0.
// Returns 0 on success, 1 if no CUDA device is present.
int
main()
{
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  fprintf (stderr, "Detected %d CUDA devices.\n", deviceCount);
  for (int device = 0; device < deviceCount; ++device) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, device);
    fprintf (stderr, "Device %d:\n", device);
    // The memory-size fields are size_t, so they need %zu rather than %d.
    fprintf (stderr, " Global memory: %10zu\n",
             deviceProp.totalGlobalMem);
    fprintf (stderr, " MultiProcessors: %10d\n",
             deviceProp.multiProcessorCount);
    fprintf (stderr, " Registers: %10d\n",
             deviceProp.regsPerBlock);
    fprintf (stderr, " Constant memory: %10zu\n",
             deviceProp.totalConstMem);
    fprintf (stderr, " Shared memory: %10zu\n",
             deviceProp.sharedMemPerBlock);
  }
  // The benchmark always targets device 0; skip it when there is none.
  if (deviceCount < 1)
    return 1;
  test_multi_cuda((void*)0);
  return 0;
}
#endif

View File

@ -0,0 +1,924 @@
#ifndef MULTI_BSPLINE_CUDA_C_IMPL_H
#define MULTI_BSPLINE_CUDA_C_IMPL_H
#include "multi_bspline.h"
#include "multi_bspline_create_cuda.h"
// Kernel: evaluate complex 1D splines at one position per thread block.
//
// Block `blockIdx.x` handles the position pos[blockIdx.x] and writes 2*N
// floats to vals[blockIdx.x].  The coefficient array is addressed as a flat
// float array with `stride` floats between successive grid points; the four
// grid points starting at the clamped cell index are combined with weights
// taken from the first four rows of Acuda.
__global__ static void
eval_multi_multi_UBspline_1d_c_kernel
(float *pos, float drInv, const float *coefs, float **vals,
 uint dim, uint stride, int N)
{
  const int lane = threadIdx.x;
  const int walker = blockIdx.x;

  // Thread 0 publishes the position and output pointer to the whole block.
  __shared__ float *out_base;
  __shared__ float x;
  if (lane == 0) {
    x = pos[walker];
    out_base = vals[walker];
  }
  __syncthreads();

  // Cell index (clamped to [0, dim-1]) and fractional offset within the cell.
  float scaled = x * drInv;
  float cell = floor(scaled);
  int index = min(max(0,(int)cell), dim-1);
  float frac = scaled - cell;
  float4 powers = make_float4(frac*frac*frac, frac*frac, frac, 1.0);

  // Four basis weights, one per thread 0..3.
  __shared__ float a[4];
  if (lane < 4)
    a[lane] = Acuda[4*lane+0]*powers.x + Acuda[4*lane+1]*powers.y + Acuda[4*lane+2]*powers.z + Acuda[4*lane+3]*powers.w;
  __syncthreads();

  const int stride2 = 2*stride;
  const int stride3 = 3*stride;
  const int total = 2*N;

  // Each thread walks the 2*N outputs in steps of SPLINE_BLOCK_SIZE; the
  // final partial chunk is covered by the same loop bound.
  const float *c = coefs + index*stride + lane;
  float *dst = out_base + lane;
  for (int n = lane; n < total; n += SPLINE_BLOCK_SIZE) {
    *dst = (a[0] * c[0] +
            a[1] * c[stride] +
            a[2] * c[stride2] +
            a[3] * c[stride3]);
    dst += SPLINE_BLOCK_SIZE;
    c   += SPLINE_BLOCK_SIZE;
  }
}
extern "C" void
eval_multi_multi_UBspline_1d_c_cuda (const multi_UBspline_1d_c_cuda *spline,
float *pos_d, float *vals_d[], int num)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(num);
eval_multi_multi_UBspline_1d_c_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, (float*)spline->coefs, vals_d, spline->dim, spline->stride, spline->num_splines);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_c_cuda:\n %s\n",
cudaGetErrorString(err));
abort();
}
}
// Kernel: evaluate value, first derivative and second derivative of complex
// 1D splines at one position per thread block.
//
// Block `blockIdx.x` handles the position pos[blockIdx.x] and writes 2*N
// floats to each of vals[blockIdx.x], grads[blockIdx.x] and
// lapl[blockIdx.x].  The coefficient array is addressed as a flat float
// array with `stride` floats between successive grid points.  Rows 0-3 of
// Acuda give the value weights, rows 4-7 the weights stored to grads and
// rows 8-11 the weights stored to lapl.
//
// NOTE(review): the derivative outputs are not multiplied by drInv here --
// confirm whether the caller applies that scaling.
__global__ static void
eval_multi_multi_UBspline_1d_c_vgl_kernel
(float *pos, float drInv, const float *coefs, float **vals,
 float **grads, float **lapl,
 uint dim, uint stride, int N)
{
  int tid = threadIdx.x;
  int ir = blockIdx.x;

  // Thread 0 publishes the position and output pointers to the whole block.
  __shared__ float *ourval, *ourgrad, *ourlapl;
  __shared__ float r;
  if (tid == 0) {
    r = pos[ir];
    ourval = vals[ir];
    ourgrad = grads[ir];
    ourlapl = lapl[ir];
  }
  __syncthreads();

  // Cell index (clamped to [0, dim-1]) and fractional offset within the cell.
  int index;
  float t;
  float s, sf;
  float4 tp;
  s = r * drInv;
  sf = floor(s);
  index = min(max(0,(int)sf), dim-1);
  t = s - sf;
  tp = make_float4(t*t*t, t*t, t, 1.0);

  // Twelve basis weights, one per thread 0..11.
  __shared__ float a[12];
  if (tid < 12)
    a[tid] = Acuda[4*tid+0]*tp.x + Acuda[4*tid+1]*tp.y + Acuda[4*tid+2]*tp.z + Acuda[4*tid+3]*tp.w;
  __syncthreads();

  const float *c = coefs + index*stride + tid;
  float *myval = ourval + tid;
  float *mygrad = ourgrad + tid;
  float *mylapl = ourlapl + tid;
  int stride2 = 2*stride;
  int stride3 = 3*stride;

  // Each thread walks the 2*N outputs in steps of SPLINE_BLOCK_SIZE.  The
  // single loop bound also covers the final partial chunk, so value,
  // gradient and Laplacian are all written for the tail elements as well.
  for (int n = tid; n < 2*N; n += SPLINE_BLOCK_SIZE) {
    // Load the four coefficients once and reuse them for all three outputs.
    float c0 = c[0];
    float c1 = c[stride];
    float c2 = c[stride2];
    float c3 = c[stride3];
    *myval  = (a[0] * c0 + a[1] * c1 +
               a[2] * c2 + a[3] * c3);
    *mygrad = (a[4] * c0 + a[5] * c1 +
               a[6] * c2 + a[7] * c3);
    *mylapl = (a[8] * c0 + a[9] * c1 +
               a[10] * c2 + a[11]* c3);
    myval += SPLINE_BLOCK_SIZE;
    mygrad += SPLINE_BLOCK_SIZE;
    mylapl += SPLINE_BLOCK_SIZE;
    c += SPLINE_BLOCK_SIZE;
  }
}
// Host entry point: evaluate value, gradient and Laplacian of a complex 1D
// multi-spline at `num` positions.
//
// Launches eval_multi_multi_UBspline_1d_c_vgl_kernel with one block of
// SPLINE_BLOCK_SIZE threads per position, waits for it to finish, and aborts
// the process with a message on stderr if CUDA reports an error.
//
// spline:  device-side spline descriptor.
// pos_d:   positions, passed straight to the kernel.
// vals_d, grads_d, lapl_d: per-position output pointers, passed straight to
//          the kernel.
// num:     number of positions (grid size).
extern "C" void
eval_multi_multi_UBspline_1d_c_vgl_cuda (const multi_UBspline_1d_c_cuda *spline,
                                         float *pos_d, float *vals_d[],
                                         float *grads_d[], float *lapl_d[], int num)
{
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(num);
  eval_multi_multi_UBspline_1d_c_vgl_kernel<<<dimGrid,dimBlock>>>
    (pos_d, spline->gridInv, (float*)spline->coefs, vals_d, grads_d, lapl_d,
     spline->dim, spline->stride, spline->num_splines);
  cudaThreadSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    // Report this function's own name so failures are not attributed to the
    // value-only entry point.
    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_c_vgl_cuda:\n %s\n",
             cudaGetErrorString(err));
    abort();
  }
}
// Kernel: evaluate complex 3D splines (flat float coefficient array) at one
// position per blockIdx.y.
//
// Thread (blockIdx.x, threadIdx.x) handles output float
// blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x of vals[blockIdx.y]; outputs at
// or beyond 2*N are skipped.  Positions are read as three consecutive floats
// per walker.  Cell indices are clamped to [0, dim-1] on every axis.
//
// The 4x4x4 weight table is filled by threads 0..63, so the launch needs at
// least 64 threads per block.
__global__ static void
eval_multi_multi_UBspline_3d_c_kernel
(float *pos, float3 drInv, const float *coefs, float *vals[],
 uint3 dim, uint3 strides, int N)
{
  const int tid = threadIdx.x;
  const int walker = blockIdx.y;
  const int col = blockIdx.x*SPLINE_BLOCK_SIZE + tid;

  __shared__ float *out;
  __shared__ float weight[64];
  __shared__ float3 r;
  // Thread 0 publishes the position and output pointer to the whole block.
  if (tid == 0) {
    r.x = pos[3*walker+0];
    r.y = pos[3*walker+1];
    r.z = pos[3*walker+2];
    out = vals[walker];
  }
  __syncthreads();

  // Per axis: scaled coordinate -> clamped cell index + fractional part.
  int3 index;
  float3 t;
  float scaled, cell;

  scaled = r.x * drInv.x;
  cell = floor(scaled);
  index.x = min(max(0,(int)cell), dim.x-1);
  t.x = scaled - cell;

  scaled = r.y * drInv.y;
  cell = floor(scaled);
  index.y = min(max(0,(int)cell), dim.y-1);
  t.y = scaled - cell;

  scaled = r.z * drInv.z;
  cell = floor(scaled);
  index.z = min(max(0,(int)cell), dim.z-1);
  t.z = scaled - cell;

  const float4 px = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  const float4 py = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  const float4 pz = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // Per-axis basis weights from the first four rows of Acuda.
  __shared__ float a[4], b[4], c[4];
  if (tid < 4) {
    a[tid] = Acuda[4*tid+0]*px.x + Acuda[4*tid+1]*px.y + Acuda[4*tid+2]*px.z + Acuda[4*tid+3]*px.w;
    b[tid] = Acuda[4*tid+0]*py.x + Acuda[4*tid+1]*py.y + Acuda[4*tid+2]*py.z + Acuda[4*tid+3]*py.w;
    c[tid] = Acuda[4*tid+0]*pz.x + Acuda[4*tid+1]*pz.y + Acuda[4*tid+2]*pz.z + Acuda[4*tid+3]*pz.w;
  }
  __syncthreads();

  // Threads 0..63 each form one entry of the tensor-product weight table.
  const int wi = (tid>>4)&3;
  const int wj = (tid>>2)&3;
  const int wk = (tid & 3);
  if (tid < 64)
    weight[tid] = a[wi]*b[wj]*c[wk];
  __syncthreads();

  // No barriers follow, so surplus threads can simply leave.
  if (!(col < 2*N))
    return;

  float sum = 0.0;
  for (int ix=0; ix<4; ix++) {
    for (int iy=0; iy<4; iy++) {
      const float *base = coefs + (index.x+ix)*strides.x + (index.y+iy)*strides.y + index.z*strides.z;
      for (int iz=0; iz<4; iz++)
        sum += weight[16*ix+4*iy+iz] * base[col+iz*strides.z];
    }
  }
  out[col] = sum;
}
// Value, gradient and Hessian evaluation of the complex 3D multi-spline.
//
// Work decomposition as read from the indices below: blockIdx.y selects the
// position (pos[3*ir .. 3*ir+2]) and the per-position output arrays vals[ir],
// grads[ir], hess[ir]; blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects one
// float slot `off`.  Slots are bounded by 2*N -- presumably the real and
// imaginary parts of N complex splines (confirm with the caller).
//
// Output layout produced by the staged writes at the end:
//   vals [off]         value
//   grads[3*off + c]   c = 0..2 : d/dx, d/dy, d/dz (scaled by drInv)
//   hess [6*off + c]   c = 0..5 : xx, xy, xz, yy, yz, zz
//
// The shared array abc (640 floats) first holds the ten 4x4x4 basis-product
// tables and is then recycled as a transpose buffer, so the staging assumes
// 6*SPLINE_BLOCK_SIZE <= 640.
__global__ static void
eval_multi_multi_UBspline_3d_c_vgh_kernel
(float *pos, float3 drInv, const float *coefs,
 float *vals[], float *grads[], float *hess[],
 uint3 dim, uint3 strides, int N)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
  __shared__ float *myval, *mygrad, *myhess;
  __shared__ float3 r;
  // One thread fetches the position and output pointers for the block.
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval  = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  // Per axis: scale by drInv, clamp the cell index into [0, dim-1] and keep
  // the offset from floor().
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = min(max(0,(int)sf), dim.x-1);
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = min(max(0,(int)sf), dim.y-1);
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = min(max(0,(int)sf), dim.z-1);
  t.z = s - sf;
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten 64-entry tables of basis products, one per output quantity.  Each
  // thread fills entry 16*i+4*j+k of every table, with (i,j,k) taken from the
  // low six bits of its thread index.
  __shared__ float abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  int n = 0;
  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < 2*N) {
    for (int i=0; i<4; i++) {
      for (int j=0; j<4; j++) {
        const float *base = b0 + i*strides.x + j*strides.y;
        // The k loop is unrolled by hand: four z-neighbours per (i,j).
        float c0 = base[0*strides.z];
        float c1 = base[1*strides.z];
        float c2 = base[2*strides.z];
        float c3 = base[3*strides.z];
        v   += abc[n+  0]*c0 + abc[n+  1]*c1 + abc[n+  2]*c2 + abc[n+  3]*c3;
        g0  += abc[n+ 64]*c0 + abc[n+ 65]*c1 + abc[n+ 66]*c2 + abc[n+ 67]*c3;
        g1  += abc[n+128]*c0 + abc[n+129]*c1 + abc[n+130]*c2 + abc[n+131]*c3;
        g2  += abc[n+192]*c0 + abc[n+193]*c1 + abc[n+194]*c2 + abc[n+195]*c3;
        h00 += abc[n+256]*c0 + abc[n+257]*c1 + abc[n+258]*c2 + abc[n+259]*c3;
        h01 += abc[n+320]*c0 + abc[n+321]*c1 + abc[n+322]*c2 + abc[n+323]*c3;
        h02 += abc[n+384]*c0 + abc[n+385]*c1 + abc[n+386]*c2 + abc[n+387]*c3;
        h11 += abc[n+448]*c0 + abc[n+449]*c1 + abc[n+450]*c2 + abc[n+451]*c3;
        h12 += abc[n+512]*c0 + abc[n+513]*c1 + abc[n+514]*c2 + abc[n+515]*c3;
        h22 += abc[n+576]*c0 + abc[n+577]*c1 + abc[n+578]*c2 + abc[n+579]*c3;
        n += 4;
      }
    }
    // Convert derivatives from grid units using the inverse spacing.
    g0 *= drInv.x;
    g1 *= drInv.y;
    g2 *= drInv.z;
    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;
    myval[off] = v;
  }

  // abc is recycled as a staging buffer below.  Every thread must have
  // finished reading the basis products first, otherwise a faster warp would
  // overwrite entries that a slower warp is still accumulating from.
  __syncthreads();

  // Stage gradients as 3 floats per thread, then write them out so that
  // consecutive threads store consecutive addresses.
  abc[3*thr+0] = g0;
  abc[3*thr+1] = g1;
  abc[3*thr+2] = g2;
  __syncthreads();
  for (int i=0; i<3; i++) {
    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 6*N)
      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
  __syncthreads();

  // Write Hessians: same staging scheme with 6 floats per thread.
  abc[6*thr+0] = h00;
  abc[6*thr+1] = h01;
  abc[6*thr+2] = h02;
  abc[6*thr+3] = h11;
  abc[6*thr+4] = h12;
  abc[6*thr+5] = h22;
  __syncthreads();
  for (int i=0; i<6; i++) {
    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 12*N)
      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
}
extern "C" void
eval_multi_multi_UBspline_3d_c_cuda (const multi_UBspline_3d_c_cuda *spline,
float *pos_d, complex_float *vals_d[], int num)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (2*spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
eval_multi_multi_UBspline_3d_c_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, (float*)spline->coefs, (float**)vals_d, spline->dim, spline->stride, spline->num_splines);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_c_cuda:\n %s\n",
cudaGetErrorString(err));
abort();
}
}
extern "C" void
eval_multi_multi_UBspline_3d_c_vgh_cuda (const multi_UBspline_3d_c_cuda *spline,
float *pos_d, complex_float *vals_d[], complex_float *grads_d[],
complex_float *hess_d[], int num)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
if ((2*spline->num_splines) % SPLINE_BLOCK_SIZE)
dimGrid.x++;
eval_multi_multi_UBspline_3d_c_vgh_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, (float*)spline->coefs,
(float**)vals_d, (float**)grads_d, (float**)hess_d,
spline->dim, spline->stride, spline->num_splines);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_c_vgh_cuda:\n %s\n",
cudaGetErrorString(err));
abort();
}
}
// Value, gradient and Laplacian evaluation of the complex 3D multi-spline.
//
// blockIdx.y selects the position (pos[3*ir .. 3*ir+2]) and the output arrays
// vals[ir] / grad_lapl[ir]; blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects
// one float slot `off`, bounded by 2*N (presumably real/imaginary parts of N
// complex splines -- confirm with the caller).
//
// Linv supplies nine floats read as a row-major 3x3 matrix G.  Gradients are
// multiplied by G before being stored, and the Laplacian is formed by
// contracting the symmetric Hessian with GGt (see the comments near the end).
//
// Output rows in grad_lapl[ir], each row being `row_stride` apart:
//   off + 0*row_stride, off + 2*row_stride, off + 4*row_stride : gradient
//   off + 6*row_stride                                         : Laplacian
__global__ static void
eval_multi_multi_UBspline_3d_c_vgl_kernel
(float *pos, float3 drInv, const float *coefs, float Linv[],
float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride)
{
int block = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ float *myval, *mygrad_lapl;
__shared__ float3 r;
// One thread fetches the position and output pointers for the whole block.
if (thr == 0) {
r.x = pos[3*ir+0];
r.y = pos[3*ir+1];
r.z = pos[3*ir+2];
myval = vals[ir];
mygrad_lapl = grad_lapl[ir];
}
__syncthreads();
// Per axis: scale by drInv, clamp the cell index into [0, dim-1] and keep the
// offset from floor() in t.
int3 index;
float3 t;
float s, sf;
float4 tp[3];
s = r.x * drInv.x;
sf = floor(s);
index.x = min(max(0,(int)sf), dim.x-1);
t.x = s - sf;
s = r.y * drInv.y;
sf = floor(s);
index.y = min(max(0,(int)sf), dim.y-1);
t.y = s - sf;
s = r.z * drInv.z;
sf = floor(s);
index.z = min(max(0,(int)sf), dim.z-1);
t.z = s - sf;
// Powers (t^3, t^2, t, 1) of each offset.
tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
// First 4 of a are value, second 4 are derivative, last four are
// second derivative.
__shared__ float a[12], b[12], c[12];
if (thr < 12) {
a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
}
__syncthreads();
// Ten 64-entry tables of basis products, one per output quantity.  Each thread
// fills entry 16*i+4*j+k of every table, with (i,j,k) taken from the low six
// bits of its thread index.
__shared__ float abc[640];
int i = (thr>>4)&3;
int j = (thr>>2)&3;
int k = (thr & 3);
abc[(16*i+4*j+k)+0] = a[i+0]*b[j+0]*c[k+0]; // val
abc[(16*i+4*j+k)+64] = a[i+4]*b[j+0]*c[k+0]; // d/dx
abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
__syncthreads();
float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
int n = 0;
// b0 points at this slot's coefficient in the corner of the 4x4x4 neighbourhood.
const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
if (off < 2*N) {
for (int i=0; i<4; i++) {
for (int j=0; j<4; j++) {
const float *base = b0 + i*strides.x + j*strides.y;
// The k loop is unrolled by hand: four z-neighbours per (i,j).
float c0 = base[0*strides.z];
float c1 = base[1*strides.z];
float c2 = base[2*strides.z];
float c3 = base[3*strides.z];
v += abc[n+ 0]*c0 + abc[n+ 1]*c1 + abc[n+ 2]*c2 + abc[n+ 3]*c3;
g0 += abc[n+ 64]*c0 + abc[n+ 65]*c1 + abc[n+ 66]*c2 + abc[n+ 67]*c3;
g1 += abc[n+128]*c0 + abc[n+129]*c1 + abc[n+130]*c2 + abc[n+131]*c3;
g2 += abc[n+192]*c0 + abc[n+193]*c1 + abc[n+194]*c2 + abc[n+195]*c3;
h00 += abc[n+256]*c0 + abc[n+257]*c1 + abc[n+258]*c2 + abc[n+259]*c3;
h01 += abc[n+320]*c0 + abc[n+321]*c1 + abc[n+322]*c2 + abc[n+323]*c3;
h02 += abc[n+384]*c0 + abc[n+385]*c1 + abc[n+386]*c2 + abc[n+387]*c3;
h11 += abc[n+448]*c0 + abc[n+449]*c1 + abc[n+450]*c2 + abc[n+451]*c3;
h12 += abc[n+512]*c0 + abc[n+513]*c1 + abc[n+514]*c2 + abc[n+515]*c3;
h22 += abc[n+576]*c0 + abc[n+577]*c1 + abc[n+578]*c2 + abc[n+579]*c3;
n += 4;
// for (int k=0; k<4; k++) {
// float c = base[k*strides.z];
// v += abc[n+ 0] * c;
// g0 += abc[n+ 64] * c;
// g1 += abc[n+128] * c;
// g2 += abc[n+192] * c;
// h00 += abc[n+256] * c;
// h01 += abc[n+320] * c;
// h02 += abc[n+384] * c;
// h11 += abc[n+448] * c;
// h12 += abc[n+512] * c;
// h22 += abc[n+576] * c;
// n += 1;
// }
}
}
// Convert derivatives from grid units using the inverse spacing.
g0 *= drInv.x;
g1 *= drInv.y;
g2 *= drInv.z;
h00 *= drInv.x * drInv.x;
h01 *= drInv.x * drInv.y;
h02 *= drInv.x * drInv.z;
h11 *= drInv.y * drInv.y;
h12 *= drInv.y * drInv.z;
h22 *= drInv.z * drInv.z;
// __shared__ float buff[6*SPLINE_BLOCK_SIZE];
// Note, we can reuse abc, by replacing buff with abc.
myval[off] = v;
}
// Threads 0..8 load G row-major from Linv (G[i0][i1] = Linv[3*i0+i1]) and then
// form GGt[i0][i1] = sum over m of G[m][i0]*G[m][i1].
__shared__ float G[3][3], GGt[3][3];
int i0 = threadIdx.x/3;
int i1 = threadIdx.x - 3*i0;
if (threadIdx.x < 9)
G[i0][i1] = Linv[threadIdx.x];
__syncthreads();
if (threadIdx.x < 9)
GGt[i0][i1] = (G[0][i0]*G[0][i1] +
G[1][i0]*G[1][i1] +
G[2][i0]*G[2][i1]);
__syncthreads();
if (off < 2*N) {
// Store gradients back to global memory
// Rows are spaced 2*row_stride floats apart (presumably because row_stride
// counts complex elements -- confirm with the caller).
mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
mygrad_lapl[off+2*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
mygrad_lapl[off+4*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
// Store laplacians back to global memory
// Hessian = H00 H01 H02 H11 H12 H22
// Matrix = [0 1 2]
// [1 3 4]
// [2 4 5]
// laplacian = Trace(GGt*Hessian)
mygrad_lapl[off+6*row_stride] =
(GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
}
}
extern "C" void
eval_multi_multi_UBspline_3d_c_vgl_cuda
(const multi_UBspline_3d_c_cuda *spline, float *pos_d, float *Linv_d,
float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
if ((2*spline->num_splines) % SPLINE_BLOCK_SIZE)
dimGrid.x++;
eval_multi_multi_UBspline_3d_c_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, (float*)spline->coefs, Linv_d, (float**)vals_d,
(float**)grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_c_vgl_cuda:\n %s\n",
cudaGetErrorString(err));
abort();
}
}
/*
__global__ static void
eval_multi_multi_UBspline_3d_c_cuda (float *pos, float3 drInv,
float *coefs_real, float *coefs_imag,
float *vals[], uint3 strides)
{
int block = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+thr;
__shared__ float *myval;
__shared__ float abc[64];
// __shared__ float pos_s[SPLINE_BLOCK_SIZE];
// int ir1 = (ir >> 4)*64;
// int ir2 = (ir & 15)*4;
// pos_s[thr] = pos[ir1+thr];
// __syncthreads();
// float3 r;
// r.x = pos_s[ir2+0];
// r.y = pos_s[ir2+1];
// r.z = pos_s[ir2+2];
__shared__ float3 r;
if (thr == 0) {
r.x = pos[4*ir+0];
r.y = pos[4*ir+1];
r.z = pos[4*ir+2];
myval = vals[ir];
}
__syncthreads();
int3 index;
float3 t;
float s, sf;
float4 tp[3];
s = r.x * drInv.x;
sf = floor(s);
index.x = (int)sf;
t.x = s - sf;
s = r.y * drInv.y;
sf = floor(s);
index.y = (int)sf;
t.y = s - sf;
s = r.z * drInv.z;
sf = floor(s);
index.z = (int)sf;
t.z = s - sf;
tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
__shared__ float a[4], b[4], c[4];
if (thr < 4) {
a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
}
__syncthreads();
int i = (thr>>4)&3;
int j = (thr>>2)&3;
int k = (thr & 3);
abc[thr] = a[i]*b[j]*c[k];
__syncthreads();
float val_real = 0.0;
float val_imag = 0.0;
val_real = val_imag = 0.0;
for (int i=0; i<4; i++) {
for (int j=0; j<4; j++) {
float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
for (int k=0; k<4; k++) {
val_real += abc[16*i+4*j+k] * base_real[off+k*strides.z];
val_imag += abc[16*i+4*j+k] * base_imag[off+k*strides.z];
}
}
}
__shared__ float buff[2*SPLINE_BLOCK_SIZE];
buff[2*thr+0] = val_real;
buff[2*thr+1] = val_imag;
__syncthreads();
myval[off] = buff[thr];
myval[off+SPLINE_BLOCK_SIZE] = buff[thr+SPLINE_BLOCK_SIZE];
}
__global__ static void
eval_multi_multi_UBspline_3d_c_vgh_cuda (float *pos, float3 drInv,
float *coefs_real, float *coefs_imag,
float *vals[], float *grads[],
float *hess[], uint3 strides)
{
int block = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+thr;
__shared__ float *myval, *mygrad, *myhess;
__shared__ float3 r;
if (thr == 0) {
r.x = pos[4*ir+0];
r.y = pos[4*ir+1];
r.z = pos[4*ir+2];
myval = vals[ir];
mygrad = grads[ir];
myhess = hess[ir];
}
__syncthreads();
int3 index;
float3 t;
float s, sf;
float4 tp[3];
s = r.x * drInv.x;
sf = floor(s);
index.x = (int)sf;
t.x = s - sf;
s = r.y * drInv.y;
sf = floor(s);
index.y = (int)sf;
t.y = s - sf;
s = r.z * drInv.z;
sf = floor(s);
index.z = (int)sf;
t.z = s - sf;
tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
// First 4 of a are value, second 4 are derivative, last four are
// second derivative.
__shared__ float a[12], b[12], c[12];
if (thr < 12) {
a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
}
__syncthreads();
__shared__ float abc[640];
int i = (thr>>4)&3;
int j = (thr>>2)&3;
int k = (thr & 3);
abc[10*(16*i+4*j+k)+0] = a[i+0]*b[j+0]*c[k+0]; // val
abc[10*(16*i+4*j+k)+1] = a[i+4]*b[j+0]*c[k+0]; // d/dx
abc[10*(16*i+4*j+k)+2] = a[i+0]*b[j+4]*c[k+0]; // d/dy
abc[10*(16*i+4*j+k)+3] = a[i+0]*b[j+0]*c[k+4]; // d/dz
abc[10*(16*i+4*j+k)+4] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
abc[10*(16*i+4*j+k)+5] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
abc[10*(16*i+4*j+k)+6] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
abc[10*(16*i+4*j+k)+7] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
abc[10*(16*i+4*j+k)+8] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
abc[10*(16*i+4*j+k)+9] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
__syncthreads();
float v_r = 0.0;
float v_i = 0.0;
float g0_r=0.0, g0_i=0.0, g1_r=0.0, g1_i=0.0, g2_r=0.0, g2_i=0.0,
h00_r=0.0, h00_i=0.0, h01_r=0.0, h01_i=0.0, h02_r=0.0, h02_i=0.0,
h11_r=0.0, h11_i=0.0, h12_r=0.0, h12_i=0.0, h22_r=0.0, h22_i=0.0;
int n = 0;
for (int i=0; i<4; i++) {
for (int j=0; j<4; j++) {
float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
for (int k=0; k<4; k++) {
float cr = base_real[off+k*strides.z];
float ci = base_imag[off+k*strides.z];
v_r += abc[n+0] * cr; v_i += abc[n+0] * ci;
g0_r += abc[n+1] * cr; g0_i += abc[n+1] * ci;
g1_r += abc[n+2] * cr; g1_i += abc[n+2] * ci;
g2_r += abc[n+3] * cr; g2_i += abc[n+3] * ci;
h00_r += abc[n+4] * cr; h00_i += abc[n+4] * ci;
h01_r += abc[n+5] * cr; h01_i += abc[n+5] * ci;
h02_r += abc[n+6] * cr; h02_i += abc[n+6] * ci;
h11_r += abc[n+7] * cr; h11_i += abc[n+7] * ci;
h12_r += abc[n+8] * cr; h12_i += abc[n+8] * ci;
h22_r += abc[n+9] * cr; h22_i += abc[n+9] * ci;
n += 10;
}
}
}
g0_r *= drInv.x; g0_i *= drInv.x;
g1_r *= drInv.y; g1_i *= drInv.y;
g2_r *= drInv.z; g2_i *= drInv.z;
h00_r *= drInv.x * drInv.x; h00_i *= drInv.x * drInv.x;
h01_r *= drInv.x * drInv.y; h01_i *= drInv.x * drInv.y;
h02_r *= drInv.x * drInv.z; h02_i *= drInv.x * drInv.z;
h11_r *= drInv.y * drInv.y; h11_i *= drInv.y * drInv.y;
h12_r *= drInv.y * drInv.z; h12_i *= drInv.y * drInv.z;
h22_r *= drInv.z * drInv.z; h22_i *= drInv.z * drInv.z;
__shared__ float buff[6*SPLINE_BLOCK_SIZE];
// Note, we can reuse abc, by replacing buff with abc.
buff[2*thr+0] = v_r; buff[2*thr+1] = v_i;
__syncthreads();
myval[off] = buff[thr];
myval[off+SPLINE_BLOCK_SIZE] = buff[thr+SPLINE_BLOCK_SIZE];
buff[6*thr+0] = g0_r; buff[6*thr+1] = g0_i;
buff[6*thr+2] = g1_r; buff[6*thr+3] = g1_i;
buff[6*thr+4] = g2_r; buff[6*thr+5] = g2_i;
__syncthreads();
for (int i=0; i<6; i++)
mygrad[(6*block+i)*SPLINE_BLOCK_SIZE+thr] = buff[i*SPLINE_BLOCK_SIZE+thr];
__syncthreads();
// Write first half of Hessians
if (thr < 32) {
buff[12*thr+0] = h00_r; buff[12*thr+1] = h00_i;
buff[12*thr+2] = h01_r; buff[12*thr+3] = h01_i;
buff[12*thr+4] = h02_r; buff[12*thr+5] = h02_i;
buff[12*thr+6] = h11_r; buff[12*thr+7] = h11_i;
buff[12*thr+8] = h12_r; buff[12*thr+9] = h12_i;
buff[12*thr+10] = h22_r; buff[12*thr+11] = h22_i;
}
__syncthreads();
if (thr < 32)
for (int i=0; i<6; i++)
myhess[(12*block+i)*SPLINE_BLOCK_SIZE+thr] = buff[i*SPLINE_BLOCK_SIZE+thr];
__syncthreads();
int th2 = thr-32;
if (thr >= 32) {
buff[12*th2+0] = h00_r; buff[12*th2+1] = h00_i;
buff[12*th2+2] = h01_r; buff[12*th2+3] = h01_i;
buff[12*th2+4] = h02_r; buff[12*th2+5] = h02_i;
buff[12*th2+6] = h11_r; buff[12*th2+7] = h11_i;
buff[12*th2+8] = h12_r; buff[12*th2+9] = h12_i;
buff[12*th2+10] = h22_r; buff[12*th2+11] = h22_i;
}
__syncthreads();
if (thr >= 32) {
for (int i=0; i<6; i++)
myhess[(12*block+i+6)*SPLINE_BLOCK_SIZE+th2] = buff[i*SPLINE_BLOCK_SIZE+th2];
}
}
*/
#endif

View File

@ -0,0 +1,453 @@
#ifndef MULTI_BSPLINE_CUDA_D_IMPL_H
#define MULTI_BSPLINE_CUDA_D_IMPL_H
#include "multi_bspline.h"
#include "multi_bspline_create_cuda.h"
// Value-only evaluation of the double-precision real 3D multi-spline.
//
// Work decomposition as read from the indices below: blockIdx.y selects the
// position (three doubles in `pos`) and the matching output array in `vals`;
// blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the spline, bounded by N.
__global__ static void
eval_multi_multi_UBspline_3d_d_kernel
(double *pos, double3 drInv, const double *coefs, double *vals[],
 uint3 dim, uint3 strides, int N)
{
  int blk  = blockIdx.x;
  int tid  = threadIdx.x;
  int ipos = blockIdx.y;
  int slot = blk*SPLINE_BLOCK_SIZE+tid;

  __shared__ double *out;
  __shared__ double weight[64];
  __shared__ double3 point;

  // One thread fetches the position and the output pointer for the block.
  if (tid == 0) {
    point.x = pos[3*ipos+0];
    point.y = pos[3*ipos+1];
    point.z = pos[3*ipos+2];
    out     = vals[ipos];
  }
  __syncthreads();

  // Per axis: scale the coordinate by drInv, clamp the integer cell index
  // into [0, dim-1], and keep the offset from floor().
  int3 cell;
  double3 frac;
  double scaled, lower;

  scaled = point.x * drInv.x;
  lower  = floor(scaled);
  cell.x = min(max(0,(int)lower), dim.x-1);
  frac.x = scaled - lower;

  scaled = point.y * drInv.y;
  lower  = floor(scaled);
  cell.y = min(max(0,(int)lower), dim.y-1);
  frac.y = scaled - lower;

  scaled = point.z * drInv.z;
  lower  = floor(scaled);
  cell.z = min(max(0,(int)lower), dim.z-1);
  frac.z = scaled - lower;

  // Powers (t^3, t^2, t, 1) of each offset.
  double4 powers[3];
  powers[0].x = frac.x*frac.x*frac.x;  powers[0].y = frac.x*frac.x;  powers[0].z = frac.x;  powers[0].w = 1.0;
  powers[1].x = frac.y*frac.y*frac.y;  powers[1].y = frac.y*frac.y;  powers[1].z = frac.y;  powers[1].w = 1.0;
  powers[2].x = frac.z*frac.z*frac.z;  powers[2].y = frac.z*frac.z;  powers[2].z = frac.z;  powers[2].w = 1.0;

  // Four 1D basis values per axis, built from the first 16 entries of Bcuda.
  __shared__ double wx[4], wy[4], wz[4];
  if (tid < 4) {
    wx[tid] = Bcuda[4*tid+0]*powers[0].x + Bcuda[4*tid+1]*powers[0].y + Bcuda[4*tid+2]*powers[0].z + Bcuda[4*tid+3]*powers[0].w;
    wy[tid] = Bcuda[4*tid+0]*powers[1].x + Bcuda[4*tid+1]*powers[1].y + Bcuda[4*tid+2]*powers[1].z + Bcuda[4*tid+3]*powers[1].w;
    wz[tid] = Bcuda[4*tid+0]*powers[2].x + Bcuda[4*tid+1]*powers[2].y + Bcuda[4*tid+2]*powers[2].z + Bcuda[4*tid+3]*powers[2].w;
  }
  __syncthreads();

  // 4x4x4 tensor product of the 1D basis values; entry 16*i+4*j+k.
  if (tid < 64) {
    int ia = (tid>>4)&3;
    int ib = (tid>>2)&3;
    int ic = (tid & 3);
    weight[tid] = wx[ia]*wy[ib]*wz[ic];
  }
  __syncthreads();

  // No barrier follows, so out-of-range slots can leave here.
  if (slot >= N)
    return;

  // Contract the 64 weights with the 4x4x4 coefficient neighbourhood.
  double sum = 0.0;
  int w = 0;
  for (int di=0; di<4; di++) {
    for (int dj=0; dj<4; dj++) {
      const double *row = coefs + (cell.x+di)*strides.x + (cell.y+dj)*strides.y + cell.z*strides.z;
      for (int dk=0; dk<4; dk++, w++)
        sum += weight[w] * row[slot+dk*strides.z];
    }
  }
  out[slot] = sum;
}
// Value, gradient and Hessian evaluation of the double-precision real 3D
// multi-spline.
//
// Work decomposition as read from the indices below: blockIdx.y selects the
// position (pos[3*ir .. 3*ir+2]) and the per-position output arrays vals[ir],
// grads[ir], hess[ir]; blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the
// spline `off`, bounded by N.
//
// Output layout produced by the staged writes at the end:
//   vals [off]         value
//   grads[3*off + c]   c = 0..2 : d/dx, d/dy, d/dz (scaled by drInv)
//   hess [6*off + c]   c = 0..5 : xx, xy, xz, yy, yz, zz
//
// The shared array abc (640 doubles) first holds the ten 4x4x4 basis-product
// tables and is then recycled as a transpose buffer, so the staging assumes
// 6*SPLINE_BLOCK_SIZE <= 640.
__global__ static void
eval_multi_multi_UBspline_3d_d_vgh_kernel
(double *pos, double3 drInv, const double *coefs,
 double *vals[], double *grads[], double *hess[],
 uint3 dim, uint3 strides, int N)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
  __shared__ double *myval, *mygrad, *myhess;
  __shared__ double3 r;
  // One thread fetches the position and output pointers for the block.
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval  = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  // Per axis: scale by drInv, clamp the cell index into [0, dim-1] and keep
  // the offset from floor().
  int3 index;
  double3 t;
  double s, sf;
  double4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = min(max(0,(int)sf), dim.x-1);
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = min(max(0,(int)sf), dim.y-1);
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = min(max(0,(int)sf), dim.z-1);
  t.z = s - sf;
  tp[0].x =t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
  tp[1].x =t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
  tp[2].x =t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.
  __shared__ double a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten 64-entry tables of basis products, one per output quantity.  Each
  // thread fills entry 16*i+4*j+k of every table, with (i,j,k) taken from the
  // low six bits of its thread index.
  __shared__ double abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  double v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  int n = 0;
  const double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < N) {
    for (int i=0; i<4; i++) {
      for (int j=0; j<4; j++) {
        const double *base = b0 + i*strides.x + j*strides.y;
        for (int k=0; k<4; k++) {
          // Named `coef` so it no longer shadows the shared basis array c.
          double coef = base[k*strides.z];
          v   += abc[n+0]   * coef;
          g0  += abc[n+64]  * coef;
          g1  += abc[n+128] * coef;
          g2  += abc[n+192] * coef;
          h00 += abc[n+256] * coef;
          h01 += abc[n+320] * coef;
          h02 += abc[n+384] * coef;
          h11 += abc[n+448] * coef;
          h12 += abc[n+512] * coef;
          h22 += abc[n+576] * coef;
          n += 1;
        }
      }
    }
    // Convert derivatives from grid units using the inverse spacing.
    g0 *= drInv.x;
    g1 *= drInv.y;
    g2 *= drInv.z;
    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;
    myval[off] = v;
  }

  // abc is recycled as a staging buffer below.  Every thread must have
  // finished reading the basis products first, otherwise a faster warp would
  // overwrite entries that a slower warp is still accumulating from.
  __syncthreads();

  // Stage gradients as 3 doubles per thread, then write them out so that
  // consecutive threads store consecutive addresses.
  abc[3*thr+0] = g0;
  abc[3*thr+1] = g1;
  abc[3*thr+2] = g2;
  __syncthreads();
  for (int i=0; i<3; i++) {
    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 3*N)
      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
  __syncthreads();

  // Write Hessians: same staging scheme with 6 doubles per thread.
  abc[6*thr+0] = h00;
  abc[6*thr+1] = h01;
  abc[6*thr+2] = h02;
  abc[6*thr+3] = h11;
  abc[6*thr+4] = h12;
  abc[6*thr+5] = h22;
  __syncthreads();
  for (int i=0; i<6; i++) {
    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 6*N)
      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
}
extern "C" void
eval_multi_multi_UBspline_3d_d_cuda (const multi_UBspline_3d_d_cuda *spline,
double *pos_d, double *vals_d[], int num)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
eval_multi_multi_UBspline_3d_d_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, spline->coefs, vals_d, spline->dim, spline->stride, spline->num_splines);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_d_cuda:\n %s\n",
cudaGetErrorString(err));
abort();
}
}
extern "C" void
eval_multi_multi_UBspline_3d_d_vgh_cuda (const multi_UBspline_3d_d_cuda *spline,
double *pos_d, double *vals_d[], double *grads_d[],
double *hess_d[], int num)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
eval_multi_multi_UBspline_3d_d_vgh_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, hess_d,
spline->dim, spline->stride, spline->num_splines);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_d_vgh_cuda:\n %s\n",
cudaGetErrorString(err));
abort();
}
}
// Value, gradient and Laplacian evaluation of the double-precision real 3D
// multi-spline.
//
// blockIdx.y selects the position (pos[3*ir .. 3*ir+2]) and the output arrays
// vals[ir] / grad_lapl[ir]; blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects
// the spline `off`, bounded by N.
//
// Linv supplies nine doubles read as a row-major 3x3 matrix G.  Gradients are
// multiplied by G before being stored, and the Laplacian is formed by
// contracting the symmetric Hessian with GGt (see the comments near the end).
//
// Output rows in grad_lapl[ir], each row being `row_stride` apart:
//   off + 0..2 * row_stride : gradient components
//   off + 3    * row_stride : Laplacian
__global__ static void
eval_multi_multi_UBspline_3d_d_vgl_kernel
(double *pos, double3 drInv, double *coefs, double Linv[],
double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
int N, int row_stride)
{
int block = blockIdx.x;
int thr = threadIdx.x;
int ir = blockIdx.y;
int off = block*SPLINE_BLOCK_SIZE+threadIdx.x;
__shared__ double *myval, *mygrad_lapl;
__shared__ double3 r;
// One thread fetches the position and output pointers for the whole block.
if (thr == 0) {
r.x = pos[3*ir+0];
r.y = pos[3*ir+1];
r.z = pos[3*ir+2];
myval = vals[ir];
mygrad_lapl = grad_lapl[ir];
}
__syncthreads();
// Per axis: scale by drInv, clamp the cell index into [0, dim-1] and keep the
// offset from floor() in t.
int3 index;
double3 t;
double s, sf;
double4 tp[3];
s = r.x * drInv.x;
sf = floor(s);
index.x = min(max(0,(int)sf), dim.x-1);
t.x = s - sf;
s = r.y * drInv.y;
sf = floor(s);
index.y = min(max(0,(int)sf), dim.y-1);
t.y = s - sf;
s = r.z * drInv.z;
sf = floor(s);
index.z = min(max(0,(int)sf), dim.z-1);
t.z = s - sf;
// Powers (t^3, t^2, t, 1) of each offset.
tp[0].x =t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
tp[1].x =t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
tp[2].x =t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;
// First 4 of a are value, second 4 are derivative, last four are
// second derivative.
__shared__ double a[12], b[12], c[12];
if (thr < 12) {
a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
}
__syncthreads();
// Ten 64-entry tables of basis products, one per output quantity.  Each thread
// fills entry 16*i+4*j+k of every table, with (i,j,k) taken from the low six
// bits of its thread index.
__shared__ double abc[640];
int i = (thr>>4)&3;
int j = (thr>>2)&3;
int k = (thr & 3);
abc[(16*i+4*j+k)+0] = a[i+0]*b[j+0]*c[k+0]; // val
abc[(16*i+4*j+k)+64] = a[i+4]*b[j+0]*c[k+0]; // d/dx
abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
__syncthreads();
double v = 0.0, g0=0.0, g1=0.0, g2=0.0,
h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
int n = 0;
// b0 points at this spline's coefficient in the corner of the 4x4x4 neighbourhood.
double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
if (off < N) {
for (int i=0; i<4; i++) {
for (int j=0; j<4; j++) {
double *base = b0 + i*strides.x + j*strides.y;
for (int k=0; k<4; k++) {
// Note: this local `c` shadows the shared basis array c inside the loop.
double c = base[k*strides.z];
v += abc[n+ 0] * c;
g0 += abc[n+ 64] * c;
g1 += abc[n+128] * c;
g2 += abc[n+192] * c;
h00 += abc[n+256] * c;
h01 += abc[n+320] * c;
h02 += abc[n+384] * c;
h11 += abc[n+448] * c;
h12 += abc[n+512] * c;
h22 += abc[n+576] * c;
n += 1;
}
}
}
// Convert derivatives from grid units using the inverse spacing.
g0 *= drInv.x;
g1 *= drInv.y;
g2 *= drInv.z;
h00 *= drInv.x * drInv.x;
h01 *= drInv.x * drInv.y;
h02 *= drInv.x * drInv.z;
h11 *= drInv.y * drInv.y;
h12 *= drInv.y * drInv.z;
h22 *= drInv.z * drInv.z;
// __shared__ double buff[6*SPLINE_BLOCK_SIZE];
// Note, we can reuse abc, by replacing buff with abc.
myval[off] = v;
}
// Threads 0..8 load G row-major from Linv (G[i0][i1] = Linv[3*i0+i1]) and then
// form GGt[i0][i1] = sum over m of G[m][i0]*G[m][i1].
__shared__ double G[3][3], GGt[3][3];
int i0 = threadIdx.x/3;
int i1 = threadIdx.x - 3*i0;
if (threadIdx.x < 9)
G[i0][i1] = Linv[threadIdx.x];
__syncthreads();
if (threadIdx.x < 9)
GGt[i0][i1] = (G[0][i0]*G[0][i1] +
G[1][i0]*G[1][i1] +
G[2][i0]*G[2][i1]);
__syncthreads();
if (off < N) {
// Store gradients back to global memory
mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
mygrad_lapl[off+1*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
mygrad_lapl[off+2*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
// Store laplacians back to global memory
// Hessian = H00 H01 H02 H11 H12 H22
// Matrix = [0 1 2]
// [1 3 4]
// [2 4 5]
// laplacian = Trace(GGt*Hessian)
mygrad_lapl[off+3*row_stride] =
(GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
}
}
extern "C" void
eval_multi_multi_UBspline_3d_d_vgl_cuda
(const multi_UBspline_3d_d_cuda *spline, double *pos_d, double *Linv_d,
double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
{
dim3 dimBlock(SPLINE_BLOCK_SIZE);
dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
if (spline->num_splines % SPLINE_BLOCK_SIZE)
dimGrid.x++;
eval_multi_multi_UBspline_3d_d_vgl_kernel<<<dimGrid,dimBlock>>>
(pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_d_vgl_cuda:\n %s\n",
cudaGetErrorString(err));
abort();
}
}
#endif

View File

@ -0,0 +1,594 @@
#include "multi_bspline.h"
#include "multi_bspline_create_cuda.h"
#ifndef NO_CUDA_MAIN
__constant__ float Acuda[48];
#endif
// typedef struct
// {
// float *coefs;
// uint3 stride;
// float3 gridInv;
// int num_splines;
// } multi_UBspline_3d_s_cuda;
#ifndef NO_CUDA_MAIN
// Build the device-side copy of a single-precision 3D multi-spline.
//
// Uploads the 48-entry basis matrix to the constant-memory symbol Acuda,
// then repacks the host coefficients so that the per-point spline count is
// padded up to a multiple of SPLINE_BLOCK_SIZE and copies them to the GPU.
//
// Returns a malloc'ed descriptor whose `coefs` member points to device
// memory, or NULL if a host or device allocation fails.
multi_UBspline_3d_s_cuda*
create_multi_UBspline_3d_s_cuda (multi_UBspline_3d_s* spline)
{
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  // Allocate the whole descriptor; sizeof(pointer) here would let the
  // member writes below run past the end of the allocation.
  multi_UBspline_3d_s_cuda *cuda_spline =
    (multi_UBspline_3d_s_cuda*) malloc (sizeof (multi_UBspline_3d_s_cuda));
  if (cuda_spline == NULL)
    return NULL;

  cuda_spline->num_splines = spline->num_splines;
  int Nx = spline->x_grid.num+3;
  int Ny = spline->y_grid.num+3;
  int Nz = spline->z_grid.num+3;

  // Pad the spline count to a whole number of thread blocks so that every
  // thread of the last block reads inside the coefficient array.
  int N = spline->num_splines;
  if ((N%SPLINE_BLOCK_SIZE) != 0)
    N += SPLINE_BLOCK_SIZE - (N%SPLINE_BLOCK_SIZE);

  cuda_spline->stride.x = Ny*Nz*N;
  cuda_spline->stride.y = Nz*N;
  cuda_spline->stride.z = N;
  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
  cuda_spline->gridInv.z = spline->z_grid.delta_inv;

  // Do the size arithmetic in size_t so large grids cannot overflow int.
  const size_t count = (size_t)Nx * (size_t)Ny * (size_t)Nz * (size_t)N;
  const size_t size  = count * sizeof(float);

  if (cudaMalloc((void**)&(cuda_spline->coefs), size) != cudaSuccess) {
    fprintf (stderr, "create_multi_UBspline_3d_s_cuda: cudaMalloc of %lu bytes failed.\n",
             (unsigned long)size);
    free(cuda_spline);
    return NULL;
  }

  // calloc so the padding entries are well-defined zeros.
  float *spline_buff = (float*)calloc(count, sizeof(float));
  if (spline_buff == NULL) {
    cudaFree(cuda_spline->coefs);
    free(cuda_spline);
    return NULL;
  }

  const size_t sx = cuda_spline->stride.x;
  const size_t sy = cuda_spline->stride.y;
  const size_t sz = cuda_spline->stride.z;
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++)
        for (int isp=0; isp<spline->num_splines; isp++) {
          spline_buff[ix*sx + iy*sy + iz*sz + isp] =
            spline->coefs[ix*spline->x_stride +
                          iy*spline->y_stride +
                          iz*spline->z_stride + isp];
        }

  // cudaMemcpy from pageable host memory returns only after the source
  // buffer has been consumed, so the staging buffer can be released.
  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
  free(spline_buff);
  return cuda_spline;
}
#endif
// Value-only evaluation of many splines at many positions.
//
// Launch layout: blockIdx.y selects the position (3 consecutive floats in
// `pos`), blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the spline.
// The result for spline `off` at position `ir` is written to vals[ir][off].
// There is no bounds check on `off`, so the caller must size the grid and
// the output/coefficient arrays accordingly.
__global__ static void
eval_multi_multi_UBspline_3d_s_kernel
(float *pos, float3 drInv, const float *coefs, float *vals[], uint3 strides)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+thr;

  __shared__ float *myval;
  __shared__ float abc[64];
  __shared__ float3 r;
  // Thread 0 fetches the position and output pointer once per block.
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval = vals[ir];
  }
  __syncthreads();

  // Split the scaled coordinate into a cell index and a fractional part.
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = (int)sf;
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = (int)sf;
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = (int)sf;
  t.z = s - sf;

  // Powers (t^3, t^2, t, 1) for each axis.
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // 1D basis weights: rows 0..3 of Acuda applied to the power vectors.
  __shared__ float a[4], b[4], c[4];
  if (thr < 4) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // 4x4x4 tensor-product weights, one entry per thread for thr < 64.
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  if (thr < 64)
    abc[thr] = a[i]*b[j]*c[k];
  __syncthreads();

  float val = 0.0;
  for (int ii=0; ii<4; ii++) {
    for (int jj=0; jj<4; jj++) {
      // coefs is read-only, so the row pointer must be const as well.
      const float *base = coefs + (index.x+ii)*strides.x + (index.y+jj)*strides.y + index.z*strides.z;
      for (int kk=0; kk<4; kk++)
        val += abc[16*ii+4*jj+kk] * base[off+kk*strides.z];
    }
  }
  myval[off] = val;
}
// Value, gradient and Hessian evaluation of many splines at many positions.
//
// Launch layout: blockIdx.y selects the position (3 consecutive floats in
// `pos`), blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the spline.
// Outputs for position `ir`:
//   vals[ir][off]              value
//   grads[ir][3*off + 0..2]    gradient (x, y, z)
//   hess[ir][6*off + 0..5]     Hessian (xx, xy, xz, yy, yz, zz)
// There is no bounds check on `off`; the caller must size the grid and the
// arrays accordingly.
__global__ static void
eval_multi_multi_UBspline_3d_s_vgh_kernel
(float *pos, float3 drInv, const float *coefs,
 float *vals[], float *grads[], float *hess[], uint3 strides)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  __shared__ float *myval, *mygrad, *myhess;
  __shared__ float3 r;
  // Thread 0 fetches the position and output pointers once per block.
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval  = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  // Split the scaled coordinate into a cell index and a fractional part.
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = (int)sf;
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = (int)sf;
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = (int)sf;
  t.z = s - sf;
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.  The last column of Acuda multiplies the constant
  // term (tp[].w), not the linear one.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten planes of 64 tensor-product weights: plane m starts at abc[64*m].
  __shared__ float abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  for (int ii=0; ii<4; ii++) {
    for (int jj=0; jj<4; jj++) {
      const float *base = b0 + ii*strides.x + jj*strides.y;
      for (int kk=0; kk<4; kk++) {
        const float cf = base[kk*strides.z];
        // Index within a plane; must match the layout used to fill abc.
        const int n = 16*ii + 4*jj + kk;
        v   += abc[n+  0] * cf;
        g0  += abc[n+ 64] * cf;
        g1  += abc[n+128] * cf;
        g2  += abc[n+192] * cf;
        h00 += abc[n+256] * cf;
        h01 += abc[n+320] * cf;
        h02 += abc[n+384] * cf;
        h11 += abc[n+448] * cf;
        h12 += abc[n+512] * cf;
        h22 += abc[n+576] * cf;
      }
    }
  }

  // Convert derivatives from grid units to position units.
  g0 *= drInv.x;
  g1 *= drInv.y;
  g2 *= drInv.z;
  h00 *= drInv.x * drInv.x;
  h01 *= drInv.x * drInv.y;
  h02 *= drInv.x * drInv.z;
  h11 *= drInv.y * drInv.y;
  h12 *= drInv.y * drInv.z;
  h22 *= drInv.z * drInv.z;

  myval[off] = v;

  // abc is reused below as a transpose buffer.  Every thread must have
  // finished reading the weights before any thread overwrites them.
  __syncthreads();
  abc[3*thr+0] = g0;
  abc[3*thr+1] = g1;
  abc[3*thr+2] = g2;
  __syncthreads();
  // Consecutive threads store consecutive addresses.
  for (int m=0; m<3; m++)
    mygrad[(3*block+m)*SPLINE_BLOCK_SIZE+thr] = abc[m*SPLINE_BLOCK_SIZE+thr];
  __syncthreads();

  // Write the six independent Hessian components the same way.
  abc[6*thr+0] = h00;
  abc[6*thr+1] = h01;
  abc[6*thr+2] = h02;
  abc[6*thr+3] = h11;
  abc[6*thr+4] = h12;
  abc[6*thr+5] = h22;
  __syncthreads();
  for (int m=0; m<6; m++)
    myhess[(6*block+m)*SPLINE_BLOCK_SIZE+thr] = abc[m*SPLINE_BLOCK_SIZE+thr];
}
// Host-side launcher for the value-only kernel of this file.
// `num` positions map to the y extent of the grid; the x extent is the
// integer quotient num_splines / SPLINE_BLOCK_SIZE, so splines beyond the
// last full block are not evaluated.  The launch is not synchronised and
// no error check is performed.
extern "C" void
eval_multi_multi_UBspline_3d_s_cuda (const multi_UBspline_3d_s_cuda *spline,
                                     float *pos_d, float *vals_d[], int num)
{
  const dim3 threadsPerBlock(SPLINE_BLOCK_SIZE);
  const dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);

  eval_multi_multi_UBspline_3d_s_kernel<<<blocks,threadsPerBlock>>>
    (pos_d, spline->gridInv, spline->coefs, vals_d, spline->stride);
}
// Correctness spot-check and throughput benchmark that goes through
// create_multi_UBspline_3d_s_cuda.
//
// 1. Builds a periodic 32^3 multi-spline with 128 splines from random data.
// 2. Evaluates 1000 random positions on the GPU and prints, per position,
//    the first spline value from the host evaluator next to the GPU value.
// 3. Times 10000 launches of the value kernel and of the VGH kernel.
void
test_multi_cuda2()
{
  // Compile-time constants so the arrays below are ordinary arrays rather
  // than variable-length arrays.
  const int numWalkers = 1000;
  const int N = 128;
  const int Nx = 32, Ny = 32, Nz = 32;
  const int xs = Ny*Nz*N;
  const int ys = Nz*N;
  const int zs = N;

  // Host arrays of device addresses, and their device-resident copies.
  float *vals[numWalkers], *grads[numWalkers], *hess[numWalkers];
  float **vals_d, **grads_d, **hess_d;
  float *coefs = NULL;
  float *r_d, *r_h;

  // Setup Bspline coefficients
  const size_t size = (size_t)Nx*Ny*Nz*N*sizeof(float);
  if (posix_memalign((void**)&coefs, 16, size) != 0) {
    fprintf (stderr, "test_multi_cuda2: could not allocate %lu bytes.\n",
             (unsigned long)size);
    return;
  }
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++)
        for (int n=0; n<N; n++)
          coefs[ix*xs + iy*ys + iz*zs + n] = drand48();

  Ugrid x_grid, y_grid, z_grid;
  x_grid.start = 0.0; x_grid.end = 1.0; x_grid.num = Nx;
  y_grid.start = 0.0; y_grid.end = 1.0; y_grid.num = Ny;
  z_grid.start = 0.0; z_grid.end = 1.0; z_grid.num = Nz;
  BCtype_s xBC, yBC, zBC;
  xBC.lCode = xBC.rCode = PERIODIC;
  yBC.lCode = yBC.rCode = PERIODIC;
  zBC.lCode = zBC.rCode = PERIODIC;
  multi_UBspline_3d_s *spline =
    create_multi_UBspline_3d_s (x_grid, y_grid, z_grid, xBC, yBC, zBC, N);
  for (int i=0; i<N; i++)
    set_multi_UBspline_3d_s (spline, i, coefs);
  multi_UBspline_3d_s_cuda *cudaspline =
    create_multi_UBspline_3d_s_cuda (spline);

  // Setup device value storage: one block holding values (N per walker),
  // gradients (3N per walker) and Hessians (6N per walker).
  const int numVals = N*numWalkers*10;
  float *valBlock_d, *valBlock_h;
  cudaMalloc((void**)&(valBlock_d), numVals*sizeof(float));
  cudaMallocHost((void**)&(valBlock_h), numVals*sizeof(float));
  cudaMalloc((void**)&(vals_d),  numWalkers*sizeof(float*));
  cudaMalloc((void**)&(grads_d), numWalkers*sizeof(float*));
  cudaMalloc((void**)&(hess_d),  numWalkers*sizeof(float*));
  fprintf (stderr, "valBlock_d = %p\n", valBlock_d);
  for (int i=0; i<numWalkers; i++) {
    vals[i]  = valBlock_d + i*N;
    grads[i] = valBlock_d + N*numWalkers + 3*i*N;
    hess[i]  = valBlock_d + 4*N*numWalkers + 6*i*N;
  }
  cudaMemcpy(vals_d,  vals,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(grads_d, grads, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(hess_d,  hess,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  fprintf (stderr, "Finished cuda allocations.\n");

  // Setup walker positions (3 floats per walker, as the kernels expect)
  cudaMalloc((void**)&(r_d),     3*numWalkers*sizeof(float));
  cudaMallocHost((void**)&(r_h), 3*numWalkers*sizeof(float));
  for (int ir=0; ir<numWalkers; ir++) {
    r_h[3*ir+0] = 0.5*drand48();
    r_h[3*ir+1] = 0.5*drand48();
    r_h[3*ir+2] = 0.5*drand48();
  }
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(N/SPLINE_BLOCK_SIZE,numWalkers);
  float vals_host[N], vals_cuda[N];

  // Check value.  One launch evaluates every walker, so the upload and the
  // launch are done once; the loop only compares results.
  cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
  eval_multi_multi_UBspline_3d_s_kernel<<<dimGrid,dimBlock>>>
    (r_d, cudaspline->gridInv, cudaspline->coefs, vals_d, cudaspline->stride);
  for (int w=0; w<numWalkers; w++) {
    eval_multi_UBspline_3d_s (spline, r_h[3*w+0], r_h[3*w+1], r_h[3*w+2], vals_host);
    cudaMemcpy(vals_cuda, valBlock_d+(N*w), N*sizeof(float), cudaMemcpyDeviceToHost);
    fprintf (stderr, "%3i %15.8e %15.8e\n", w, vals_host[0], vals_cuda[0]);
  }

  clock_t start, end;
  start = clock();
  for (int i=0; i<10000; i++) {
    if ((i%1000) == 0)
      fprintf (stderr, "i = %d\n", i);
    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
    eval_multi_multi_UBspline_3d_s_kernel<<<dimGrid,dimBlock>>>
      (r_d, cudaspline->gridInv, cudaspline->coefs, vals_d, cudaspline->stride);
  }
  // Launches are asynchronous; wait for the last one before stopping the clock.
  cudaThreadSynchronize();
  end = clock();
  double secsPerEval = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/secsPerEval);

  start = clock();
  for (int i=0; i<10000; i++) {
    if ((i%1000) == 0)
      fprintf (stderr, "i = %d\n", i);
    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
    eval_multi_multi_UBspline_3d_s_vgh_kernel<<<dimGrid,dimBlock>>>
      (r_d, cudaspline->gridInv, cudaspline->coefs, vals_d, grads_d, hess_d, cudaspline->stride);
  }
  cudaThreadSynchronize();
  end = clock();
  secsPerEval = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
  fprintf (stderr, "VGH Evals per second = %1.8e\n", 1.0/secsPerEval);

  // Release device memory.  The device coefficients belong to cudaspline;
  // spline->coefs is the host spline's array and must not go to cudaFree.
  cudaFree (cudaspline->coefs);
  cudaFree (valBlock_d);
  cudaFree (vals_d);
  cudaFree (grads_d);
  cudaFree (hess_d);
  cudaFree (r_d);
  // Release pinned and ordinary host memory allocated in this function.
  cudaFreeHost (valBlock_h);
  cudaFreeHost (r_h);
  free (cudaspline);
  free (coefs);
}
// Stand-alone throughput benchmark with a pthread-style signature.
//
// `thread` carries the CUDA device ordinal (cast from an integer).  The
// function uploads the basis matrix and a random 32^3 x 128 coefficient
// table directly (without building a spline object), then times 10000
// launches of the value kernel and of the VGH kernel.  Always returns NULL.
static void *
test_multi_cuda(void *thread)
{
  cudaSetDevice((int)(size_t)thread);
  fprintf (stderr, "In thread %p\n", thread);

  const int numWalkers = 1000;
  const int N = 128;
  const int Nx = 32, Ny = 32, Nz = 32;
  const int xs = Ny*Nz*N;
  const int ys = Nz*N;
  const int zs = N;

  // Host arrays of device addresses, and their device-resident copies.
  float *vals[numWalkers], *grads[numWalkers], *hess[numWalkers];
  float **vals_d, **grads_d, **hess_d;
  float *coefs = NULL, *coefs_d;
  float A_h[48] = { -1.0/6.0, 3.0/6.0, -3.0/6.0, 1.0/6.0,
                    3.0/6.0, -6.0/6.0, 0.0/6.0, 4.0/6.0,
                    -3.0/6.0, 3.0/6.0, 3.0/6.0, 1.0/6.0,
                    1.0/6.0, 0.0/6.0, 0.0/6.0, 0.0/6.0,
                    0.0, -0.5, 1.0, -0.5,
                    0.0, 1.5, -2.0, 0.0,
                    0.0, -1.5, 1.0, 0.5,
                    0.0, 0.5, 0.0, 0.0,
                    0.0, 0.0, -1.0, 1.0,
                    0.0, 0.0, 3.0, -2.0,
                    0.0, 0.0, -3.0, 1.0,
                    0.0, 0.0, 1.0, 0.0 };
  // Copy A to the device.  Acuda is a __constant__ symbol, so it has to be
  // written with cudaMemcpyToSymbol rather than a plain cudaMemcpy.
  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);

  float *r_d, *r_h;
  // NOTE(review): 1/Nx looks like a grid spacing rather than an inverse
  // spacing; with positions in [0, 0.5) every walker lands in cell 0.
  // Left unchanged -- confirm whether this is intended for the benchmark.
  float3 drInv;
  drInv.x = 1.0/float(Nx);
  drInv.y = 1.0/float(Ny);
  drInv.z = 1.0/float(Nz);

  // Setup Bspline coefficients
  const int size = Nx*Ny*Nz*N*sizeof(float);
  if (posix_memalign((void**)&coefs, 16, size) != 0) {
    fprintf (stderr, "test_multi_cuda: could not allocate %d bytes.\n", size);
    return NULL;
  }
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++)
        for (int n=0; n<N; n++)
          coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
  fprintf (stderr, "Filled in coefs.\n");
  fprintf (stderr, "size = %d\n", size);

  // Setup CUDA coefficients (allocation is twice the table size, as in the
  // original benchmark).
  cudaMalloc((void**)&coefs_d, 2*size);
  cudaMemcpy(coefs_d, coefs, size, cudaMemcpyHostToDevice);

  // Setup device value storage
  const int numVals = N*numWalkers*10;
  float *valBlock_d, *valBlock_h;
  cudaMalloc((void**)&(valBlock_d), numVals*sizeof(float));
  cudaMallocHost((void**)&(valBlock_h), numVals*sizeof(float));
  cudaMalloc((void**)&(vals_d),  numWalkers*sizeof(float*));
  cudaMalloc((void**)&(grads_d), numWalkers*sizeof(float*));
  cudaMalloc((void**)&(hess_d),  numWalkers*sizeof(float*));
  fprintf (stderr, "valBlock_d = %p\n", valBlock_d);
  for (int i=0; i<numWalkers; i++) {
    vals[i]  = valBlock_d + i*N;
    grads[i] = valBlock_d + N*numWalkers + 3*i*N;
    hess[i]  = valBlock_d + 4*N*numWalkers + 6*i*N;
  }
  cudaMemcpy(vals_d,  vals,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(grads_d, grads, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(hess_d,  hess,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
  fprintf (stderr, "Finished cuda allocations.\n");

  // Setup walker positions.  The kernels in this file read pos[3*ir+0..2],
  // so the positions are packed three floats per walker.
  cudaMalloc((void**)&(r_d),     3*numWalkers*sizeof(float));
  cudaMallocHost((void**)&(r_h), 3*numWalkers*sizeof(float));
  for (int ir=0; ir<numWalkers; ir++) {
    r_h[3*ir+0] = 0.5*drand48();
    r_h[3*ir+1] = 0.5*drand48();
    r_h[3*ir+2] = 0.5*drand48();
  }
  uint3 strides;
  strides.x = xs;
  strides.y = ys;
  strides.z = zs;
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(N/SPLINE_BLOCK_SIZE,numWalkers);

  clock_t start, end;
  start = clock();
  for (int i=0; i<10000; i++) {
    if ((i%1000) == 0)
      fprintf (stderr, "i = %d\n", i);
    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
    eval_multi_multi_UBspline_3d_s_kernel<<<dimGrid,dimBlock>>>
      (r_d, drInv, coefs_d, vals_d, strides);
  }
  // Launches are asynchronous; wait for the last one before stopping the clock.
  cudaThreadSynchronize();
  end = clock();
  double secsPerEval = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
  // This loop ran the value-only kernel.
  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/secsPerEval);

  start = clock();
  for (int i=0; i<10000; i++) {
    if ((i%1000) == 0)
      fprintf (stderr, "i = %d\n", i);
    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
    eval_multi_multi_UBspline_3d_s_vgh_kernel<<<dimGrid,dimBlock>>>
      (r_d, drInv, coefs_d, vals_d, grads_d, hess_d, strides);
  }
  cudaThreadSynchronize();
  end = clock();
  secsPerEval = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
  // This loop ran the value/gradient/Hessian kernel.
  fprintf (stderr, "VGH evals per second = %1.8e\n", 1.0/secsPerEval);

  // Release everything allocated above.
  cudaFree (valBlock_d);
  cudaFree (vals_d);
  cudaFree (grads_d);
  cudaFree (hess_d);
  cudaFree (coefs_d);
  cudaFree (r_d);
  cudaFreeHost (valBlock_h);
  cudaFreeHost (r_h);
  free (coefs);
  return NULL;
}
#ifndef NO_CUDA_MAIN
// Test driver: lists the properties of every CUDA device that is present
// and then runs test_multi_cuda2().
int
main()
{
  int deviceCount;
  cudaGetDeviceCount(&deviceCount);
  fprintf (stderr, "Detected %d CUDA devices.\n", deviceCount);
  // test_cuda();
  for (int device = 0; device < deviceCount; ++device) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, device);
    fprintf (stderr, "Device %d:\n", device);
    // The memory-size fields are size_t; print them through unsigned long
    // instead of %d, which is undefined for a 64-bit size_t.
    fprintf (stderr, " Global memory: %10lu\n",
             (unsigned long)deviceProp.totalGlobalMem);
    fprintf (stderr, " MultiProcessors: %10d\n",
             deviceProp.multiProcessorCount);
    fprintf (stderr, " Registers: %10d\n",
             deviceProp.regsPerBlock);
    fprintf (stderr, " Constant memory: %10lu\n",
             (unsigned long)deviceProp.totalConstMem);
    fprintf (stderr, " Shared memory: %10lu\n",
             (unsigned long)deviceProp.sharedMemPerBlock);
    fprintf (stderr, " Clock rate: %10d\n",
             deviceProp.clockRate);
  }
  // test_multi_cuda((void*)0);
  test_multi_cuda2();
  fprintf (stderr, "After frees.\n");
  return 0;
}
#endif

View File

@ -0,0 +1,216 @@
#ifndef MULTI_BSPLINE_CUDA_S_H
#define MULTI_BSPLINE_CUDA_S_H
#include "multi_bspline_structs_cuda.h"
// Value-only evaluation kernel (header variant).
//
// Launch layout: blockIdx.y selects the position, stored with a stride of
// four floats in `pos` (only the first three are read);
// blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the spline.  The
// result is written to vals[ir][off].  The basis matrix is read from `A`,
// which is expected to be declared by an included header.  There is no
// bounds check on `off`.
__global__ static void
eval_multi_multi_UBspline_3d_s_cuda (float *pos, float3 drInv,
                                     const float *coefs, float *vals[], uint3 strides)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+thr;

  __shared__ float *myval;
  __shared__ float abc[64];
  __shared__ float3 r;
  // Thread 0 fetches the position and output pointer once per block.
  if (thr == 0) {
    r.x = pos[4*ir+0];
    r.y = pos[4*ir+1];
    r.z = pos[4*ir+2];
    myval = vals[ir];
  }
  __syncthreads();

  // Split the scaled coordinate into a cell index and a fractional part.
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = (int)sf;
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = (int)sf;
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = (int)sf;
  t.z = s - sf;
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // 1D basis weights: rows 0..3 of A applied to the power vectors.
  __shared__ float a[4], b[4], c[4];
  if (thr < 4) {
    a[thr] = A[4*thr+0]*tp[0].x + A[4*thr+1]*tp[0].y + A[4*thr+2]*tp[0].z + A[4*thr+3]*tp[0].w;
    b[thr] = A[4*thr+0]*tp[1].x + A[4*thr+1]*tp[1].y + A[4*thr+2]*tp[1].z + A[4*thr+3]*tp[1].w;
    c[thr] = A[4*thr+0]*tp[2].x + A[4*thr+1]*tp[2].y + A[4*thr+2]*tp[2].z + A[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // 4x4x4 tensor-product weights, one entry per thread for thr < 64.
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  if (thr < 64)
    abc[thr] = a[i]*b[j]*c[k];
  __syncthreads();

  float val = 0.0;
  for (int ii=0; ii<4; ii++) {
    for (int jj=0; jj<4; jj++) {
      // coefs is read-only, so the row pointer must be const as well.
      const float *base = coefs + (index.x+ii)*strides.x + (index.y+jj)*strides.y + index.z*strides.z;
      for (int kk=0; kk<4; kk++)
        val += abc[16*ii+4*jj+kk] * base[off+kk*strides.z];
    }
  }
  myval[off] = val;
}
// Value, gradient and Hessian evaluation kernel (header variant).
//
// Launch layout: blockIdx.y selects the position, stored with a stride of
// four floats in `pos` (only the first three are read);
// blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the spline.
// Outputs for position `ir`:
//   vals[ir][off]              value
//   grads[ir][3*off + 0..2]    gradient (x, y, z)
//   hess[ir][6*off + 0..5]     Hessian (xx, xy, xz, yy, yz, zz)
// The basis matrix is read from `A`, which is expected to be declared by
// an included header.  There is no bounds check on `off`.
__global__ static void
eval_multi_multi_UBspline_3d_s_vgh_cuda (float *pos, float3 drInv, const float *coefs,
                                         float *vals[], float *grads[], float *hess[],
                                         uint3 strides)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  __shared__ float *myval, *mygrad, *myhess;
  __shared__ float3 r;
  // Thread 0 fetches the position and output pointers once per block.
  if (thr == 0) {
    r.x = pos[4*ir+0];
    r.y = pos[4*ir+1];
    r.z = pos[4*ir+2];
    myval  = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  // Split the scaled coordinate into a cell index and a fractional part.
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = (int)sf;
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = (int)sf;
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = (int)sf;
  t.z = s - sf;
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.  The last column of A multiplies the constant
  // term (tp[].w), not the linear one.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = A[4*thr+0]*tp[0].x + A[4*thr+1]*tp[0].y + A[4*thr+2]*tp[0].z + A[4*thr+3]*tp[0].w;
    b[thr] = A[4*thr+0]*tp[1].x + A[4*thr+1]*tp[1].y + A[4*thr+2]*tp[1].z + A[4*thr+3]*tp[1].w;
    c[thr] = A[4*thr+0]*tp[2].x + A[4*thr+1]*tp[2].y + A[4*thr+2]*tp[2].z + A[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten planes of 64 tensor-product weights: plane m starts at abc[64*m].
  __shared__ float abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  for (int ii=0; ii<4; ii++) {
    for (int jj=0; jj<4; jj++) {
      const float *base = b0 + ii*strides.x + jj*strides.y;
      for (int kk=0; kk<4; kk++) {
        const float cf = base[kk*strides.z];
        // Index within a plane; must match the layout used to fill abc.
        const int n = 16*ii + 4*jj + kk;
        v   += abc[n+  0] * cf;
        g0  += abc[n+ 64] * cf;
        g1  += abc[n+128] * cf;
        g2  += abc[n+192] * cf;
        h00 += abc[n+256] * cf;
        h01 += abc[n+320] * cf;
        h02 += abc[n+384] * cf;
        h11 += abc[n+448] * cf;
        h12 += abc[n+512] * cf;
        h22 += abc[n+576] * cf;
      }
    }
  }

  // Convert derivatives from grid units to position units.
  g0 *= drInv.x;
  g1 *= drInv.y;
  g2 *= drInv.z;
  h00 *= drInv.x * drInv.x;
  h01 *= drInv.x * drInv.y;
  h02 *= drInv.x * drInv.z;
  h11 *= drInv.y * drInv.y;
  h12 *= drInv.y * drInv.z;
  h22 *= drInv.z * drInv.z;

  myval[off] = v;

  // abc is reused below as a transpose buffer.  Every thread must have
  // finished reading the weights before any thread overwrites them.
  __syncthreads();
  abc[3*thr+0] = g0;
  abc[3*thr+1] = g1;
  abc[3*thr+2] = g2;
  __syncthreads();
  // Consecutive threads store consecutive addresses.
  for (int m=0; m<3; m++)
    mygrad[(3*block+m)*SPLINE_BLOCK_SIZE+thr] = abc[m*SPLINE_BLOCK_SIZE+thr];
  __syncthreads();

  // Write the six independent Hessian components the same way.
  abc[6*thr+0] = h00;
  abc[6*thr+1] = h01;
  abc[6*thr+2] = h02;
  abc[6*thr+3] = h11;
  abc[6*thr+4] = h12;
  abc[6*thr+5] = h22;
  __syncthreads();
  for (int m=0; m<6; m++)
    myhess[(6*block+m)*SPLINE_BLOCK_SIZE+thr] = abc[m*SPLINE_BLOCK_SIZE+thr];
}
#endif

View File

@ -0,0 +1,938 @@
#ifndef MULTI_BSPLINE_CUDA_S_IMPL_H
#define MULTI_BSPLINE_CUDA_S_IMPL_H
//#include <stdio.h>
#include "multi_bspline.h"
#include "multi_bspline_create_cuda.h"
// 1D value-only evaluation.  One thread block per position (blockIdx.x);
// the threads of the block sweep over the N splines in chunks of
// SPLINE_BLOCK_SIZE and store the results in vals[ir][0..N-1].
__global__ static void
eval_multi_multi_UBspline_1d_s_kernel
(float *pos, float drInv, const float *coefs, float **vals,
 uint dim, uint stride, int N)
{
  const int tid = threadIdx.x;
  const int ir  = blockIdx.x;

  // Thread 0 loads the position and output pointer for the whole block.
  __shared__ float *ourval;
  __shared__ float r;
  if (tid == 0) {
    r = pos[ir];
    ourval = vals[ir];
  }
  __syncthreads();

  // Cell index (clamped to [0, dim-1]) and offset within the cell.
  const float s  = r * drInv;
  const float sf = floor(s);
  const int index = min(max(0,(int)sf), dim-1);
  const float t  = s - sf;
  const float4 tp = make_float4(t*t*t, t*t, t, 1.0);

  // The four basis weights, computed once per block.
  __shared__ float a[4];
  if (tid < 4)
    a[tid] = Acuda[4*tid+0]*tp.x + Acuda[4*tid+1]*tp.y + Acuda[4*tid+2]*tp.z + Acuda[4*tid+3]*tp.w;
  __syncthreads();

  const int numBlocks = N / SPLINE_BLOCK_SIZE;
  const int stride2 = 2*stride;
  const int stride3 = 3*stride;
  // Per-thread starting points; chunks are addressed by offset below.
  const float *c = coefs + index*stride + tid;
  float *myval = ourval + tid;

  // Full chunks: every thread has a spline to evaluate.
  int block = 0;
  while (block < numBlocks) {
    const int shift = block*SPLINE_BLOCK_SIZE;
    const float *cb = c + shift;
    myval[shift] = (a[0] * cb[0] +
                    a[1] * cb[stride] +
                    a[2] * cb[stride2] +
                    a[3] * cb[stride3]);
    block++;
  }

  // Trailing partial chunk: only the first `remainder` threads take part.
  const int remainder = N - numBlocks*SPLINE_BLOCK_SIZE;
  if (tid < remainder) {
    const int shift = numBlocks*SPLINE_BLOCK_SIZE;
    const float *cb = c + shift;
    myval[shift] = (a[0] * cb[0] +
                    a[1] * cb[stride] +
                    a[2] * cb[stride2] +
                    a[3] * cb[stride3]);
  }
}
// Host-side launcher for the 1D value-only kernel: one thread block per
// position (`num` blocks of SPLINE_BLOCK_SIZE threads).  Blocks until the
// kernel has finished and aborts the process on a CUDA error.
extern "C" void
eval_multi_multi_UBspline_1d_s_cuda (const multi_UBspline_1d_s_cuda *spline,
                                     float *pos_d, float *vals_d[], int num)
{
  const dim3 threads(SPLINE_BLOCK_SIZE);
  const dim3 blocks(num);

  eval_multi_multi_UBspline_1d_s_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, spline->coefs, vals_d, spline->dim, spline->stride, spline->num_splines);

  // Wait for completion so that execution errors surface here.
  cudaThreadSynchronize();
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_s_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// 1D value / first-derivative / second-derivative evaluation.  One thread
// block per position (blockIdx.x); the threads sweep over the N splines in
// chunks of SPLINE_BLOCK_SIZE and store, for spline n of position ir,
// vals[ir][n], grads[ir][n] and lapl[ir][n].
//
// NOTE(review): the derivative outputs are not multiplied by drInv or
// drInv^2 here, unlike the 3D VGH kernel in this file.  Left unchanged --
// confirm what the callers expect.
__global__ static void
eval_multi_multi_UBspline_1d_s_vgl_kernel
(float *pos, float drInv, const float *coefs, float **vals,
 float **grads, float **lapl,
 uint dim, uint stride, int N)
{
  int tid = threadIdx.x;
  int ir  = blockIdx.x;

  // Thread 0 loads the position and output pointers for the whole block.
  __shared__ float *ourval, *ourgrad, *ourlapl;
  __shared__ float r;
  if (tid == 0) {
    r = pos[ir];
    ourval  = vals[ir];
    ourgrad = grads[ir];
    ourlapl = lapl[ir];
  }
  __syncthreads();

  // Cell index (clamped to [0, dim-1]) and offset within the cell.
  int index;
  float t;
  float s, sf;
  float4 tp;
  s = r * drInv;
  sf = floor(s);
  index = min(max(0,(int)sf), dim-1);
  t = s - sf;
  tp = make_float4(t*t*t, t*t, t, 1.0);

  // a[0..3]: value weights, a[4..7]: first derivative, a[8..11]: second.
  __shared__ float a[12];
  if (tid < 12)
    a[tid] = Acuda[4*tid+0]*tp.x + Acuda[4*tid+1]*tp.y + Acuda[4*tid+2]*tp.z + Acuda[4*tid+3]*tp.w;
  __syncthreads();

  int numBlocks = N / SPLINE_BLOCK_SIZE;
  int remainder = N - numBlocks*SPLINE_BLOCK_SIZE;
  const float *c = coefs + index*stride + tid;
  float *myval  = ourval  + tid;
  float *mygrad = ourgrad + tid;
  float *mylapl = ourlapl + tid;
  int stride2 = 2*stride;
  int stride3 = 3*stride;

  // Chunks 0..numBlocks-1 are full.  Chunk numBlocks is the partial tail,
  // in which only the first `remainder` threads own a spline.  The tail
  // stores all three outputs, exactly like a full chunk.
  for (int block=0; block <= numBlocks; block++) {
    if (block < numBlocks || tid < remainder) {
      // Each coefficient is used by this thread only, so registers suffice.
      const float c0 = c[0];
      const float c1 = c[stride];
      const float c2 = c[stride2];
      const float c3 = c[stride3];
      *myval  = (a[0] * c0 + a[1] * c1 +
                 a[2] * c2 + a[3] * c3);
      *mygrad = (a[4] * c0 + a[5] * c1 +
                 a[6] * c2 + a[7] * c3);
      *mylapl = (a[8] * c0 + a[9] * c1 +
                 a[10] * c2 + a[11]* c3);
    }
    myval  += SPLINE_BLOCK_SIZE;
    mygrad += SPLINE_BLOCK_SIZE;
    mylapl += SPLINE_BLOCK_SIZE;
    c      += SPLINE_BLOCK_SIZE;
  }
}
// Host-side launcher for the 1D value/gradient/Laplacian kernel: one
// thread block per position (`num` blocks of SPLINE_BLOCK_SIZE threads).
// Blocks until the kernel has finished and aborts the process on a CUDA
// error.
extern "C" void
eval_multi_multi_UBspline_1d_s_vgl_cuda (const multi_UBspline_1d_s_cuda *spline,
                                         float *pos_d, float *vals_d[],
                                         float *grads_d[], float *lapl_d[], int num)
{
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(num);
  eval_multi_multi_UBspline_1d_s_vgl_kernel<<<dimGrid,dimBlock>>>
    (pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, lapl_d,
     spline->dim, spline->stride, spline->num_splines);
  // Wait for completion so that execution errors surface here.
  cudaThreadSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    // Report this function's own name so the failing call can be located.
    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_s_vgl_cuda:\n %s\n",
             cudaGetErrorString(err));
    abort();
  }
}
// 3D value-only evaluation with bounds handling.
//
// Launch layout: blockIdx.y selects the position (3 consecutive floats in
// `pos`), blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the spline.
// Cell indices are clamped to [0, dim-1] per axis, and only threads whose
// spline number is below N read coefficients and store a result in
// vals[ir][off].
__global__ static void
eval_multi_multi_UBspline_3d_s_kernel
(float *pos, float3 drInv, const float *coefs, float *vals[],
 uint3 dim, uint3 strides, int N)
{
  const int blk = blockIdx.x;
  const int thr = threadIdx.x;
  const int ir  = blockIdx.y;
  const int off = blk*SPLINE_BLOCK_SIZE+thr;

  __shared__ float *out;
  __shared__ float weight[64];
  __shared__ float3 r;
  // Thread 0 loads the position and output pointer for the whole block.
  if (thr == 0) {
    out = vals[ir];
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
  }
  __syncthreads();

  // Per axis: scaled coordinate -> clamped cell index + offset in cell.
  int3 cell;
  float3 t;
  float scaled, lower;

  scaled = r.x * drInv.x;
  lower  = floor(scaled);
  cell.x = min(max(0,(int)lower), dim.x-1);
  t.x    = scaled - lower;

  scaled = r.y * drInv.y;
  lower  = floor(scaled);
  cell.y = min(max(0,(int)lower), dim.y-1);
  t.y    = scaled - lower;

  scaled = r.z * drInv.z;
  lower  = floor(scaled);
  cell.z = min(max(0,(int)lower), dim.z-1);
  t.z    = scaled - lower;

  // Powers (t^3, t^2, t, 1) for each axis.
  const float4 px = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  const float4 py = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  const float4 pz = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // 1D basis weights from rows 0..3 of Acuda.
  __shared__ float a[4], b[4], c[4];
  if (thr < 4) {
    a[thr] = Acuda[4*thr+0]*px.x + Acuda[4*thr+1]*px.y + Acuda[4*thr+2]*px.z + Acuda[4*thr+3]*px.w;
    b[thr] = Acuda[4*thr+0]*py.x + Acuda[4*thr+1]*py.y + Acuda[4*thr+2]*py.z + Acuda[4*thr+3]*py.w;
    c[thr] = Acuda[4*thr+0]*pz.x + Acuda[4*thr+1]*pz.y + Acuda[4*thr+2]*pz.z + Acuda[4*thr+3]*pz.w;
  }
  __syncthreads();

  // 4x4x4 tensor-product weights; thread thr fills entry thr.
  if (thr < 64)
    weight[thr] = a[(thr>>4)&3]*b[(thr>>2)&3]*c[thr & 3];
  __syncthreads();

  // No barriers follow, so threads without a spline can leave here.
  if (off >= N)
    return;

  float val = 0.0;
  // The weights are consumed in storage order: x slowest, z fastest.
  const float *w = weight;
  for (unsigned ix=0; ix<4; ix++) {
    const float *base = coefs + (cell.x+ix)*strides.x + (cell.y)*strides.y + cell.z*strides.z + off;
    for (unsigned iy=0; iy<4; iy++, base += strides.y)
      for (unsigned iz=0; iz<4; iz++)
        val += *w++ * base[iz*strides.z];
  }
  out[off] = val;
}
// 3D value-only evaluation with a per-position multiplier.
//
// Same launch layout and clamping as the plain value kernel; the result
// for position `ir` is multiplied by sign[ir] before it is stored in
// vals[ir][off].  Only threads whose spline number is below N store.
__global__ static void
eval_multi_multi_UBspline_3d_s_sign_kernel
(float *pos, float *sign, float3 drInv, const float *coefs, float *vals[],
 uint3 dim, uint3 strides, int N)
{
  const int blk = blockIdx.x;
  const int thr = threadIdx.x;
  const int ir  = blockIdx.y;
  const int off = blk*SPLINE_BLOCK_SIZE+thr;

  __shared__ float *out;
  __shared__ float weight[64];
  __shared__ float factor;
  __shared__ float3 r;
  // Thread 0 loads position, multiplier and output pointer for the block.
  if (thr == 0) {
    out    = vals[ir];
    factor = sign[ir];
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
  }
  __syncthreads();

  // Per axis: scaled coordinate -> clamped cell index + offset in cell.
  int3 cell;
  float3 t;
  float scaled, lower;

  scaled = r.x * drInv.x;
  lower  = floor(scaled);
  cell.x = min(max(0,(int)lower), dim.x-1);
  t.x    = scaled - lower;

  scaled = r.y * drInv.y;
  lower  = floor(scaled);
  cell.y = min(max(0,(int)lower), dim.y-1);
  t.y    = scaled - lower;

  scaled = r.z * drInv.z;
  lower  = floor(scaled);
  cell.z = min(max(0,(int)lower), dim.z-1);
  t.z    = scaled - lower;

  // Powers (t^3, t^2, t, 1) for each axis.
  const float4 px = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  const float4 py = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  const float4 pz = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // 1D basis weights from rows 0..3 of Acuda.
  __shared__ float a[4], b[4], c[4];
  if (thr < 4) {
    a[thr] = Acuda[4*thr+0]*px.x + Acuda[4*thr+1]*px.y + Acuda[4*thr+2]*px.z + Acuda[4*thr+3]*px.w;
    b[thr] = Acuda[4*thr+0]*py.x + Acuda[4*thr+1]*py.y + Acuda[4*thr+2]*py.z + Acuda[4*thr+3]*py.w;
    c[thr] = Acuda[4*thr+0]*pz.x + Acuda[4*thr+1]*pz.y + Acuda[4*thr+2]*pz.z + Acuda[4*thr+3]*pz.w;
  }
  __syncthreads();

  // 4x4x4 tensor-product weights; thread thr fills entry thr.
  if (thr < 64)
    weight[thr] = a[(thr>>4)&3]*b[(thr>>2)&3]*c[thr & 3];
  __syncthreads();

  // No barriers follow, so threads without a spline can leave here.
  if (off >= N)
    return;

  float val = 0.0;
  // The weights are consumed in storage order: x slowest, z fastest.
  const float *w = weight;
  for (int ix=0; ix<4; ix++) {
    for (int iy=0; iy<4; iy++) {
      const float *base = coefs + (cell.x+ix)*strides.x + (cell.y+iy)*strides.y + cell.z*strides.z;
      for (int iz=0; iz<4; iz++)
        val += *w++ * base[off+iz*strides.z];
    }
  }
  out[off] = factor*val;
}
// 3D value, gradient and Hessian evaluation with bounds handling.
//
// Launch layout: blockIdx.y selects the position (3 consecutive floats in
// `pos`), blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x selects the spline.
// Cell indices are clamped to [0, dim-1] per axis.  Outputs for position
// `ir`, written only for spline numbers below N:
//   vals[ir][off]              value
//   grads[ir][3*off + 0..2]    gradient (x, y, z)
//   hess[ir][6*off + 0..5]     Hessian (xx, xy, xz, yy, yz, zz)
__global__ static void
eval_multi_multi_UBspline_3d_s_vgh_kernel
(float *pos, float3 drInv, const float *coefs,
 float *vals[], float *grads[], float *hess[],
 uint3 dim, uint3 strides, int N)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  __shared__ float *myval, *mygrad, *myhess;
  __shared__ float3 r;
  // Thread 0 loads the position and output pointers for the whole block.
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval  = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  // Per axis: scaled coordinate -> clamped cell index + offset in cell.
  int3 index;
  float3 t;
  float s, sf;
  float4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = min(max(0,(int)sf), dim.x-1);
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = min(max(0,(int)sf), dim.y-1);
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = min(max(0,(int)sf), dim.z-1);
  t.z = s - sf;
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten planes of 64 tensor-product weights: plane m starts at abc[64*m].
  __shared__ float abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  int n = 0;
  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < N) {
    for (unsigned ii=0; ii<4; ii++) {
      for (unsigned jj=0; jj<4; jj++) {
        // Four z-neighbours of this (x, y) column; n = 16*ii + 4*jj.
        const float *base = b0 + ii*strides.x + jj*strides.y;
        float c0 = base[0*strides.z];
        float c1 = base[1*strides.z];
        float c2 = base[2*strides.z];
        float c3 = base[3*strides.z];
        v   += abc[n+  0]*c0 + abc[n+  1]*c1 + abc[n+  2]*c2 + abc[n+  3]*c3;
        g0  += abc[n+ 64]*c0 + abc[n+ 65]*c1 + abc[n+ 66]*c2 + abc[n+ 67]*c3;
        g1  += abc[n+128]*c0 + abc[n+129]*c1 + abc[n+130]*c2 + abc[n+131]*c3;
        g2  += abc[n+192]*c0 + abc[n+193]*c1 + abc[n+194]*c2 + abc[n+195]*c3;
        h00 += abc[n+256]*c0 + abc[n+257]*c1 + abc[n+258]*c2 + abc[n+259]*c3;
        h01 += abc[n+320]*c0 + abc[n+321]*c1 + abc[n+322]*c2 + abc[n+323]*c3;
        h02 += abc[n+384]*c0 + abc[n+385]*c1 + abc[n+386]*c2 + abc[n+387]*c3;
        h11 += abc[n+448]*c0 + abc[n+449]*c1 + abc[n+450]*c2 + abc[n+451]*c3;
        h12 += abc[n+512]*c0 + abc[n+513]*c1 + abc[n+514]*c2 + abc[n+515]*c3;
        h22 += abc[n+576]*c0 + abc[n+577]*c1 + abc[n+578]*c2 + abc[n+579]*c3;
        n += 4;
      }
    }
    // Convert derivatives from grid units to position units.
    g0 *= drInv.x;
    g1 *= drInv.y;
    g2 *= drInv.z;
    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;
    myval[off] = v;
  }

  // abc is reused below as a transpose buffer.  Threads that skipped the
  // branch above (off >= N), or that simply finished the loop earlier,
  // must not overwrite the weights while other threads of the block are
  // still reading them.
  __syncthreads();
  abc[3*thr+0] = g0;
  abc[3*thr+1] = g1;
  abc[3*thr+2] = g2;
  __syncthreads();
  // Consecutive threads store consecutive addresses; the guard trims the
  // partially filled last block.
  for (int m=0; m<3; m++) {
    int myoff = (3*block+m)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 3*N)
      mygrad[myoff] = abc[m*SPLINE_BLOCK_SIZE+thr];
  }
  __syncthreads();

  // Write the six independent Hessian components the same way.
  abc[6*thr+0] = h00;
  abc[6*thr+1] = h01;
  abc[6*thr+2] = h02;
  abc[6*thr+3] = h11;
  abc[6*thr+4] = h12;
  abc[6*thr+5] = h22;
  __syncthreads();
  for (int m=0; m<6; m++) {
    int myoff = (6*block+m)*SPLINE_BLOCK_SIZE + thr;
    if (myoff < 6*N)
      myhess[myoff] = abc[m*SPLINE_BLOCK_SIZE+thr];
  }
}
// Host launcher: value-only evaluation of single-precision real 3D
// multi-splines.
//
//   spline : device-side spline descriptor (grid inverse spacing, coefficient
//            pointer, grid dimensions, strides and spline count are read).
//   pos_d  : passed straight to the kernel as the position array.
//   vals_d : passed straight to the kernel as the per-position output rows.
//   num    : grid extent in y (presumably the number of positions — the
//            kernel itself is outside this view, confirm there).
//
// Blocks until the kernel has finished and aborts the process if CUDA
// reports an error.
extern "C" void
eval_multi_multi_UBspline_3d_s_cuda (const multi_UBspline_3d_s_cuda *spline,
                                     float *pos_d, float *vals_d[], int num)
{
  // Enough blocks along x to cover every spline, rounding up.
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (spline->num_splines % SPLINE_BLOCK_SIZE != 0)
    blocks.x += 1;

  eval_multi_multi_UBspline_3d_s_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, spline->coefs, vals_d,
     spline->dim, spline->stride, spline->num_splines);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Host launcher: value-only evaluation of single-precision real 3D
// multi-splines with a per-position sign array.
//
//   spline : device-side spline descriptor (grid inverse spacing, coefficient
//            pointer, grid dimensions, strides and spline count are read).
//   pos_d  : passed straight to the kernel as the position array.
//   sign_d : passed straight to the kernel as the sign array (presumably one
//            factor per position — the kernel is outside this view).
//   vals_d : passed straight to the kernel as the per-position output rows.
//   num    : grid extent in y.
//
// Blocks until the kernel has finished and aborts the process if CUDA
// reports an error.  The diagnostic names this function (it previously
// reported the unsigned variant's name, which made failures hard to trace).
extern "C" void
eval_multi_multi_UBspline_3d_s_sign_cuda (const multi_UBspline_3d_s_cuda *spline,
                                          float *pos_d, float *sign_d,
                                          float *vals_d[], int num)
{
  // Enough blocks along x to cover every spline, rounding up.
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (spline->num_splines % SPLINE_BLOCK_SIZE)
    dimGrid.x++;

  eval_multi_multi_UBspline_3d_s_sign_kernel<<<dimGrid,dimBlock>>>
    (pos_d, sign_d, spline->gridInv, spline->coefs,
     vals_d, spline->dim, spline->stride, spline->num_splines);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_sign_cuda:\n %s\n",
             cudaGetErrorString(err));
    abort();
  }
}
// Host launcher: value / gradient / Hessian evaluation of single-precision
// real 3D multi-splines.
//
//   spline  : device-side spline descriptor (grid inverse spacing,
//             coefficient pointer, grid dimensions, strides and spline count
//             are read).
//   pos_d   : passed straight to the kernel as the position array.
//   vals_d, grads_d, hess_d : passed straight to the kernel as the
//             per-position output rows.
//   num     : grid extent in y.
//
// Blocks until the kernel has finished and aborts the process if CUDA
// reports an error.
extern "C" void
eval_multi_multi_UBspline_3d_s_vgh_cuda (const multi_UBspline_3d_s_cuda *spline,
                                         float *pos_d, float *vals_d[], float *grads_d[],
                                         float *hess_d[], int num)
{
  // Enough blocks along x to cover every spline, rounding up.
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (spline->num_splines % SPLINE_BLOCK_SIZE != 0)
    blocks.x += 1;

  eval_multi_multi_UBspline_3d_s_vgh_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, hess_d,
     spline->dim, spline->stride, spline->num_splines);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_vgh_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Value / gradient / Laplacian kernel for single-precision real 3D
// multi-splines.
//
// Work mapping: blockIdx.y picks the position (pos[3*ir .. 3*ir+2]) and the
// output rows vals[ir] / grad_lapl[ir]; blockIdx.x*SPLINE_BLOCK_SIZE +
// threadIdx.x picks the spline.  Threads whose spline index is >= N perform
// no coefficient loads or output stores but still reach every barrier.
//
// Outputs for spline `off`:
//   vals[ir][off]                      value
//   grad_lapl[ir][off + m*row_stride]  m = 0..2: G[m][0]*g0 + G[m][1]*g1 + G[m][2]*g2
//   grad_lapl[ir][off + 3*row_stride]  Trace(GGt*Hessian)
// where G is the 3x3 matrix read from Linv (row-major), g is the gradient
// scaled by drInv, and GGt[p][q] = sum_m G[m][p]*G[m][q].
//
// NOTE(review): every thread fills table slot (thr & 63), so the 64-entry
// weight tables are only complete when the block has at least 64 threads —
// confirm SPLINE_BLOCK_SIZE >= 64.
__global__ static void
eval_multi_multi_UBspline_3d_s_vgl_kernel
(float *pos, float3 drInv, const float *coefs, float Linv[],
 float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
 int N, int row_stride)
{
  const int block = blockIdx.x;
  const int thr   = threadIdx.x;
  const int ir    = blockIdx.y;
  const int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  // Thread 0 stages the position and the output row pointers for the block.
  __shared__ float *myval, *mygrad_lapl;
  __shared__ float3 r;
  if (thr == 0) {
    myval       = vals[ir];
    mygrad_lapl = grad_lapl[ir];
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
  }
  __syncthreads();

  // Grid cell (clamped to [0, dim-1]) and fractional coordinate per axis.
  int3 index;
  float3 t;

  const float sx = r.x * drInv.x;
  const float fx = floor(sx);
  index.x = min(max(0,(int)fx), dim.x-1);
  t.x = sx - fx;

  const float sy = r.y * drInv.y;
  const float fy = floor(sy);
  index.y = min(max(0,(int)fy), dim.y-1);
  t.y = sy - fy;

  const float sz = r.z * drInv.z;
  const float fz = floor(sz);
  index.z = min(max(0,(int)fz), dim.z-1);
  t.z = sz - fz;

  // Powers (t^3, t^2, t, 1) of the fractional coordinate for each axis.
  float4 tp[3];
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // Per-axis weights: entries 0-3 are for the value, 4-7 for the first
  // derivative and 8-11 for the second derivative.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten 64-entry tensor-product tables, one per derivative combination.
  __shared__ float abc[640];
  const int ia   = (thr>>4)&3;
  const int ib   = (thr>>2)&3;
  const int ic   = (thr & 3);
  const int cell = 16*ia+4*ib+ic;
  abc[cell+  0] = a[ia+0]*b[ib+0]*c[ic+0]; // val
  abc[cell+ 64] = a[ia+4]*b[ib+0]*c[ic+0]; // d/dx
  abc[cell+128] = a[ia+0]*b[ib+4]*c[ic+0]; // d/dy
  abc[cell+192] = a[ia+0]*b[ib+0]*c[ic+4]; // d/dz
  abc[cell+256] = a[ia+8]*b[ib+0]*c[ic+0]; // d2/dx2
  abc[cell+320] = a[ia+4]*b[ib+4]*c[ic+0]; // d2/dxdy
  abc[cell+384] = a[ia+4]*b[ib+0]*c[ic+4]; // d2/dxdz
  abc[cell+448] = a[ia+0]*b[ib+8]*c[ic+0]; // d2/dy2
  abc[cell+512] = a[ia+0]*b[ib+4]*c[ic+4]; // d2/dydz
  abc[cell+576] = a[ia+0]*b[ib+0]*c[ic+8]; // d2/dz2
  __syncthreads();

  float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  // First coefficient of this thread's spline in the selected grid cell.
  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < N) {
    // Contract the 4x4x4 coefficient stencil with each table; the z
    // direction is unrolled four at a time.
    int n = 0;
    for (int di=0; di<4; di++) {
      for (int dj=0; dj<4; dj++) {
        const float *base = b0 + di*strides.x + dj*strides.y;
        const float c0 = base[0*strides.z];
        const float c1 = base[1*strides.z];
        const float c2 = base[2*strides.z];
        const float c3 = base[3*strides.z];
        v   += abc[n+  0]*c0 + abc[n+  1]*c1 + abc[n+  2]*c2 + abc[n+  3]*c3;
        g0  += abc[n+ 64]*c0 + abc[n+ 65]*c1 + abc[n+ 66]*c2 + abc[n+ 67]*c3;
        g1  += abc[n+128]*c0 + abc[n+129]*c1 + abc[n+130]*c2 + abc[n+131]*c3;
        g2  += abc[n+192]*c0 + abc[n+193]*c1 + abc[n+194]*c2 + abc[n+195]*c3;
        h00 += abc[n+256]*c0 + abc[n+257]*c1 + abc[n+258]*c2 + abc[n+259]*c3;
        h01 += abc[n+320]*c0 + abc[n+321]*c1 + abc[n+322]*c2 + abc[n+323]*c3;
        h02 += abc[n+384]*c0 + abc[n+385]*c1 + abc[n+386]*c2 + abc[n+387]*c3;
        h11 += abc[n+448]*c0 + abc[n+449]*c1 + abc[n+450]*c2 + abc[n+451]*c3;
        h12 += abc[n+512]*c0 + abc[n+513]*c1 + abc[n+514]*c2 + abc[n+515]*c3;
        h22 += abc[n+576]*c0 + abc[n+577]*c1 + abc[n+578]*c2 + abc[n+579]*c3;
        n += 4;
      }
    }
    // Derivatives were taken with respect to the fractional coordinate;
    // rescale them by the drInv factors.
    g0  *= drInv.x;
    g1  *= drInv.y;
    g2  *= drInv.z;
    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;
    myval[off] = v;
  }

  // Load the 3x3 matrix G from Linv and build GGt with the first nine threads.
  __shared__ float G[3][3], GGt[3][3];
  const int i0 = threadIdx.x/3;
  const int i1 = threadIdx.x - 3*i0;
  if (threadIdx.x < 9)
    G[i0][i1] = Linv[threadIdx.x];
  __syncthreads();
  if (threadIdx.x < 9)
    GGt[i0][i1] = (G[0][i0]*G[0][i1] +
                   G[1][i0]*G[1][i1] +
                   G[2][i0]*G[2][i1]);
  __syncthreads();

  if (off < N) {
    // Store gradients back to global memory
    mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
    mygrad_lapl[off+1*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
    mygrad_lapl[off+2*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
    // Store laplacians back to global memory
    // Hessian = H00 H01 H02 H11 H12 H22
    // Matrix = [0 1 2]
    //          [1 3 4]
    //          [2 4 5]
    // laplacian = Trace(GGt*Hessian)
    mygrad_lapl[off+3*row_stride] =
      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
  }
}
// Host launcher: value / gradient / Laplacian evaluation of single-precision
// real 3D multi-splines.
//
//   spline      : device-side spline descriptor (grid inverse spacing,
//                 coefficient pointer, grid dimensions, strides and spline
//                 count are read).
//   pos_d       : position array; the kernel reads three floats per grid row.
//   Linv_d      : nine floats read by the kernel as a row-major 3x3 matrix.
//   vals_d      : per-position output rows for the values.
//   grad_lapl_d : per-position output rows holding three gradient rows and
//                 one Laplacian row, row_stride elements apart.
//   num         : grid extent in y (one grid row per position).
//
// Blocks until the kernel has finished and aborts the process if CUDA
// reports an error.
extern "C" void
eval_multi_multi_UBspline_3d_s_vgl_cuda
(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *Linv_d,
 float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
{
  // Enough blocks along x to cover every spline, rounding up.
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (spline->num_splines % SPLINE_BLOCK_SIZE != 0)
    blocks.x += 1;

  eval_multi_multi_UBspline_3d_s_vgl_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_vgl_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Signed value / gradient / Laplacian kernel for single-precision real 3D
// multi-splines.  Identical in structure to the unsigned vgl kernel, except
// that every stored result is multiplied by sign[ir].
//
// Work mapping: blockIdx.y picks the position (pos[3*ir .. 3*ir+2]), the
// sign factor sign[ir] and the output rows vals[ir] / grad_lapl[ir];
// blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x picks the spline.  Threads whose
// spline index is >= N perform no coefficient loads or output stores but
// still reach every barrier.
//
// Outputs for spline `off` (all scaled by sign[ir]):
//   vals[ir][off]                      value
//   grad_lapl[ir][off + m*row_stride]  m = 0..2: G[m][0]*g0 + G[m][1]*g1 + G[m][2]*g2
//   grad_lapl[ir][off + 3*row_stride]  Trace(GGt*Hessian)
// where G is the 3x3 matrix read from Linv (row-major) and
// GGt[p][q] = sum_m G[m][p]*G[m][q].
//
// NOTE(review): every thread fills table slot (thr & 63), so the 64-entry
// weight tables are only complete when the block has at least 64 threads —
// confirm SPLINE_BLOCK_SIZE >= 64.
__global__ static void
eval_multi_multi_UBspline_3d_s_vgl_sign_kernel
(float *pos, float sign[], float3 drInv, const float *coefs, float Linv[],
 float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
 int N, int row_stride)
{
  const int block = blockIdx.x;
  const int thr   = threadIdx.x;
  const int ir    = blockIdx.y;
  const int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  // Thread 0 stages the position, sign and output row pointers for the block.
  __shared__ float *myval, *mygrad_lapl;
  __shared__ float mysign;
  __shared__ float3 r;
  if (thr == 0) {
    myval       = vals[ir];
    mygrad_lapl = grad_lapl[ir];
    mysign      = sign[ir];
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
  }
  __syncthreads();

  // Grid cell (clamped to [0, dim-1]) and fractional coordinate per axis.
  int3 index;
  float3 t;

  const float sx = r.x * drInv.x;
  const float fx = floor(sx);
  index.x = min(max(0,(int)fx), dim.x-1);
  t.x = sx - fx;

  const float sy = r.y * drInv.y;
  const float fy = floor(sy);
  index.y = min(max(0,(int)fy), dim.y-1);
  t.y = sy - fy;

  const float sz = r.z * drInv.z;
  const float fz = floor(sz);
  index.z = min(max(0,(int)fz), dim.z-1);
  t.z = sz - fz;

  // Powers (t^3, t^2, t, 1) of the fractional coordinate for each axis.
  float4 tp[3];
  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);

  // Per-axis weights: entries 0-3 are for the value, 4-7 for the first
  // derivative and 8-11 for the second derivative.
  __shared__ float a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten 64-entry tensor-product tables, one per derivative combination.
  __shared__ float abc[640];
  const int ia   = (thr>>4)&3;
  const int ib   = (thr>>2)&3;
  const int ic   = (thr & 3);
  const int cell = 16*ia+4*ib+ic;
  abc[cell+  0] = a[ia+0]*b[ib+0]*c[ic+0]; // val
  abc[cell+ 64] = a[ia+4]*b[ib+0]*c[ic+0]; // d/dx
  abc[cell+128] = a[ia+0]*b[ib+4]*c[ic+0]; // d/dy
  abc[cell+192] = a[ia+0]*b[ib+0]*c[ic+4]; // d/dz
  abc[cell+256] = a[ia+8]*b[ib+0]*c[ic+0]; // d2/dx2
  abc[cell+320] = a[ia+4]*b[ib+4]*c[ic+0]; // d2/dxdy
  abc[cell+384] = a[ia+4]*b[ib+0]*c[ic+4]; // d2/dxdz
  abc[cell+448] = a[ia+0]*b[ib+8]*c[ic+0]; // d2/dy2
  abc[cell+512] = a[ia+0]*b[ib+4]*c[ic+4]; // d2/dydz
  abc[cell+576] = a[ia+0]*b[ib+0]*c[ic+8]; // d2/dz2
  __syncthreads();

  float v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  // First coefficient of this thread's spline in the selected grid cell.
  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < N) {
    // Contract the 4x4x4 coefficient stencil with each table, one
    // coefficient at a time.
    int n = 0;
    for (int di=0; di<4; di++) {
      for (int dj=0; dj<4; dj++) {
        const float *base = b0 + di*strides.x + dj*strides.y;
        for (int dk=0; dk<4; dk++) {
          const float coef = base[dk*strides.z];
          v   += abc[n+  0] * coef;
          g0  += abc[n+ 64] * coef;
          g1  += abc[n+128] * coef;
          g2  += abc[n+192] * coef;
          h00 += abc[n+256] * coef;
          h01 += abc[n+320] * coef;
          h02 += abc[n+384] * coef;
          h11 += abc[n+448] * coef;
          h12 += abc[n+512] * coef;
          h22 += abc[n+576] * coef;
          n += 1;
        }
      }
    }
    // Derivatives were taken with respect to the fractional coordinate;
    // rescale them by the drInv factors.
    g0  *= drInv.x;
    g1  *= drInv.y;
    g2  *= drInv.z;
    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;
    myval[off] = mysign * v;
  }

  // Load the 3x3 matrix G from Linv and build GGt with the first nine threads.
  __shared__ float G[3][3], GGt[3][3];
  const int i0 = threadIdx.x/3;
  const int i1 = threadIdx.x - 3*i0;
  if (threadIdx.x < 9)
    G[i0][i1] = Linv[threadIdx.x];
  __syncthreads();
  if (threadIdx.x < 9)
    GGt[i0][i1] = (G[0][i0]*G[0][i1] +
                   G[1][i0]*G[1][i1] +
                   G[2][i0]*G[2][i1]);
  __syncthreads();

  if (off < N) {
    // Store gradients back to global memory
    mygrad_lapl[off+0*row_stride] = mysign*(G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2);
    mygrad_lapl[off+1*row_stride] = mysign*(G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2);
    mygrad_lapl[off+2*row_stride] = mysign*(G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2);
    // Store laplacians back to global memory
    // Hessian = H00 H01 H02 H11 H12 H22
    // Matrix = [0 1 2]
    //          [1 3 4]
    //          [2 4 5]
    // laplacian = Trace(GGt*Hessian)
    mygrad_lapl[off+3*row_stride] = mysign *
      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
  }
}
// Host launcher: signed value / gradient / Laplacian evaluation of
// single-precision real 3D multi-splines.
//
//   spline      : device-side spline descriptor (grid inverse spacing,
//                 coefficient pointer, grid dimensions, strides and spline
//                 count are read).
//   pos_d       : position array; the kernel reads three floats per grid row.
//   sign_d      : one factor per grid row; the kernel multiplies every stored
//                 result by it.
//   Linv_d      : nine floats read by the kernel as a row-major 3x3 matrix.
//   vals_d      : per-position output rows for the values.
//   grad_lapl_d : per-position output rows holding three gradient rows and
//                 one Laplacian row, row_stride elements apart.
//   num         : grid extent in y (one grid row per position).
//
// Blocks until the kernel has finished and aborts the process if CUDA
// reports an error.  The diagnostic names this function (it previously
// reported the unsigned variant's name, which made failures hard to trace).
extern "C" void
eval_multi_multi_UBspline_3d_s_vgl_sign_cuda
(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, float *Linv_d,
 float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
{
  // Enough blocks along x to cover every spline, rounding up.
  dim3 dimBlock(SPLINE_BLOCK_SIZE);
  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (spline->num_splines % SPLINE_BLOCK_SIZE)
    dimGrid.x++;

  eval_multi_multi_UBspline_3d_s_vgl_sign_kernel<<<dimGrid,dimBlock>>>
    (pos_d, sign_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_vgl_sign_cuda:\n %s\n",
             cudaGetErrorString(err));
    abort();
  }
}
#endif

View File

@ -0,0 +1,461 @@
#ifndef MULTI_BSPLINE_CUDA_Z_IMPL_H
#define MULTI_BSPLINE_CUDA_Z_IMPL_H
#include "multi_bspline.h"
#include "multi_bspline_create_cuda.h"
// Value-only kernel for double-precision 3D multi-splines whose data are
// handled as 2*N double entries per grid point.
//
// Work mapping: blockIdx.y picks the position (pos[3*ir .. 3*ir+2]) and the
// output row vals[ir]; blockIdx.x*SPLINE_BLOCK_SIZE + threadIdx.x picks the
// double entry `off`.  Only entries with off < 2*N are evaluated and stored.
//
// NOTE(review): the host wrapper casts complex arrays to double and passes
// num_splines as N, so presumably the real and imaginary parts are evaluated
// as independent interleaved entries — confirm against the coefficient
// layout.  The 64-entry weight table is filled by threads 0..63, so the block
// needs at least 64 threads.
__global__ static void
eval_multi_multi_UBspline_3d_z_kernel
(double *pos, double3 drInv, const double *coefs, double *vals[],
 uint3 dim, uint3 strides, int N)
{
  const int block = blockIdx.x;
  const int thr   = threadIdx.x;
  const int ir    = blockIdx.y;
  const int off   = block*SPLINE_BLOCK_SIZE+thr;

  __shared__ double *myval;
  __shared__ double abc[64];
  __shared__ double3 r;

  // Thread 0 stages the position and the output row pointer for the block.
  if (thr == 0) {
    myval = vals[ir];
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
  }
  __syncthreads();

  // Grid cell (clamped to [0, dim-1]) and fractional coordinate per axis.
  int3 index;
  double3 t;

  const double sx = r.x * drInv.x;
  const double fx = floor(sx);
  index.x = min(max(0,(int)fx), dim.x-1);
  t.x = sx - fx;

  const double sy = r.y * drInv.y;
  const double fy = floor(sy);
  index.y = min(max(0,(int)fy), dim.y-1);
  t.y = sy - fy;

  const double sz = r.z * drInv.z;
  const double fz = floor(sz);
  index.z = min(max(0,(int)fz), dim.z-1);
  t.z = sz - fz;

  // Powers (t^3, t^2, t, 1) of the fractional coordinate for each axis.
  double4 tp[3];
  tp[0].x=t.x*t.x*t.x;  tp[0].y=t.x*t.x;  tp[0].z=t.x;  tp[0].w=1.0;
  tp[1].x=t.y*t.y*t.y;  tp[1].y=t.y*t.y;  tp[1].z=t.y;  tp[1].w=1.0;
  tp[2].x=t.z*t.z*t.z;  tp[2].y=t.z*t.z;  tp[2].z=t.z;  tp[2].w=1.0;

  // Per-axis value weights, computed by the first four threads.
  __shared__ double a[4], b[4], c[4];
  if (thr < 4) {
    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // 4x4x4 tensor-product weight table, one entry per thread below 64.
  const int ia = (thr>>4)&3;
  const int ib = (thr>>2)&3;
  const int ic = (thr & 3);
  if (thr < 64)
    abc[thr] = a[ia]*b[ib]*c[ic];
  __syncthreads();

  if (off < 2*N) {
    // Contract the 4x4x4 coefficient stencil with the weight table.
    double sum = 0.0;
    for (int di=0; di<4; di++) {
      for (int dj=0; dj<4; dj++) {
        const double *base = coefs + (index.x+di)*strides.x + (index.y+dj)*strides.y + index.z*strides.z;
        for (int dk=0; dk<4; dk++)
          sum += abc[16*di+4*dj+dk] * base[off+dk*strides.z];
      }
    }
    myval[off] = sum;
  }
}
// Value / gradient / Hessian kernel for double-precision 3D multi-splines
// whose data are handled as 2*N double entries per grid point.
//
// Work mapping: blockIdx.y picks the position (pos[3*ir .. 3*ir+2]) and the
// output rows vals[ir] / grads[ir] / hess[ir]; blockIdx.x*SPLINE_BLOCK_SIZE +
// threadIdx.x picks the double entry `off`.  Only entries with off < 2*N are
// evaluated.
//
// Outputs for entry `off`:
//   vals[ir][off]            value
//   grads[ir][3*off + m]     m = 0..2: gradient components (x, y, z)
//   hess[ir][6*off + m]      m = 0..5: h00 h01 h02 h11 h12 h22
// Gradients and Hessians are first staged in shared memory so that the
// global stores are issued with consecutive threads writing consecutive
// addresses.
//
// Fixes relative to the previous revision:
//   * the gradient store guard is 6*N (3 components x 2*N entries); it was
//     3*N, which dropped the upper half of the gradients even though the
//     Hessian guard already used 12*N;
//   * a barrier now separates the last reads of the weight tables in `abc`
//     from the first reuse of `abc` as a staging buffer, so a fast warp can
//     no longer overwrite table entries another warp is still reading.
//
// NOTE(review): every thread fills table slot (thr & 63), so the block needs
// at least 64 threads, and staging six values per thread in abc[640] needs
// 6*SPLINE_BLOCK_SIZE <= 640 — confirm SPLINE_BLOCK_SIZE.
__global__ static void
eval_multi_multi_UBspline_3d_z_vgh_kernel
(double *pos, double3 drInv, const double *coefs,
 double *vals[], double *grads[], double *hess[],
 uint3 dim, uint3 strides, int N)
{
  int block = blockIdx.x;
  int thr   = threadIdx.x;
  int ir    = blockIdx.y;
  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  // Thread 0 stages the position and the output row pointers for the block.
  __shared__ double *myval, *mygrad, *myhess;
  __shared__ double3 r;
  if (thr == 0) {
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
    myval  = vals[ir];
    mygrad = grads[ir];
    myhess = hess[ir];
  }
  __syncthreads();

  // Grid cell (clamped to [0, dim-1]) and fractional coordinate per axis.
  int3 index;
  double3 t;
  double s, sf;
  double4 tp[3];
  s = r.x * drInv.x;
  sf = floor(s);
  index.x = min(max(0,(int)sf), dim.x-1);
  t.x = s - sf;
  s = r.y * drInv.y;
  sf = floor(s);
  index.y = min(max(0,(int)sf), dim.y-1);
  t.y = s - sf;
  s = r.z * drInv.z;
  sf = floor(s);
  index.z = min(max(0,(int)sf), dim.z-1);
  t.z = s - sf;

  // Powers (t^3, t^2, t, 1) of the fractional coordinate for each axis.
  tp[0].x=t.x*t.x*t.x;  tp[0].y=t.x*t.x;  tp[0].z=t.x;  tp[0].w=1.0;
  tp[1].x=t.y*t.y*t.y;  tp[1].y=t.y*t.y;  tp[1].z=t.y;  tp[1].w=1.0;
  tp[2].x=t.z*t.z*t.z;  tp[2].y=t.z*t.z;  tp[2].z=t.z;  tp[2].w=1.0;

  // First 4 of a are value, second 4 are derivative, last four are
  // second derivative.
  __shared__ double a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten 64-entry tensor-product tables, one per derivative combination.
  __shared__ double abc[640];
  int i = (thr>>4)&3;
  int j = (thr>>2)&3;
  int k = (thr & 3);
  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
  __syncthreads();

  double v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  int n = 0;
  // First coefficient of this thread's entry in the selected grid cell.
  const double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < 2*N) {
    // Contract the 4x4x4 coefficient stencil with each table.
    for (int i=0; i<4; i++) {
      for (int j=0; j<4; j++) {
        const double *base = b0 + i*strides.x + j*strides.y;
        for (int k=0; k<4; k++) {
          double coef = base[k*strides.z];
          v   += abc[n+0]   * coef;
          g0  += abc[n+64]  * coef;
          g1  += abc[n+128] * coef;
          g2  += abc[n+192] * coef;
          h00 += abc[n+256] * coef;
          h01 += abc[n+320] * coef;
          h02 += abc[n+384] * coef;
          h11 += abc[n+448] * coef;
          h12 += abc[n+512] * coef;
          h22 += abc[n+576] * coef;
          n += 1;
        }
      }
    }
    // Derivatives were taken with respect to the fractional coordinate;
    // rescale them by the drInv factors.
    g0 *= drInv.x;
    g1 *= drInv.y;
    g2 *= drInv.z;
    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;
    myval[off] = v;
  }

  // All threads must be done reading the weight tables before abc is reused
  // as a staging buffer.
  __syncthreads();

  // Stage gradients (3 per thread) and write them out with unit stride.
  abc[3*thr+0] = g0;
  abc[3*thr+1] = g1;
  abc[3*thr+2] = g2;
  __syncthreads();
  for (int i=0; i<3; i++) {
    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
    // 3 components for each of the 2*N double entries.
    if (myoff < 6*N)
      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
  __syncthreads();

  // Write Hessians: stage 6 per thread, then store with unit stride.
  abc[6*thr+0] = h00;
  abc[6*thr+1] = h01;
  abc[6*thr+2] = h02;
  abc[6*thr+3] = h11;
  abc[6*thr+4] = h12;
  abc[6*thr+5] = h22;
  __syncthreads();
  for (int i=0; i<6; i++) {
    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
    // 6 components for each of the 2*N double entries.
    if (myoff < 12*N)
      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
  }
}
// Host launcher: value-only evaluation of double-precision complex 3D
// multi-splines.
//
//   spline : device-side spline descriptor (grid inverse spacing, coefficient
//            pointer, grid dimensions, strides and spline count are read).
//            The coefficient pointer is reinterpreted as double*.
//   pos_d  : position array; the kernel reads three doubles per grid row.
//   vals_d : per-position output rows.
//   num    : grid extent in y (one grid row per position).
//
// The grid covers 2*num_splines double entries along x.  Blocks until the
// kernel has finished and aborts the process if CUDA reports an error.
//
// NOTE(review): vals_d is typed double*[] here, whereas the sibling vgh
// launcher takes complex_double*[] — confirm this matches the public
// prototype.
extern "C" void
eval_multi_multi_UBspline_3d_z_cuda (const multi_UBspline_3d_z_cuda *spline,
                                     double *pos_d, double *vals_d[], int num)
{
  // Enough blocks along x to cover 2*num_splines entries, rounding up.
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (2*spline->num_splines % SPLINE_BLOCK_SIZE != 0)
    blocks.x += 1;

  eval_multi_multi_UBspline_3d_z_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, (double*)spline->coefs, (double**)vals_d,
     spline->dim, spline->stride, spline->num_splines);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_z_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Host launcher: value / gradient / Hessian evaluation of double-precision
// complex 3D multi-splines.
//
//   spline  : device-side spline descriptor (grid inverse spacing,
//             coefficient pointer, grid dimensions, strides and spline count
//             are read).  The coefficient pointer is reinterpreted as double*.
//   pos_d   : position array; the kernel reads three doubles per grid row.
//   vals_d, grads_d, hess_d : per-position output rows, reinterpreted as
//             double** for the kernel.
//   num     : grid extent in y (one grid row per position).
//
// The grid covers 2*num_splines double entries along x.  Blocks until the
// kernel has finished and aborts the process if CUDA reports an error.
extern "C" void
eval_multi_multi_UBspline_3d_z_vgh_cuda (const multi_UBspline_3d_z_cuda *spline,
                                         double *pos_d, complex_double *vals_d[], complex_double *grads_d[],
                                         complex_double *hess_d[], int num)
{
  // Enough blocks along x to cover 2*num_splines entries, rounding up.
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (2*spline->num_splines % SPLINE_BLOCK_SIZE != 0)
    blocks.x += 1;

  eval_multi_multi_UBspline_3d_z_vgh_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, (double*)spline->coefs,
     (double**)vals_d, (double**)grads_d, (double**)hess_d,
     spline->dim, spline->stride, spline->num_splines);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_z_vgh_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
// Value / gradient / Laplacian kernel for double-precision 3D multi-splines
// whose data are handled as 2*N double entries per grid point.
//
// Work mapping: blockIdx.y picks the position (pos[3*ir .. 3*ir+2]) and the
// output rows vals[ir] / grad_lapl[ir]; blockIdx.x*SPLINE_BLOCK_SIZE +
// threadIdx.x picks the double entry `off`.  Entries with off >= 2*N perform
// no coefficient loads or output stores but still reach every barrier.
//
// Outputs for entry `off`:
//   vals[ir][off]                        value
//   grad_lapl[ir][off + 2*m*row_stride]  m = 0..2: G[m][0]*g0 + G[m][1]*g1 + G[m][2]*g2
//   grad_lapl[ir][off + 6*row_stride]    Trace(GGt*Hessian)
// where G is the 3x3 matrix read from Linv (row-major) and
// GGt[p][q] = sum_m G[m][p]*G[m][q].  The factor 2 on row_stride presumably
// reflects a stride given in complex elements — confirm with the caller.
//
// NOTE(review): every thread fills table slot (thr & 63), so the 64-entry
// weight tables are only complete when the block has at least 64 threads —
// confirm SPLINE_BLOCK_SIZE >= 64.
__global__ static void
eval_multi_multi_UBspline_3d_z_vgl_kernel
(double *pos, double3 drInv, const double *coefs, double Linv[],
 double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
 int N, int row_stride)
{
  const int block = blockIdx.x;
  const int thr   = threadIdx.x;
  const int ir    = blockIdx.y;
  const int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;

  // Thread 0 stages the position and the output row pointers for the block.
  __shared__ double *myval, *mygrad_lapl;
  __shared__ double3 r;
  if (thr == 0) {
    myval       = vals[ir];
    mygrad_lapl = grad_lapl[ir];
    r.x = pos[3*ir+0];
    r.y = pos[3*ir+1];
    r.z = pos[3*ir+2];
  }
  __syncthreads();

  // Grid cell (clamped to [0, dim-1]) and fractional coordinate per axis.
  int3 index;
  double3 t;

  const double sx = r.x * drInv.x;
  const double fx = floor(sx);
  index.x = min(max(0,(int)fx), dim.x-1);
  t.x = sx - fx;

  const double sy = r.y * drInv.y;
  const double fy = floor(sy);
  index.y = min(max(0,(int)fy), dim.y-1);
  t.y = sy - fy;

  const double sz = r.z * drInv.z;
  const double fz = floor(sz);
  index.z = min(max(0,(int)fz), dim.z-1);
  t.z = sz - fz;

  // Powers (t^3, t^2, t, 1) of the fractional coordinate for each axis.
  double4 tp[3];
  tp[0].x=t.x*t.x*t.x;  tp[0].y=t.x*t.x;  tp[0].z=t.x;  tp[0].w=1.0;
  tp[1].x=t.y*t.y*t.y;  tp[1].y=t.y*t.y;  tp[1].z=t.y;  tp[1].w=1.0;
  tp[2].x=t.z*t.z*t.z;  tp[2].y=t.z*t.z;  tp[2].z=t.z;  tp[2].w=1.0;

  // Per-axis weights: entries 0-3 are for the value, 4-7 for the first
  // derivative and 8-11 for the second derivative.
  __shared__ double a[12], b[12], c[12];
  if (thr < 12) {
    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
  }
  __syncthreads();

  // Ten 64-entry tensor-product tables, one per derivative combination.
  __shared__ double abc[640];
  const int ia   = (thr>>4)&3;
  const int ib   = (thr>>2)&3;
  const int ic   = (thr & 3);
  const int cell = 16*ia+4*ib+ic;
  abc[cell+  0] = a[ia+0]*b[ib+0]*c[ic+0]; // val
  abc[cell+ 64] = a[ia+4]*b[ib+0]*c[ic+0]; // d/dx
  abc[cell+128] = a[ia+0]*b[ib+4]*c[ic+0]; // d/dy
  abc[cell+192] = a[ia+0]*b[ib+0]*c[ic+4]; // d/dz
  abc[cell+256] = a[ia+8]*b[ib+0]*c[ic+0]; // d2/dx2
  abc[cell+320] = a[ia+4]*b[ib+4]*c[ic+0]; // d2/dxdy
  abc[cell+384] = a[ia+4]*b[ib+0]*c[ic+4]; // d2/dxdz
  abc[cell+448] = a[ia+0]*b[ib+8]*c[ic+0]; // d2/dy2
  abc[cell+512] = a[ia+0]*b[ib+4]*c[ic+4]; // d2/dydz
  abc[cell+576] = a[ia+0]*b[ib+0]*c[ic+8]; // d2/dz2
  __syncthreads();

  double v = 0.0, g0=0.0, g1=0.0, g2=0.0,
    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
  // First coefficient of this thread's entry in the selected grid cell.
  const double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
  if (off < 2*N) {
    // Contract the 4x4x4 coefficient stencil with each table, one
    // coefficient at a time.
    int n = 0;
    for (int di=0; di<4; di++) {
      for (int dj=0; dj<4; dj++) {
        const double *base = b0 + di*strides.x + dj*strides.y;
        for (int dk=0; dk<4; dk++) {
          const double coef = base[dk*strides.z];
          v   += abc[n+  0] * coef;
          g0  += abc[n+ 64] * coef;
          g1  += abc[n+128] * coef;
          g2  += abc[n+192] * coef;
          h00 += abc[n+256] * coef;
          h01 += abc[n+320] * coef;
          h02 += abc[n+384] * coef;
          h11 += abc[n+448] * coef;
          h12 += abc[n+512] * coef;
          h22 += abc[n+576] * coef;
          n += 1;
        }
      }
    }
    // Derivatives were taken with respect to the fractional coordinate;
    // rescale them by the drInv factors.
    g0  *= drInv.x;
    g1  *= drInv.y;
    g2  *= drInv.z;
    h00 *= drInv.x * drInv.x;
    h01 *= drInv.x * drInv.y;
    h02 *= drInv.x * drInv.z;
    h11 *= drInv.y * drInv.y;
    h12 *= drInv.y * drInv.z;
    h22 *= drInv.z * drInv.z;
    myval[off] = v;
  }

  // Load the 3x3 matrix G from Linv and build GGt with the first nine threads.
  __shared__ double G[3][3], GGt[3][3];
  const int i0 = threadIdx.x/3;
  const int i1 = threadIdx.x - 3*i0;
  if (threadIdx.x < 9)
    G[i0][i1] = Linv[threadIdx.x];
  __syncthreads();
  if (threadIdx.x < 9)
    GGt[i0][i1] = (G[0][i0]*G[0][i1] +
                   G[1][i0]*G[1][i1] +
                   G[2][i0]*G[2][i1]);
  __syncthreads();

  if (off < 2*N) {
    // Store gradients back to global memory
    mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
    mygrad_lapl[off+2*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
    mygrad_lapl[off+4*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
    // Store laplacians back to global memory
    // Hessian = H00 H01 H02 H11 H12 H22
    // Matrix = [0 1 2]
    //          [1 3 4]
    //          [2 4 5]
    // laplacian = Trace(GGt*Hessian)
    mygrad_lapl[off+6*row_stride] =
      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
  }
}
// Host launcher: value / gradient / Laplacian evaluation of double-precision
// complex 3D multi-splines.
//
//   spline      : device-side spline descriptor (grid inverse spacing,
//                 coefficient pointer, grid dimensions, strides and spline
//                 count are read).  The coefficient pointer is reinterpreted
//                 as double*.
//   pos_d       : position array; the kernel reads three doubles per grid row.
//   Linv_d      : nine doubles read by the kernel as a row-major 3x3 matrix.
//   vals_d      : per-position output rows for the values.
//   grad_lapl_d : per-position output rows for gradients and Laplacian.
//   num         : grid extent in y (one grid row per position).
//   row_stride  : forwarded to the kernel, which spaces output rows
//                 2*row_stride doubles apart.
//
// The grid covers 2*num_splines double entries along x.  Blocks until the
// kernel has finished and aborts the process if CUDA reports an error.
//
// NOTE(review): vals_d / grad_lapl_d are typed double*[] here, whereas the
// sibling vgh launcher takes complex_double*[] — confirm this matches the
// public prototype.
extern "C" void
eval_multi_multi_UBspline_3d_z_vgl_cuda
(const multi_UBspline_3d_z_cuda *spline, double *pos_d, double *Linv_d,
 double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
{
  // Enough blocks along x to cover 2*num_splines entries, rounding up.
  dim3 threads(SPLINE_BLOCK_SIZE);
  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
  if (2*spline->num_splines % SPLINE_BLOCK_SIZE != 0)
    blocks.x += 1;

  eval_multi_multi_UBspline_3d_z_vgl_kernel<<<blocks,threads>>>
    (pos_d, spline->gridInv, (double*)spline->coefs, Linv_d, (double**)vals_d,
     (double**)grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);

  // Synchronize so that launch and execution failures are reported here.
  cudaThreadSynchronize();
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    return;

  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_z_vgl_cuda:\n %s\n",
           cudaGetErrorString(status));
  abort();
}
#endif

View File

@ -0,0 +1,115 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef MULTI_BSPLINE_EVAL_C_H
#define MULTI_BSPLINE_EVAL_C_H
#include <math.h>
#include <stdio.h>
#include "multi_bspline_structs.h"
/************************************************************/
/* 1D single-precision, complex evaluation functions */
/************************************************************/
/* All four functions take the multi-spline object and the evaluation */
/* coordinate x.  The restrict-qualified arrays are presumably */
/* caller-allocated output buffers (one set of entries per spline) -- */
/* TODO confirm sizes against the implementation. */

/* Values only. */
void
eval_multi_UBspline_1d_c (const multi_UBspline_1d_c *spline,
double x,
complex_float* restrict vals);
/* Values and gradients. */
void
eval_multi_UBspline_1d_c_vg (const multi_UBspline_1d_c *spline,
double x,
complex_float* restrict vals,
complex_float* restrict grads);
/* Values, gradients and Laplacians. */
void
eval_multi_UBspline_1d_c_vgl (const multi_UBspline_1d_c *spline,
double x,
complex_float* restrict vals,
complex_float* restrict grads,
complex_float* restrict lapl);
/* Values, gradients and Hessians. */
void
eval_multi_UBspline_1d_c_vgh (const multi_UBspline_1d_c *spline,
double x,
complex_float* restrict vals,
complex_float* restrict grads,
complex_float* restrict hess);
/************************************************************/
/* 2D single-precision, complex evaluation functions */
/************************************************************/
/* All four functions take the multi-spline object and the evaluation */
/* coordinates (x, y).  The restrict-qualified arrays are presumably */
/* caller-allocated output buffers (one set of entries per spline) -- */
/* TODO confirm sizes against the implementation. */

/* Values only. */
void
eval_multi_UBspline_2d_c (const multi_UBspline_2d_c *spline,
double x, double y,
complex_float* restrict vals);
/* Values and gradients. */
void
eval_multi_UBspline_2d_c_vg (const multi_UBspline_2d_c *spline,
double x, double y,
complex_float* restrict vals,
complex_float* restrict grads);
/* Values, gradients and Laplacians. */
void
eval_multi_UBspline_2d_c_vgl (const multi_UBspline_2d_c *spline,
double x, double y,
complex_float* restrict vals,
complex_float* restrict grads,
complex_float* restrict lapl);
/* Values, gradients and Hessians. */
void
eval_multi_UBspline_2d_c_vgh (const multi_UBspline_2d_c *spline,
double x, double y,
complex_float* restrict vals,
complex_float* restrict grads,
complex_float* restrict hess);
/************************************************************/
/* 3D single-precision, complex evaluation functions */
/************************************************************/
/* All four functions take the multi-spline object and the evaluation */
/* coordinates (x, y, z).  The restrict-qualified arrays are presumably */
/* caller-allocated output buffers (one set of entries per spline) -- */
/* TODO confirm sizes against the implementation. */

/* Values only. */
void
eval_multi_UBspline_3d_c (const multi_UBspline_3d_c *spline,
double x, double y, double z,
complex_float* restrict vals);
/* Values and gradients. */
void
eval_multi_UBspline_3d_c_vg (const multi_UBspline_3d_c *spline,
double x, double y, double z,
complex_float* restrict vals,
complex_float* restrict grads);
/* Values, gradients and Laplacians. */
void
eval_multi_UBspline_3d_c_vgl (const multi_UBspline_3d_c *spline,
double x, double y, double z,
complex_float* restrict vals,
complex_float* restrict grads,
complex_float* restrict lapl);
/* Values, gradients and Hessians. */
void
eval_multi_UBspline_3d_c_vgh (const multi_UBspline_3d_c *spline,
double x, double y, double z,
complex_float* restrict vals,
complex_float* restrict grads,
complex_float* restrict hess);
#endif

View File

@ -0,0 +1,140 @@
#ifndef MULTI_BSPLINE_EVAL_CUDA_H
#define MULTI_BSPLINE_EVAL_CUDA_H
#include "multi_bspline_structs_cuda.h"
////////
// 1D //
////////
// Host-callable launchers for 1D multi-splines.  Parameter names ending in
// _d are presumably device pointers; `num` is presumably the number of
// positions evaluated per call -- TODO confirm against the implementations.
// Note that the single-precision real vgl launcher takes separate gradient
// and Laplacian arrays, while the other vgl launchers take one combined
// grad_lapl array plus a row_stride.

// Single-precision real
extern "C" void
eval_multi_multi_UBspline_1d_s_cuda
(const multi_UBspline_1d_s_cuda *spline, float *pos_d, float *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_1d_s_vgl_cuda
(const multi_UBspline_1d_s_cuda *spline, float *pos_d,
float *vals_d[], float *grads_d[], float *lapl_d[], int num);
// Double-precision real
extern "C" void
eval_multi_multi_UBspline_1d_d_cuda
(const multi_UBspline_1d_d_cuda *spline, double *pos_d, double *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_1d_d_vgl_cuda
(const multi_UBspline_1d_d_cuda *spline, double *pos_d,
double *vals_d[], double *grad_lapl_d[], int num, int row_stride);
// Single-precision complex
extern "C" void
eval_multi_multi_UBspline_1d_c_cuda
(const multi_UBspline_1d_c_cuda *spline,
float *pos_d, complex_float *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_1d_c_vgl_cuda
(const multi_UBspline_1d_c_cuda *spline, float *pos_d,
complex_float *vals_d[], complex_float *grad_lapl_d[], int num, int row_stride);
// Double-precision complex
extern "C" void
eval_multi_multi_UBspline_1d_z_cuda
(const multi_UBspline_1d_z_cuda *spline,
double *pos_d, complex_double *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_1d_z_vgl_cuda
(const multi_UBspline_1d_z_cuda *spline, double *pos_d,
complex_double *vals_d[], complex_double *grad_lapl_d[], int num, int row_stride);
////////
// 3D //
////////
// Host-callable launchers for 3D multi-splines.  Parameter names ending in
// _d are presumably device pointers; `num` is presumably the number of
// positions evaluated per call -- TODO confirm against the implementations.
// Suffixes: no suffix = values only, _vgh = values/gradients/Hessians,
// _vgl = values plus a combined gradient/Laplacian array (takes Linv_d and
// row_stride), _sign = variant taking an additional sign_d array.

// Single-precision real
extern "C" void
eval_multi_multi_UBspline_3d_s_cuda
(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_s_sign_cuda
(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d,
float *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_s_vgh_cuda
(const multi_UBspline_3d_s_cuda *spline,
float *pos_d, float *vals_d[], float *grads_d[], float *hess_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_s_vgl_cuda
(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *Linv_d,
float *vals_d[], float *grad_lapl_d[], int num, int row_stride);
extern "C" void
eval_multi_multi_UBspline_3d_s_vgl_sign_cuda
(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, float *Linv_d,
float *vals_d[], float *grad_lapl_d[], int num, int row_stride);
// Double-precision real
extern "C" void
eval_multi_multi_UBspline_3d_d_cuda
(const multi_UBspline_3d_d_cuda *spline, double *pos_d, double *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_d_vgh_cuda
(const multi_UBspline_3d_d_cuda *spline,
double *pos_d, double *vals_d[], double *grads_d[], double *hess_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_d_vgl_cuda
(const multi_UBspline_3d_d_cuda *spline, double *pos_d, double *Linv_d,
double *vals_d[], double *grad_lapl_d[], int num, int row_stride);
// Single-precision complex
extern "C" void
eval_multi_multi_UBspline_3d_c_cuda
(const multi_UBspline_3d_c_cuda *spline,
float *pos_d, complex_float *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_c_vgh_cuda
(const multi_UBspline_3d_c_cuda *spline, float *pos_d,
complex_float *vals_d[], complex_float *grads_d[],
complex_float *hess_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_c_vgl_cuda
(const multi_UBspline_3d_c_cuda *spline, float *pos_d, float *Linv_d,
complex_float *vals_d[], complex_float *grad_lapl_d[], int num, int row_stride);
// Double-precision complex
extern "C" void
eval_multi_multi_UBspline_3d_z_cuda
(const multi_UBspline_3d_z_cuda *spline,
double *pos_d, complex_double *vals_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_z_vgh_cuda
(const multi_UBspline_3d_z_cuda *spline, double *pos_d,
complex_double *vals_d[], complex_double *grads_d[],
complex_double *hess_d[], int num);
extern "C" void
eval_multi_multi_UBspline_3d_z_vgl_cuda
(const multi_UBspline_3d_z_cuda *spline, double *pos_d, double *Linv_d,
complex_double *vals_d[], complex_double *grad_lapl_d[], int num, int row_stride);
#endif

View File

@ -0,0 +1,459 @@
#define BLOCK_SIZE 64
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
#include <cuda.h>
#include <cutil.h>
#include <multithreading.h>
// Applies one fixed 4x4x4 weight stencil to many coefficient columns at once.
//
// Each thread owns the column `offset = blockIdx.x*BLOCK_SIZE + threadIdx.x`
// and computes
//     vals[offset] = sum over i,j,k in [0,4) of
//                    abc[16*i+4*j+k]
//                    * coefs[(ix+i)*xs + (iy+j)*ys + (iz+k)*zs + offset]
//
//   coefs      coefficient table (read-only)
//   abc        the 64 stencil weights
//   vals       output, one value per thread
//   ix,iy,iz   lowest grid index of the stencil along each axis
//   xs,ys,zs   element strides of the three grid axes inside coefs
//   N          not used; no bounds check is performed, so the launch must not
//              create more threads than there are columns
//
// Launch with exactly 64 threads per block: every thread copies one weight
// into the 64-entry shared cache, so fewer threads would leave part of the
// cache unset and more would index past its end.
__global__ void
eval_multi_UBspline_3d_cuda_c (const float *coefs, float *abc, float *vals,
                               int ix, int iy, int iz,
                               int xs, int ys, int zs, int N)
{
  int block  = blockIdx.x;
  int thr    = threadIdx.x;
  int offset = block*BLOCK_SIZE + thr;

  // Cache the weights in shared memory; the barrier makes every thread's
  // store visible before any thread starts reading the cache.
  __shared__ float abcs[64];
  abcs[thr] = abc[thr];
  __syncthreads();

  float val = 0.0f;
  for (int i=0; i<4; i++)
    for (int j=0; j<4; j++)
      for (int k=0; k<4; k++) {
        // coefs is const, so the pointer derived from it must be const too
        // (the previous non-const pointer discarded the qualifier).
        const float *base_addr = coefs + (ix+i)*xs + (iy+j)*ys + (iz+k)*zs;
        val += abcs[16*i+4*j+k] * base_addr[offset];
      }
  vals[offset] = val;
}
// Three 16-entry tables declared in CUDA constant memory. A is read by
// eval_multi_multi_UBspline_3d_cuda_c as a row-major 4x4 matrix applied to
// (1, t, t^2, t^3); dA and d2A are not referenced by the code visible here.
// NOTE(review): nothing visible in this file writes these tables (e.g. via
// cudaMemcpyToSymbol) -- confirm they are filled before results are trusted.
__constant__ float A[16], dA[16], d2A[16];
// Evaluates many coefficient columns at many positions in one launch.
//
// Grid layout expected by the indexing below:
//   blockDim.x == 64 (== BLOCK_SIZE)   one thread per column within a block
//   gridDim.x                          number of 64-column blocks
//   gridDim.y                          number of positions
//
//   pos          positions, 4 floats per position (the first three are read
//                as x, y, z). A whole 16-position group (64 floats) is staged
//                at a time, so pos must be readable up to the end of the group
//                that contains the last position.
//   drInv        per-axis factor that converts a coordinate to grid units
//   coefs_real,
//   coefs_imag   read-only coefficient tables, indexed with `strides`
//   vals_real,
//   vals_imag    outputs; position ir writes row ir, each row being
//                gridDim.x*BLOCK_SIZE floats long
//   strides      element strides of the three grid axes in the coefficient
//                tables
//
// For each axis the coordinate is scaled by drInv and split into an integer
// grid index and a fractional part t; the constant table A turns
// (1, t, t^2, t^3) into four weights per axis, and the 64 products of those
// weights are applied to the 4x4x4 block of coefficients starting at the grid
// index.
__global__ static void
eval_multi_multi_UBspline_3d_cuda_c (float *pos, float3 drInv,
                                     const float *coefs_real, const float *coefs_imag,
                                     float *vals_real, float *vals_imag,
                                     int3 strides)
{
  int block  = blockIdx.x;
  int thr    = threadIdx.x;
  int ir     = blockIdx.y;
  int offset = block*BLOCK_SIZE + thr;
  // Length of one output row. This was hard-coded as 128, which is what
  // gridDim.x*BLOCK_SIZE evaluates to for the launch in this file.
  int row_len = (int)gridDim.x*BLOCK_SIZE;

  __shared__ float abc[64];
  __shared__ float pos_s[BLOCK_SIZE];
  __shared__ float a[4], b[4], c[4];

  // Stage the 64-float group (16 positions x 4 floats) containing position
  // ir, then pick this block's position out of it.
  int ir1 = (ir >> 4)*64;
  int ir2 = (ir & 15)*4;
  pos_s[thr] = pos[ir1+thr];
  __syncthreads();

  float3 r;
  r.x = pos_s[ir2+0];
  r.y = pos_s[ir2+1];
  r.z = pos_s[ir2+2];

  // Split each scaled coordinate into grid index and fractional part.
  int3   index;
  float3 t;
  float  s, sf;
  s = r.x * drInv.x;  sf = floor(s);  index.x = (int)sf;  t.x = s - sf;
  s = r.y * drInv.y;  sf = floor(s);  index.y = (int)sf;  t.y = s - sf;
  s = r.z * drInv.z;  sf = floor(s);  index.z = (int)sf;  t.z = s - sf;

  float4 tp[3];
  tp[0] = make_float4(1.0f, t.x, t.x*t.x, t.x*t.x*t.x);
  tp[1] = make_float4(1.0f, t.y, t.y*t.y, t.y*t.y*t.y);
  tp[2] = make_float4(1.0f, t.z, t.z*t.z, t.z*t.z*t.z);

  // One thread computes the per-axis weights: row m of the 4x4 matrix A
  // dotted with the powers of t.
  if (thr == 0) {
    for (int m=0; m<4; m++) {
      a[m] = A[4*m+0]*tp[0].x + A[4*m+1]*tp[0].y + A[4*m+2]*tp[0].z + A[4*m+3]*tp[0].w;
      b[m] = A[4*m+0]*tp[1].x + A[4*m+1]*tp[1].y + A[4*m+2]*tp[1].z + A[4*m+3]*tp[1].w;
      c[m] = A[4*m+0]*tp[2].x + A[4*m+1]*tp[2].y + A[4*m+2]*tp[2].z + A[4*m+3]*tp[2].w;
    }
  }
  // Required: a/b/c are written by thread 0 only and read by every thread just
  // below. Without this barrier the other threads could read them before they
  // are written.
  __syncthreads();

  // Thread thr builds the product weight for stencil point (ai, bj, ck).
  int ai = (thr>>4)&3;
  int bj = (thr>>2)&3;
  int ck = (thr & 3);
  abc[thr] = a[ai]*b[bj]*c[ck];
  __syncthreads();

  float val_real = 0.0f;
  float val_imag = 0.0f;
  for (int i=0; i<4; i++) {
    for (int j=0; j<4; j++) {
      int base = (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
      // The tables are const, so the row pointers must be const as well.
      const float *base_real = coefs_real + base;
      const float *base_imag = coefs_imag + base;
      for (int k=0; k<4; k++) {
        float w = abc[16*i+4*j+k];
        val_real += w * base_real[offset+k*strides.z];
        val_imag += w * base_imag[offset+k*strides.z];
      }
    }
  }
  vals_real[offset+ir*row_len] = val_real;
  vals_imag[offset+ir*row_len] = val_imag;
}
// __global__ void
// eval_multi_UBspline_3d_cuda_c2 (float3 r,
// float *coefs, float *vals,
// int xs, int ys, int zs, int N)
// {
// int block = blockIdx.x;
// int thr = threadIdx.x;
// __shared__ float abcs[64];
// abcs[thr] = abc[thr];
// float dxInv = 0.0625f;
// float v, dv;
// v = floor(dxInv*r.x);
// dv = dxInv*r.x - v;
// int ix = (int) v;
// v = floor(dxInv*r.x);
// dv = dxInv*r.x - v;
// int iy = (int) v;
// v = floor(dxInv*r.y);
// dv = dxInv*r.y - v;
// int iz = (int) v;
// int offset = block*BLOCK_SIZE+thr;
// __shared__ float abcs[64];
// abcs[thr] = abc[thr];
// float val= 0.0;
// //int index=0;
// val = 0.0;
// for (int i=0; i<4; i++)
// for (int j=0; j<4; j++)
// for (int k=0; k<4; k++) {
// float *base_addr = coefs + (ix+i)*xs + (iy+j)*ys + (iz+k)*zs;
// //val += abc[(16*i+4*j+k)*BLOCK_SIZE + thr] * base_addr[offset];
// val += abcs[16*i+4*j+k] * base_addr[offset];
// //index++;
// }
// vals[offset] = val;
// }
// Benchmark and spot check for the eval_multi_UBspline_3d_cuda_c kernel.
//
// Fills a 16x16x16 grid of 4096 columns with random coefficients, applies a
// random 64-weight stencil at grid point (1,2,3) on the GPU 100000 times,
// reports the evaluation rate on stderr, then recomputes the result on the
// host and prints the first N/256 GPU/host value pairs side by side.
//
// Takes no arguments and returns nothing; on a host allocation failure it
// prints a message and returns early.
void
test_cuda()
{
  const int N  = 4096;
  const int Nx = 16, Ny = 16, Nz = 16;
  // Column index n is the fastest-running dimension. The strides must include
  // the factor N so that every (ix,iy,iz,n) tuple maps to its own slot; the
  // previous values (Nx*Ny*Nz, Ny*Nz, Nz) made slots overlap.
  const int xs = Ny*Nz*N;
  const int ys = Nz*N;
  const int zs = N;
  const int numIter = 100000;
  const size_t size = (size_t)Nx*Ny*Nz*N*sizeof(float);

  float *coefs = NULL, *abc = NULL, *vals = NULL, *vals2 = NULL;
  float *coefs_d, *abc_d, *vals_d;

  // posix_memalign returns 0 on success. On Linux it leaves the pointer
  // untouched on failure, so the still-NULL pointers are safe to free.
  if (posix_memalign((void**)&coefs, 16, size)             != 0 ||
      posix_memalign((void**)&abc,   16, 64*sizeof(float)) != 0 ||
      posix_memalign((void**)&vals,  16, N*sizeof(float))  != 0 ||
      posix_memalign((void**)&vals2, 16, N*sizeof(float))  != 0) {
    fprintf (stderr, "Host allocation failed in test_cuda.\n");
    free(coefs);
    free(abc);
    free(vals);
    free(vals2);
    return;
  }

  // Random coefficients, mirrored on the device.
  CUDA_SAFE_CALL(cudaMalloc((void**)&coefs_d, size));
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++)
        for (int n=0; n<N; n++)
          coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
  CUDA_SAFE_CALL(cudaMemcpy(coefs_d, coefs, size, cudaMemcpyHostToDevice));

  // Random stencil weights; the kernel reads exactly 64 of them.
  for (int i=0; i<64; i++)
    abc[i] = drand48();
  CUDA_SAFE_CALL(cudaMalloc((void**)&abc_d, 64*sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpy(abc_d, abc, 64*sizeof(float),
                            cudaMemcpyHostToDevice));

  CUDA_SAFE_CALL(cudaMalloc((void**)&vals_d, N*sizeof(float)));

  dim3 dimBlock(BLOCK_SIZE);
  dim3 dimGrid(N/BLOCK_SIZE);
  int ix=1;
  int iy=2;
  int iz=3;

  clock_t start = clock();
  for (int i=0; i<numIter; i++) {
    eval_multi_UBspline_3d_cuda_c<<<dimGrid,dimBlock>>>
      (coefs_d, abc_d, vals_d, ix, iy, iz, xs, ys, zs, N);
  }
  // Kernel launches are asynchronous: wait for them to finish, otherwise the
  // clock only measures how long it took to queue them.
  cudaThreadSynchronize();
  clock_t end = clock();
  // The divisor is formed in double; as an integer product it can overflow.
  double time = (double)(end-start)
    / ((double)CLOCKS_PER_SEC*(double)numIter*(double)N);
  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/time);

  CUDA_SAFE_CALL(cudaMemcpy (vals, vals_d, N*sizeof(float),
                             cudaMemcpyDeviceToHost));

  // Host reference using the same weights and the same summation order.
  for (int n=0; n<N; n++) {
    vals2[n] = 0.0;
    int index=0;
    for (int i=0; i<4; i++)
      for (int j=0; j<4; j++)
        for (int k=0; k<4; k++) {
          vals2[n] += abc[index] * coefs[(ix+i)*xs+(iy+j)*ys+(iz+k)*zs+n];
          index++;
        }
  }
  for (int i=0; i<N/256; i++)
    fprintf (stderr, "%1.9f %1.9f\n", vals[i], vals2[i]);

  cudaFree(abc_d);
  cudaFree(coefs_d);
  cudaFree(vals_d);
  free(vals2);
  free(vals);
  free(abc);
  free(coefs);
}
// Benchmark for the eval_multi_multi_UBspline_3d_cuda_c kernel on one device.
//
//   thread   device ordinal smuggled through a void* (the signature matches a
//            pthread start routine)
//   returns  always NULL
//
// Builds a 64x64x64 grid of 128 columns filled with random coefficients (the
// same table is uploaded as both the real and the imaginary part), creates
// 2000 random positions, and runs 10000 rounds of "upload positions, launch
// kernel". Progress and the resulting evaluation rate go to stderr. Nothing is
// copied back or verified.
static void *
test_multi_cuda(void *thread)
{
  CUDA_SAFE_CALL(cudaSetDevice((int)(size_t)thread));
  fprintf (stderr, "In thread %p\n", thread);

  const int numWalkers = 2000;
  const int N  = 128;
  const int Nx = 64, Ny = 64, Nz = 64;
  const int xs = Ny*Nz*N;
  const int ys = Nz*N;
  const int zs = N;
  const int numIter = 10000;

  float *coefs = NULL;
  // Host-side staging copies of the per-walker output pointer tables.
  float *vals_real[numWalkers], *vals_imag[numWalkers];
  float *coefs_real_d, *coefs_imag_d;
  // Device-side pointer tables. These must be plain pointers that cudaMalloc
  // fills in; they were previously host arrays whose addresses were handed to
  // cudaMalloc/cudaMemcpy/cudaFree as if they were device memory.
  float **vals_real_d, **vals_imag_d;
  float *r_d, *r_h;

  float3 drInv;
  drInv.x = 1.0/float(Nx);
  drInv.y = 1.0/float(Ny);
  drInv.z = 1.0/float(Nz);

  // Setup Bspline coefficients
  int size = Nx*Ny*Nz*N*sizeof(float);
  // posix_memalign reports success with 0, so it cannot be wrapped in
  // CUT_SAFE_MALLOC, which (when checking is enabled) treats a zero result
  // as a failed allocation.
  if (posix_memalign((void**)&coefs, 16, size) != 0) {
    fprintf (stderr, "Host malloc failure in test_multi_cuda.\n");
    exit(EXIT_FAILURE);
  }
  for (int ix=0; ix<Nx; ix++)
    for (int iy=0; iy<Ny; iy++)
      for (int iz=0; iz<Nz; iz++)
        for (int n=0; n<N; n++)
          coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
  fprintf (stderr, "Filled in coefs.\n");
  fprintf (stderr, "size = %d\n", size);

  // Setup CUDA coefficients
  fprintf (stderr, "Before first CUDA mallocs.\n");
  CUDA_SAFE_CALL(cudaMalloc((void**)&coefs_real_d, size));
  CUDA_SAFE_CALL(cudaMalloc((void**)&coefs_imag_d, size));
  fprintf (stderr, "Before Memcpy.\n");
  CUDA_SAFE_CALL(cudaMemcpy(coefs_real_d, coefs, size, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(coefs_imag_d, coefs, size, cudaMemcpyHostToDevice));
  fprintf (stderr, "After Memcpy.\n");

  // Setup device value storage
  int numVals = 2*N*numWalkers;
  float *valBlock_d, *valBlock_h;
  CUDA_SAFE_CALL(cudaMalloc((void**)&valBlock_d, numVals*sizeof(float)));
  CUDA_SAFE_CALL(cudaMallocHost((void**)&valBlock_h, numVals*sizeof(float)));
  CUDA_SAFE_CALL(cudaMalloc((void**)&vals_real_d, numWalkers*sizeof(float*)));
  CUDA_SAFE_CALL(cudaMalloc((void**)&vals_imag_d, numWalkers*sizeof(float*)));
  fprintf (stderr, "valBlock_d = %p\n", (void*)valBlock_d);
  for (int i=0; i<numWalkers; i++) {
    vals_real[i] = valBlock_d + 2*i*N;
    vals_imag[i] = valBlock_d + (2*i+1)*N;
  }
  // The pointer tables are uploaded but the launch below does not use them;
  // it writes straight into the two halves of valBlock_d.
  CUDA_SAFE_CALL(cudaMemcpy(vals_real_d, vals_real, numWalkers*sizeof(float*),
                            cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy(vals_imag_d, vals_imag, numWalkers*sizeof(float*),
                            cudaMemcpyHostToDevice));
  fprintf (stderr, "Finished cuda allocations.\n");

  // Setup walker positions: 4 floats per walker, of which the kernel reads
  // the first three.
  CUDA_SAFE_CALL(cudaMalloc((void**)&r_d, 4*numWalkers*sizeof(float)));
  CUDA_SAFE_CALL(cudaMallocHost((void**)&r_h, 4*numWalkers*sizeof(float)));
  for (int ir=0; ir<numWalkers; ir++) {
    r_h[4*ir+0] = 0.75*drand48();
    r_h[4*ir+1] = 0.75*drand48();
    r_h[4*ir+2] = 0.75*drand48();
    r_h[4*ir+3] = 0.0f;   // padding slot; set so no uninitialised data is uploaded
  }

  int3 strides;
  strides.x = xs;
  strides.y = ys;
  strides.z = zs;
  dim3 dimBlock(BLOCK_SIZE);
  dim3 dimGrid(N/BLOCK_SIZE,numWalkers);

  clock_t start = clock();
  for (int i=0; i<numIter; i++) {
    if ((i%1000) == 0)
      fprintf (stderr, "i = %d\n", i);
    CUDA_SAFE_CALL(cudaMemcpy(r_d, r_h, 4*numWalkers*sizeof(float),
                              cudaMemcpyHostToDevice));
    eval_multi_multi_UBspline_3d_cuda_c<<<dimGrid,dimBlock>>>
      (r_d, drInv, coefs_real_d, coefs_imag_d,
       valBlock_d, valBlock_d+numVals/2, strides);
  }
  // The last launch is still running when the loop ends; wait for it so the
  // timing covers all of the work.
  cudaThreadSynchronize();
  clock_t end = clock();
  double time = (double)(end-start)
    / ((double)CLOCKS_PER_SEC*(double)numIter*N*numWalkers);
  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/time);

  cudaFree (valBlock_d);
  cudaFree (vals_real_d);
  cudaFree (vals_imag_d);
  cudaFree (coefs_real_d);
  cudaFree (coefs_imag_d);
  cudaFree (r_d);
  // Host-side buffers were previously leaked.
  cudaFreeHost (valBlock_h);
  cudaFreeHost (r_h);
  free (coefs);
  return NULL;
}
// Program entry point: lists the CUDA devices with a few of their properties
// on stderr, then runs the test_multi_cuda benchmark on device 0.
// Returns 0 after the benchmark has run, 1 when no CUDA device is available.
int
main()
{
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  fprintf (stderr, "Detected %d CUDA devices.\n", deviceCount);

  for (int device = 0; device < deviceCount; ++device) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, device);
    fprintf (stderr, "Device %d:\n", device);
    // totalGlobalMem and totalConstMem are size_t; printing them with %d is
    // undefined behaviour, so they are cast and printed as unsigned long.
    fprintf (stderr, " Global memory: %10lu\n",
             (unsigned long)deviceProp.totalGlobalMem);
    fprintf (stderr, " MultiProcessors: %10d\n",
             deviceProp.multiProcessorCount);
    fprintf (stderr, " Registers: %10d\n",
             deviceProp.regsPerBlock);
    fprintf (stderr, " Constant memory: %10lu\n",
             (unsigned long)deviceProp.totalConstMem);
  }

  if (deviceCount < 1) {
    fprintf (stderr, "No CUDA device available; nothing to run.\n");
    return 1;
  }

  // Only device 0 is exercised; the benchmark takes the device ordinal as a
  // void* so that it could also serve as a thread start routine.
  test_multi_cuda((void*)0);
  return 0;
}

View File

@ -0,0 +1,120 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef MULTI_BSPLINE_EVAL_D_H
#define MULTI_BSPLINE_EVAL_D_H
#include <math.h>
#include <stdio.h>
#include "multi_bspline_structs.h"
/************************************************************/
/* 1D double-precision, real evaluation functions */
/************************************************************/
/* Prototypes only; the definitions are not in this header.
 * Each routine takes the spline object (read-only), the coordinate x, and
 * caller-supplied double output arrays. The `restrict` qualifiers promise
 * that the output arrays do not alias one another, so callers must pass
 * distinct buffers.
 * NOTE(review): the suffixes presumably stand for v = values, g = gradients,
 * l = Laplacians, h = Hessians; the required array lengths and layouts are
 * not visible here -- confirm against the implementation. */
/* Output array: vals. */
void
eval_multi_UBspline_1d_d (const multi_UBspline_1d_d *spline,
double x,
double* restrict vals);
/* Output arrays: vals, grads. */
void
eval_multi_UBspline_1d_d_vg (const multi_UBspline_1d_d *spline,
double x,
double* restrict vals,
double* restrict grads);
/* Output arrays: vals, grads, lapl. */
void
eval_multi_UBspline_1d_d_vgl (const multi_UBspline_1d_d *spline,
double x,
double* restrict vals,
double* restrict grads,
double* restrict lapl);
/* Output arrays: vals, grads, hess. */
void
eval_multi_UBspline_1d_d_vgh (const multi_UBspline_1d_d *spline,
double x,
double* restrict vals,
double* restrict grads,
double* restrict hess);
/************************************************************/
/* 2D double-precision, real evaluation functions */
/************************************************************/
/* Prototypes only; the definitions are not in this header.
 * Each routine takes the spline object (read-only), the coordinates (x, y),
 * and caller-supplied double output arrays. The `restrict` qualifiers promise
 * that the output arrays do not alias one another, so callers must pass
 * distinct buffers.
 * NOTE(review): the suffixes presumably stand for v = values, g = gradients,
 * l = Laplacians, h = Hessians; the required array lengths and layouts are
 * not visible here -- confirm against the implementation. */
/* Output array: vals. */
void
eval_multi_UBspline_2d_d (const multi_UBspline_2d_d *spline,
double x, double y,
double* restrict vals);
/* Output arrays: vals, grads. */
void
eval_multi_UBspline_2d_d_vg (const multi_UBspline_2d_d *spline,
double x, double y,
double* restrict vals,
double* restrict grads);
/* Output arrays: vals, grads, lapl. */
void
eval_multi_UBspline_2d_d_vgl (const multi_UBspline_2d_d *spline,
double x, double y,
double* restrict vals,
double* restrict grads,
double* restrict lapl);
/* Output arrays: vals, grads, hess. */
void
eval_multi_UBspline_2d_d_vgh (const multi_UBspline_2d_d *spline,
double x, double y,
double* restrict vals,
double* restrict grads,
double* restrict hess);
/************************************************************/
/* 3D double-precision, real evaluation functions */
/************************************************************/
/* Prototypes only; the definitions are not in this header.
 * Each routine takes the spline object (read-only), the coordinates
 * (x, y, z), and caller-supplied double output arrays. The `restrict`
 * qualifiers promise that the output arrays do not alias one another, so
 * callers must pass distinct buffers.
 * NOTE(review): the suffixes presumably stand for v = values, g = gradients,
 * l = Laplacians, h = Hessians, and "ghgh" for the additional gradhess
 * output; the required array lengths and layouts are not visible here --
 * confirm against the implementation. */
/* Output array: vals. */
void
eval_multi_UBspline_3d_d (const multi_UBspline_3d_d *spline,
double x, double y, double z,
double* restrict vals);
/* Output arrays: vals, grads. */
void
eval_multi_UBspline_3d_d_vg (const multi_UBspline_3d_d *spline,
double x, double y, double z,
double* restrict vals,
double* restrict grads);
/* Output arrays: vals, grads, lapl. */
void
eval_multi_UBspline_3d_d_vgl (const multi_UBspline_3d_d *spline,
double x, double y, double z,
double* restrict vals,
double* restrict grads,
double* restrict lapl);
/* Output arrays: vals, grads, hess. */
void
eval_multi_UBspline_3d_d_vgh (const multi_UBspline_3d_d *spline,
double x, double y, double z,
double* restrict vals,
double* restrict grads,
double* restrict hess);
/* Output arrays: vals, grads, hess, gradhess. */
void
eval_multi_UBspline_3d_d_vghgh (const multi_UBspline_3d_d *spline,
double x, double y, double z,
double* restrict vals,
double* restrict grads,
double* restrict hess,
double* restrict gradhess);
#endif

View File

@ -0,0 +1,110 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef MULTI_BSPLINE_EVAL_S_H
#define MULTI_BSPLINE_EVAL_S_H
#include "multi_bspline_structs.h"
/************************************************************/
/* 1D single-precision, real evaluation functions */
/************************************************************/
/* Prototypes only; the definitions are not in this header.
 * Each routine takes the spline object (read-only), the coordinate x, and
 * caller-supplied float output arrays. Note that the coordinate is passed as
 * double even though the outputs are single precision. The `restrict`
 * qualifiers promise that the output arrays do not alias one another, so
 * callers must pass distinct buffers.
 * NOTE(review): the suffixes presumably stand for v = values, g = gradients,
 * l = Laplacians, h = Hessians; the required array lengths and layouts are
 * not visible here -- confirm against the implementation. */
/* Output array: vals. */
void
eval_multi_UBspline_1d_s (const multi_UBspline_1d_s *spline,
double x,
float* restrict vals);
/* Output arrays: vals, grads. */
void
eval_multi_UBspline_1d_s_vg (const multi_UBspline_1d_s *spline,
double x,
float* restrict vals,
float* restrict grads);
/* Output arrays: vals, grads, lapl. */
void
eval_multi_UBspline_1d_s_vgl (const multi_UBspline_1d_s *spline,
double x,
float* restrict vals,
float* restrict grads,
float* restrict lapl);
/* Output arrays: vals, grads, hess. */
void
eval_multi_UBspline_1d_s_vgh (const multi_UBspline_1d_s *spline,
double x,
float* restrict vals,
float* restrict grads,
float* restrict hess);
/************************************************************/
/* 2D single-precision, real evaluation functions */
/************************************************************/
/* Prototypes only; the definitions are not in this header.
 * Each routine takes the spline object (read-only), the coordinates (x, y),
 * and caller-supplied float output arrays. Note that the coordinates are
 * passed as double even though the outputs are single precision. The
 * `restrict` qualifiers promise that the output arrays do not alias one
 * another, so callers must pass distinct buffers.
 * NOTE(review): the suffixes presumably stand for v = values, g = gradients,
 * l = Laplacians, h = Hessians; the required array lengths and layouts are
 * not visible here -- confirm against the implementation. */
/* Output array: vals. */
void
eval_multi_UBspline_2d_s(const multi_UBspline_2d_s *spline,
double x, double y,
float* restrict vals);
/* Output arrays: vals, grads. */
void
eval_multi_UBspline_2d_s_vg (const multi_UBspline_2d_s *spline,
double x, double y,
float* restrict vals,
float* restrict grads);
/* Output arrays: vals, grads, lapl. */
void
eval_multi_UBspline_2d_s_vgl (const multi_UBspline_2d_s *spline,
double x, double y,
float* restrict vals,
float* restrict grads,
float* restrict lapl);
/* Output arrays: vals, grads, hess. */
void
eval_multi_UBspline_2d_s_vgh (const multi_UBspline_2d_s *spline,
double x, double y,
float* restrict vals,
float* restrict grads,
float* restrict hess);
/************************************************************/
/* 3D single-precision, real evaluation functions */
/************************************************************/
/* Prototypes only; the definitions are not in this header.
 * Each routine takes the spline object (read-only), the coordinates
 * (x, y, z), and caller-supplied float output arrays. Note that the
 * coordinates are passed as double even though the outputs are single
 * precision. The `restrict` qualifiers promise that the output arrays do not
 * alias one another, so callers must pass distinct buffers.
 * NOTE(review): the suffixes presumably stand for v = values, g = gradients,
 * l = Laplacians, h = Hessians; the required array lengths and layouts are
 * not visible here -- confirm against the implementation. */
/* Output array: vals. */
void
eval_multi_UBspline_3d_s (const multi_UBspline_3d_s *spline,
double x, double y, double z,
float* restrict vals);
/* Output arrays: vals, grads. */
void
eval_multi_UBspline_3d_s_vg (const multi_UBspline_3d_s *spline,
double x, double y, double z,
float* restrict vals,
float* restrict grads);
/* Output arrays: vals, grads, lapl. */
void
eval_multi_UBspline_3d_s_vgl (const multi_UBspline_3d_s *spline,
double x, double y, double z,
float* restrict vals,
float* restrict grads,
float* restrict lapl);
/* Output arrays: vals, grads, hess. */
void
eval_multi_UBspline_3d_s_vgh (const multi_UBspline_3d_s *spline,
double x, double y, double z,
float* restrict vals,
float* restrict grads,
float* restrict hess);
#endif

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std3_d_impl.h"

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std3_d_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_c_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_c_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_d_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_d_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_s_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_s_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_z_impl.h"

View File

@ -0,0 +1,119 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#ifndef MULTI_BSPLINE_EVAL_SSE_Z_H
#define MULTI_BSPLINE_EVAL_SSE_Z_H
/************************************************************/
/* 1D double-precision, complex evaluation functions */
/************************************************************/
/* Prototypes only; the definitions are not in this header.
 * Each routine takes the spline object, the coordinate x, and caller-supplied
 * complex_double output arrays. The `restrict` qualifiers promise that the
 * output arrays do not alias one another, so callers must pass distinct
 * buffers.
 * NOTE(review): the spline pointer is not const-qualified in these
 * prototypes; whether the routines modify the spline cannot be told from
 * here. The suffixes presumably stand for v = values, g = gradients,
 * l = Laplacians, h = Hessians; the required array lengths and layouts are
 * not visible here -- confirm against the implementation. */
/* Output array: vals. */
void
eval_multi_UBspline_1d_z (multi_UBspline_1d_z *spline,
double x,
complex_double* restrict vals);
/* Output arrays: vals, grads. */
void
eval_multi_UBspline_1d_z_vg (multi_UBspline_1d_z *spline,
double x,
complex_double* restrict vals,
complex_double* restrict grads);
/* Output arrays: vals, grads, lapl. */
void
eval_multi_UBspline_1d_z_vgl (multi_UBspline_1d_z *spline,
double x,
complex_double* restrict vals,
complex_double* restrict grads,
complex_double* restrict lapl);
/* Output arrays: vals, grads, hess. */
void
eval_multi_UBspline_1d_z_vgh (multi_UBspline_1d_z *spline,
double x,
complex_double* restrict vals,
complex_double* restrict grads,
complex_double* restrict hess);
/************************************************************/
/* 2D double-precision, complex evaulation functions */
/************************************************************/
void
eval_multi_UBspline_2d_z (multi_UBspline_2d_z *spline,
double x, double y,
complex_double* restrict vals);
void
eval_multi_UBspline_2d_z_vg (multi_UBspline_2d_z *spline,
double x, double y,
complex_double* restrict vals,
complex_double* restrict grads);
void
eval_multi_UBspline_2d_z_vgl (multi_UBspline_2d_z *spline,
double x, double y,
complex_double* restrict vals,
complex_double* restrict grads,
complex_double* restrict lapl);
void
eval_multi_UBspline_2d_z_vgh (multi_UBspline_2d_z *spline,
double x, double y,
complex_double* restrict vals,
complex_double* restrict grads,
complex_double* restrict hess);
/************************************************************/
/* 3D double-precision, complex evaulation functions */
/************************************************************/
void
eval_multi_UBspline_3d_z (multi_UBspline_3d_z *spline,
double x, double y, double z,
complex_double* restrict vals);
void
eval_multi_UBspline_3d_z_vg (multi_UBspline_3d_z *spline,
double x, double y, double z,
complex_double* restrict vals,
complex_double* restrict grads);
void
eval_multi_UBspline_3d_z_vgl (multi_UBspline_3d_z *spline,
double x, double y, double z,
complex_double* restrict vals,
complex_double* restrict grads,
complex_double* restrict lapl);
void
eval_multi_UBspline_3d_z_vgh (multi_UBspline_3d_z *spline,
double x, double y, double z,
complex_double* restrict vals,
complex_double* restrict grads,
complex_double* restrict hess);
void
eval_multi_UBspline_3d_z_vghgh (multi_UBspline_3d_z *spline,
double x, double y, double z,
complex_double* restrict vals,
complex_double* restrict grads,
complex_double* restrict hess,
complex_double* restrict gradhess);
#endif

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_sse_z_impl.h"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std2_d_impl.h"

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std2_d_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std3_d_impl.h"

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std3_d_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std_c_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std_c_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std_d_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std_d_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std_s_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std_s_impl.h"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
/////////////////////////////////////////////////////////////////////////////
// einspline: a library for creating and evaluating B-splines //
// Copyright (C) 2007 Kenneth P. Esler, Jr. //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation; either version 2 of the License, or //
// (at your option) any later version. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 51 Franklin Street, Fifth Floor, //
// Boston, MA 02110-1301 USA //
/////////////////////////////////////////////////////////////////////////////
#include "multi_bspline_eval_std_z_impl.h"

Some files were not shown because too many files have changed in this diff Show More