Merged changes from trunk to your_branch:svn merge -r 5441:5455 https://subversion.assembla.com/svn/qmcdev/trunk

git-svn-id: https://subversion.assembla.com/svn/qmcdev/branches/OptBF@5458 e5b18d87-469d-4833-9cc0-8cdfa06e9491
2012-03-24 21:10:38 +00:00 · 2012-03-24 21:10:38 +00:00 · 2f9b51a9c2
parent 0140365fbd e2d2718994
commit 2f9b51a9c2
156 changed files with 84700 additions and 41 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -457,11 +457,9 @@ INCLUDE_DIRECTORIES( ${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src)
 # - if everything fails, do not use it
 ######################################################################
 if(HAVE_EINSPLINE)
-  if(EINSPLINE_HOME)
  SUBDIRS(src/einspline)
  INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR}/include)
  SET(QMC_UTIL_LIBS ${QMC_UTIL_LIBS} einspline)
-  endif()
 else()
  INCLUDE(${PROJECT_CMAKE}/FindEinspline.cmake)
  if(EINSPLINE_FOUND)
--- a/config/RosaGNU.cmake
+++ b/config/RosaGNU.cmake
@ -0,0 +1,48 @@
+# Build settings for the GNU compilers on a Cray XK6 system (dated 2011-12-06).
+set(CMAKE_SYSTEM_PROCESSOR "XK6")
+
+# Compiler wrappers from the Cray xt-asyncpe 5.06 installation.
+set(CMAKE_C_COMPILER  /opt/cray/xt-asyncpe/5.06/bin/cc)
+set(CMAKE_CXX_COMPILER  /opt/cray/xt-asyncpe/5.06/bin/CC)
+
+# Flag fragments that are combined into the C and C++ flag strings below.
+set(GNU_OPTS "-DADD_ -DINLINE_ALL=inline")
+set(GNU_FLAGS "-fopenmp -O3 -Drestrict=__restrict__ -finline-limit=1000 -fstrict-aliasing -funroll-all-loops -Wno-deprecated ")
+# Earlier alternatives, kept for reference:
+#   set(XT_FLAGS "-march=amdfam10 -msse3 -D_CRAYMPI")
+#   set(XT_FLAGS " -msse3 -D_CRAYMPI")
+# Interlagos is bdver1; the original note says "without it better".
+# NOTE(review): -march=bdver1 is nevertheless enabled here -- confirm intent.
+set(XT_FLAGS "-march=bdver1  -msse3 -D_CRAYMPI")
+set(CMAKE_C_FLAGS "${XT_FLAGS} ${GNU_FLAGS} -std=c99")
+set(CMAKE_CXX_FLAGS "${XT_FLAGS} ${GNU_FLAGS} -ftemplate-depth-60 ${GNU_OPTS}")
+
+# Feature switches read by the rest of the build.
+set(QMC_BUILD_STATIC 1)
+set(ENABLE_OPENMP 1)
+set(HAVE_MPI 1)
+set(HAVE_SSE 1)
+set(HAVE_SSE2 1)
+set(HAVE_SSE3 1)
+set(HAVE_SSSE3 1)
+set(USE_PREFETCH 1)
+set(PREFETCH_AHEAD 12)
+
+# Search libraries and headers only below CMAKE_FIND_ROOT_PATH; never look
+# there for programs.
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_SHARED_LINKER_FLAGS "")
+
+# Use static link flags for every link kind and both languages, including the
+# "dynamic" variants.
+foreach(link_kind SHARED_LIBRARY SHARED_MODULE EXE)
+  foreach(lang C CXX)
+    set(CMAKE_${link_kind}_LINK_STATIC_${lang}_FLAGS "-Wl,-Bstatic")
+    set(CMAKE_${link_kind}_LINK_DYNAMIC_${lang}_FLAGS "-static")
+  endforeach()
+endforeach()
+
+# Installation prefixes searched for third-party packages.
+set(CMAKE_FIND_ROOT_PATH
+  /opt/cray/hdf5/1.8.7/gnu/46
+  /opt/fftw/3.3.0.0/interlagos
+  /sw/xk6/boost/1.44.0/cle4.0_gnu4.5.3
+  /users/jnkim/xk6/libxml2
+  /apps/rosa/boost/1.47/gnu_453
+  )
+
+# einspline is enabled without pointing EINSPLINE_HOME at an external tree:
+#   set(EINSPLINE_HOME /ccs/proj/mat034/jnkim/share/einspline)
+set(HAVE_EINSPLINE 1)
+set(HAVE_EINSPLINE_EXT 0)
+link_libraries(/usr/lib64/libz.a)
--- a/src/OOMPI/stamp-h.in
+++ b/src/OOMPI/stamp-h.in
--- a/src/QMCWaveFunctions/EinsplineSet.cpp
+++ b/src/QMCWaveFunctions/EinsplineSet.cpp
@ -1817,7 +1817,7 @@ namespace qmcplusplus {
      }
      int psiIndex(0);
        for (int j=0; j<NumValenceOrbs; j++) {
-cerr<<psiIndex<<" "<<i<<" "<<j<<endl;
+//cerr<<psiIndex<<" "<<i<<" "<<j<<endl;
          psi(i,psiIndex)=real(StorageValueVector[j]);
          for (int n=0; n<OHMMS_DIM; n++)
            dpsi(i,psiIndex)[n] = real(StorageGradVector[j][n]);
--- a/src/QMCWaveFunctions/Fermion/BackflowBuilder.cpp
+++ b/src/QMCWaveFunctions/Fermion/BackflowBuilder.cpp
@ -46,7 +46,6 @@ namespace qmcplusplus

  BackflowBuilder::~BackflowBuilder() 
  {
-    delete myHandler;
  } 

    bool BackflowBuilder::put(xmlNodePtr cur)
--- a/src/einspline/CMakeLists.txt
+++ b/src/einspline/CMakeLists.txt
@ -25,71 +25,71 @@ SET(HFILES
 bspline_base_cuda.h
 )

-FOREACH(a ${HFILES})
-  #INSTALL(FILES ${EINSPLINE_HOME}/src/${a} DESTINATION ${PROJECT_BINARY_DIR}/include/einspline)
-  configure_file(${EINSPLINE_HOME}/src/${a} ${PROJECT_BINARY_DIR}/include/einspline/${a} COPYONLY)
-ENDFOREACH()
+#FOREACH(a ${HFILES})
+#  #INSTALL(FILES ${EINSPLINE_HOME}/src/${a} DESTINATION ${PROJECT_BINARY_DIR}/include/einspline)
+#  configure_file(${EINSPLINE_HOME}/src/${a} ${PROJECT_BINARY_DIR}/include/einspline/${a} COPYONLY)
+#ENDFOREACH()

 set(SRCS )

 SET(SRCS ${SRCS}
-  ${EINSPLINE_HOME}/src/bspline_create.c        
-  ${EINSPLINE_HOME}/src/bspline_data.c          
-  ${EINSPLINE_HOME}/src/multi_bspline_create.c  
-  ${EINSPLINE_HOME}/src/multi_nubspline_create.c
-  ${EINSPLINE_HOME}/src/nubspline_create.c      
-  ${EINSPLINE_HOME}/src/nubasis.c               
-  ${EINSPLINE_HOME}/src/nugrid.c                
+  bspline_create.c        
+  bspline_data.c          
+  multi_bspline_create.c  
+  multi_nubspline_create.c
+  nubspline_create.c      
+  nubasis.c               
+  nugrid.c                
 )

 IF(HAVE_SSE)
  SET(SRCS ${SRCS} 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_s.c   
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_c.c   
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_s_cpp.cc       
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_c_cpp.cc       
+    multi_bspline_eval_sse_s.c   
+    multi_bspline_eval_sse_c.c   
+    multi_bspline_eval_sse_s_cpp.cc       
+    multi_bspline_eval_sse_c_cpp.cc       
    )
 else()
  SET(SRCS ${SRCS} 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_s.c 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_c.c 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_s_cpp.cc       
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_c_cpp.cc       
+    multi_bspline_eval_std_s.c 
+    multi_bspline_eval_std_c.c 
+    multi_bspline_eval_std_s_cpp.cc       
+    multi_bspline_eval_std_c_cpp.cc       
    )
 endif()


 IF(HAVE_SSE2)
  SET(SRCS ${SRCS} 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_d.c   
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_z.c   
-    ${EINSPLINE_HOME}/src/multi_nubspline_eval_sse_z.c 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_d_cpp.cc       
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_sse_z_cpp.cc       
-    ${EINSPLINE_HOME}/src/multi_nubspline_eval_sse_z_cpp.cc   
+    multi_bspline_eval_sse_d.c   
+    multi_bspline_eval_sse_z.c   
+    multi_nubspline_eval_sse_z.c 
+    multi_bspline_eval_sse_d_cpp.cc       
+    multi_bspline_eval_sse_z_cpp.cc       
+    multi_nubspline_eval_sse_z_cpp.cc   
    )
 else()
  SET(SRCS ${SRCS} 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_d.c 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_z.c 
-    ${EINSPLINE_HOME}/src/multi_nubspline_eval_std_z.c 
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_d_cpp.cc       
-    ${EINSPLINE_HOME}/src/multi_bspline_eval_std_z_cpp.cc       
-    ${EINSPLINE_HOME}/src/multi_nubspline_eval_std_z_cpp.cc   
+    multi_bspline_eval_std_d.c 
+    multi_bspline_eval_std_z.c 
+    multi_nubspline_eval_std_z.c 
+    multi_bspline_eval_std_d_cpp.cc       
+    multi_bspline_eval_std_z_cpp.cc       
+    multi_nubspline_eval_std_z_cpp.cc   
    )
 endif()


 if(HAVE_CUDA)
  SET(SRCS  ${SRCS}
-    ${EINSPLINE_HOME}/src/multi_bspline_create_cuda.cu  
-    ${EINSPLINE_HOME}/src/bspline_create_cuda.cu 
+    multi_bspline_create_cuda.cu  
+    bspline_create_cuda.cu 
    )
  CUDA_ADD_LIBRARY(einspline ${SRCS})
 else()
  ADD_LIBRARY(einspline ${SRCS})
 endif()

-ADD_EXECUTABLE(time_multi ${EINSPLINE_HOME}/src/time_multi_new.c)
+ADD_EXECUTABLE(time_multi time_multi_new.c)
 target_link_libraries(time_multi einspline)
 #add_dependencies(time_multi ${PROJECT_BINARY_HOME}/include/einspline/bspline.h)
--- a/src/einspline/TestBspline.c
+++ b/src/einspline/TestBspline.c
@ -0,0 +1,844 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "bspline.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+double drand48();
+
+// Writes a single-precision 1D uniform B-spline to "1dSpline.dat".
+// The spline is built from 11 hard-coded samples on the grid [1.0, 3.0] with
+// DERIV2 boundary codes (left value 10.0, right value -10.0).  Each output row
+// holds x, value, gradient and Laplacian, with x stepped by 0.001.
+void
+Test_1d_s()
+{
+  Ugrid grid;
+  grid.start = 1.0;
+  grid.end   = 3.0;
+  grid.num = 11;
+  float data[] = { 3.0, -4.0, 2.0, 1.0, -2.0, 0.0, 3.0, 2.0, 0.5, 1.0, 3.0 };
+  BCtype_s bc;
+  bc.lCode = DERIV2; bc.lVal = 10.0;
+  bc.rCode = DERIV2; bc.rVal = -10.0;
+
+  FILE *fout = fopen ("1dSpline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("1dSpline.dat");
+    return;
+  }
+  UBspline_1d_s *spline = (UBspline_1d_s*) create_UBspline_1d_s (grid, bc, data);
+  // The upper bound is slightly past 3.0 so the last step is not lost to
+  // floating-point accumulation in x.
+  for (double x=1.0; x<=3.00001; x+=0.001) {
+    float val, grad, lapl;
+    eval_UBspline_1d_s_vgl (spline, x, &val, &grad, &lapl);
+    fprintf (fout, "%1.5f %20.14f %20.14f %20.14f\n", x, val, grad, lapl);
+  }
+  fclose (fout);
+}
+
+// Writes a double-precision 1D uniform B-spline built from random samples to
+// "Spline_1d_d.dat".  The left boundary code is DERIV1 (value 10.0) and the
+// right one is DERIV2 (value -10.0).  Each output row holds x, value,
+// gradient and Laplacian, with x stepped by 0.001.
+void
+Test_1d_d()
+{
+  Ugrid grid;
+  grid.start = 1.0;
+  grid.end   = 3.0;
+  grid.num = 1000;
+  // NOTE(review): the buffer holds 10000 samples although grid.num is 1000.
+  // The fill count is left alone so the drand48() sequence consumed here
+  // (and therefore by later tests) does not change.
+  double data[10000];
+  for (int i=0; i<10000; i++)
+    data[i] = -2.0 + 4.0*drand48();
+  BCtype_d bc;
+  bc.lCode = DERIV1; bc.lVal = 10.0;
+  bc.rCode = DERIV2; bc.rVal = -10.0;
+
+  FILE *fout = fopen ("Spline_1d_d.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("Spline_1d_d.dat");
+    return;
+  }
+  UBspline_1d_d *spline =
+    (UBspline_1d_d*) create_UBspline_1d_d (grid, bc, data);
+  for (double x=1.0; x<=3.00001; x+=0.001) {
+    double val, grad, lapl;
+    eval_UBspline_1d_d_vgl (spline, x, &val, &grad, &lapl);
+    fprintf (fout, "%1.5f %20.14f %20.14f %20.14f\n", x, val, grad, lapl);
+  }
+  fclose (fout);
+}
+
+// Writes a double-precision 1D spline with ANTIPERIODIC boundary codes to
+// "Spline_1d_d_antiperiodic.dat".  x runs over [1.0, 5.0], i.e. two grid
+// lengths: positions at or beyond grid.end are folded back by one grid length
+// and the printed value, gradient and Laplacian are negated once per fold.
+void
+Test_1d_d_antiperiodic()
+{
+  Ugrid grid;
+  grid.start = 1.0;
+  grid.end   = 3.0;
+  grid.num = 10;
+  double data[10];
+  for (int i=0; i<10; i++)
+    data[i] = -2.0 + 4.0*drand48();
+  BCtype_d bc;
+  // Set every field so that no indeterminate value is passed to the library
+  // when bc is copied by value.
+  bc.lCode = ANTIPERIODIC; bc.lVal = 0.0;
+  bc.rCode = ANTIPERIODIC; bc.rVal = 0.0;
+
+  FILE *fout = fopen ("Spline_1d_d_antiperiodic.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("Spline_1d_d_antiperiodic.dat");
+    return;
+  }
+  UBspline_1d_d *spline =
+    (UBspline_1d_d*) create_UBspline_1d_d (grid, bc, data);
+  for (double x=1.0; x<=5.00001; x+=0.001) {
+    double val, grad, lapl;
+    double xp = x;
+    double sign = 1.0;
+    // Fold x back into the grid, flipping the sign once per grid length.
+    while (xp >= grid.end) {
+      xp -= (grid.end-grid.start);
+      sign *= -1.0;
+    }
+    eval_UBspline_1d_d_vgl (spline, xp, &val, &grad, &lapl);
+    fprintf (fout, "%1.5f %20.14f %20.14f %20.14f\n", x, sign*val, sign*grad, sign*lapl);
+  }
+  // One more evaluation a single grid step past the start; the results are
+  // not printed or checked.
+  double val, grad, lapl;
+  double x = grid.start + (grid.end-grid.start) * (double)1/(double)grid.num;
+  eval_UBspline_1d_d_vgl (spline, x, &val, &grad, &lapl);
+  fclose (fout);
+}
+
+
+// Times eval_UBspline_1d_s_vgl on a small single-precision 1D spline.
+// A first loop measures the cost of drawing random positions alone; that
+// time is subtracted from the second loop, which also evaluates the spline.
+// The net time is reported on stderr.
+void
+Speed_1d_s()
+{
+  float samples[] = { 3.0, -4.0, 2.0, 1.0, -2.0, 0.0, 3.0, 2.0, 0.5, 1.0, 3.0 };
+  Ugrid grid;
+  grid.num = 11;
+  grid.start = 1.0;
+  grid.end   = 3.0;
+  BCtype_s bc;
+  bc.lCode = DERIV2;
+  bc.rCode = DERIV2;
+  bc.lVal = 10.0;
+  bc.rVal = -10.0;
+  UBspline_1d_s *spline =
+    (UBspline_1d_s*) create_UBspline_1d_s (grid, bc, samples);
+
+  const int n_evals = 100000000;
+  float val, grad, lapl;
+
+  // Baseline: random-number generation only.
+  clock_t rng_t0 = clock();
+  for (int k=0; k<n_evals; k++) {
+    double x = grid.start + 0.99999*drand48()*(grid.end-grid.start);
+  }
+  clock_t rng_t1 = clock();
+
+  // Measured loop: random-number generation plus spline evaluation.
+  clock_t eval_t0 = clock();
+  for (int k=0; k<n_evals; k++) {
+    double x = grid.start + 0.99999*drand48()*(grid.end-grid.start);
+    eval_UBspline_1d_s_vgl (spline, x, &val, &grad, &lapl);
+  }
+  clock_t eval_t1 = clock();
+
+  clock_t net_ticks = eval_t1 - eval_t0 - (rng_t1 - rng_t0);
+  fprintf (stderr, "100,000,000 evalations in %f seconds.\n",
+           (double)net_ticks/(double)CLOCKS_PER_SEC);
+}
+
+
+// Builds a 30x30 single-precision 2D spline with PERIODIC boundary codes from
+// random samples, writes its values on a 0.005-spaced raster to
+// "2dspline.dat", and prints the input sample and the spline value at grid
+// point (5,7) to stderr.
+void
+Test_2d_s()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 30;
+
+  float *data = malloc (x_grid.num * y_grid.num * sizeof(float));
+  if (data == NULL) {
+    fprintf (stderr, "Test_2d_s: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      data[ix*y_grid.num + iy] = -1.0 + 2.0*drand48();
+  BCtype_s x_bc, y_bc;
+  x_bc.lCode = PERIODIC; x_bc.lVal = 10.0;
+  x_bc.rCode = PERIODIC; x_bc.rVal = -10.0;
+  y_bc.lCode = PERIODIC; y_bc.lVal = 10.0;
+  y_bc.rCode = PERIODIC; y_bc.rVal = -10.0;
+
+  UBspline_2d_s *spline = (UBspline_2d_s*) create_UBspline_2d_s (x_grid, y_grid, x_bc, y_bc, data);
+
+  FILE *fout = fopen ("2dspline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("2dspline.dat");
+    free (data);
+    return;
+  }
+  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
+    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
+      float val, grad[2], hess[4];
+      eval_UBspline_2d_s_vgh (spline, x, y, &val, grad, hess);
+      fprintf (fout, "%20.14f ", val);
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+
+  // Compare the spline against the input sample at one grid point.
+  int ix=5;
+  int iy=7;
+  float exval = data[ix*y_grid.num+iy];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta;
+  float spval, grad[2], hess[4];
+  eval_UBspline_2d_s_vgh (spline, x, y, &spval, grad, hess);
+  fprintf (stderr, "exval = %20.15f   spval = %20.15f\n", exval, spval);
+
+  free (data);
+}
+
+// Times eval_UBspline_2d_s_vgh on a 300x300 single-precision spline with
+// PERIODIC boundary codes.  The cost of drawing random positions alone is
+// measured first and subtracted from the timed evaluation loop; the net time
+// is reported on stderr.
+void
+Speed_2d_s()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 300;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 300;
+
+  float *data = malloc (x_grid.num * y_grid.num * sizeof(float));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_2d_s: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      data[ix*y_grid.num + iy] = -1.0 + 2.0*drand48();
+  BCtype_s x_bc, y_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+
+  UBspline_2d_s *spline = (UBspline_2d_s*) create_UBspline_2d_s (x_grid, y_grid, x_bc, y_bc, data);
+  float val, grad[2], hess[4];
+  clock_t start, end, rstart, rend;
+  // Baseline: random-number generation only.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+  }
+  rend = clock();
+  // Measured loop: random-number generation plus spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    eval_UBspline_2d_s_vgh (spline, x, y, &val, grad, hess);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+  free (data);
+}
+
+// Builds a 30x30 complex single-precision 2D spline with PERIODIC boundary
+// codes from random samples, writes the real and imaginary parts of its
+// values on a 0.005-spaced raster to "2dspline.dat", and prints the input
+// sample and the spline value at grid point (5,7) to stderr.
+void
+Test_2d_c()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 30;
+
+  complex_float *data = malloc (x_grid.num * y_grid.num * sizeof(complex_float));
+  if (data == NULL) {
+    fprintf (stderr, "Test_2d_c: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++) {
+      // Draw the two parts in separate statements so the real part always
+      // takes the first random number; inside a single expression the order
+      // of the two drand48() calls is unspecified.
+      double re = -1.0 + 2.0*drand48();
+      double im = -1.0 + 2.0*drand48();
+      data[ix*y_grid.num + iy] = re + 1.0fI*im;
+    }
+  BCtype_c x_bc, y_bc;
+  x_bc.lCode = PERIODIC;  x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC;  y_bc.rCode = PERIODIC;
+
+  UBspline_2d_c *spline = (UBspline_2d_c*) create_UBspline_2d_c (x_grid, y_grid, x_bc, y_bc, data);
+
+  FILE *fout = fopen ("2dspline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("2dspline.dat");
+    free (data);
+    return;
+  }
+  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
+    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
+      complex_float val, grad[2], hess[4];
+      eval_UBspline_2d_c_vgh (spline, x, y, &val, grad, hess);
+      fprintf (fout, "%20.14f %20.15f ", crealf(val), cimagf(val));
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+
+  // Compare the spline against the input sample at one grid point.
+  int ix=5;
+  int iy=7;
+  complex_float exval = data[ix*y_grid.num+iy];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta;
+  complex_float spval, grad[2], hess[4];
+  eval_UBspline_2d_c_vgh (spline, x, y, &spval, grad, hess);
+  // crealf/cimagf are used for both values, matching their float type.
+  fprintf (stderr, "exval = (%20.15f + %20.15fi)   spval = (%20.15f + %20.15fi)\n",
+           crealf(exval), cimagf(exval), crealf(spval), cimagf(spval));
+
+  free (data);
+}
+
+// Times eval_UBspline_2d_c_vgh on a 300x300 complex single-precision spline
+// with PERIODIC boundary codes.  The cost of drawing random positions alone
+// is measured first and subtracted from the timed evaluation loop; the net
+// time is reported on stderr.
+void
+Speed_2d_c()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 300;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 300;
+
+  complex_float *data = malloc (x_grid.num * y_grid.num * sizeof(complex_float));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_2d_c: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++) {
+      // Separate statements fix the order of the two drand48() calls, which
+      // is unspecified inside a single expression.
+      double re = -1.0 + 2.0*drand48();
+      double im = -1.0 + 2.0*drand48();
+      data[ix*y_grid.num + iy] = re + 1.0fI*im;
+    }
+  BCtype_c x_bc, y_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+
+  UBspline_2d_c *spline = (UBspline_2d_c*) create_UBspline_2d_c (x_grid, y_grid, x_bc, y_bc, data);
+  complex_float val, grad[2], hess[4];
+  clock_t start, end, rstart, rend;
+  // Baseline: random-number generation only.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+  }
+  rend = clock();
+  // Measured loop: random-number generation plus spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    eval_UBspline_2d_c_vgh (spline, x, y, &val, grad, hess);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+  free (data);
+}
+
+// Builds a 30x30 double-precision 2D spline with PERIODIC boundary codes from
+// random samples, writes its values on a 0.005-spaced raster to
+// "2dspline.dat", and prints the input sample and the spline value at grid
+// point (5,7) to stderr.
+void
+Test_2d_d()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 30;
+
+  double *data = malloc (x_grid.num * y_grid.num * sizeof(double));
+  if (data == NULL) {
+    fprintf (stderr, "Test_2d_d: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      data[ix*y_grid.num + iy] = -1.0 + 2.0*drand48();
+  BCtype_d x_bc, y_bc;
+  x_bc.lCode = PERIODIC;  x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC;  y_bc.rCode = PERIODIC;
+
+  UBspline_2d_d *spline =
+    create_UBspline_2d_d (x_grid, y_grid, x_bc, y_bc, data);
+
+  FILE *fout = fopen ("2dspline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("2dspline.dat");
+    free (data);
+    return;
+  }
+  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
+    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
+      double val, grad[2], hess[4];
+      eval_UBspline_2d_d_vgh (spline, x, y, &val, grad, hess);
+      fprintf (fout, "%20.14f ", val);
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+
+  // Compare the spline against the input sample at one grid point.
+  int ix=5;
+  int iy=7;
+  double exval = data[ix*y_grid.num+iy];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta;
+  double spval, grad[2], hess[4];
+  eval_UBspline_2d_d_vgh (spline, x, y, &spval, grad, hess);
+  fprintf (stderr, "exval = %20.15f   spval = %20.15f\n", exval, spval);
+
+  free (data);
+}
+
+// Times eval_UBspline_2d_d_vgh on a 300x300 double-precision spline with
+// PERIODIC boundary codes.  The cost of drawing random positions alone is
+// measured first and subtracted from the timed evaluation loop; the net time
+// is reported on stderr.
+void
+Speed_2d_d()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 300;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 300;
+
+  double *data = malloc (x_grid.num * y_grid.num * sizeof(double));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_2d_d: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      data[ix*y_grid.num + iy] = -1.0 + 2.0*drand48();
+  BCtype_d x_bc, y_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+
+  UBspline_2d_d *spline = (UBspline_2d_d*) create_UBspline_2d_d (x_grid, y_grid, x_bc, y_bc, data);
+  double val, grad[2], hess[4];
+  clock_t start, end, rstart, rend;
+  // Baseline: random-number generation only.
+  rstart = clock();
+  for (int i=0; i<100000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+  }
+  rend = clock();
+  // Measured loop: random-number generation plus spline evaluation.
+  start = clock();
+  for (int i=0; i<100000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    eval_UBspline_2d_d_vgh (spline, x, y, &val, grad, hess);
+  }
+  end = clock();
+  // The reported count matches the 100000000 iterations of the loops above.
+  fprintf (stderr, "100,000,000 evalations in %f seconds.\n",
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+  free (data);
+}
+
+
+// Builds a 30x30 complex double-precision 2D spline with PERIODIC boundary
+// codes from random samples, writes the real and imaginary parts of its
+// values on a 0.005-spaced raster to "2dspline.dat", and prints the input
+// sample and the spline value at grid point (5,7) to stderr.
+void
+Test_2d_z()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 30;
+
+  complex_double *data = malloc (x_grid.num * y_grid.num * sizeof(complex_double));
+  if (data == NULL) {
+    fprintf (stderr, "Test_2d_z: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++) {
+      // Separate statements fix the order of the two drand48() calls, which
+      // is unspecified inside a single expression.
+      double re = -1.0 + 2.0*drand48();
+      double im = -1.0 + 2.0*drand48();
+      data[ix*y_grid.num + iy] = re + 1.0I*im;
+    }
+  BCtype_z x_bc, y_bc;
+  x_bc.lCode = PERIODIC;  x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC;  y_bc.rCode = PERIODIC;
+
+  UBspline_2d_z *spline =
+    create_UBspline_2d_z (x_grid, y_grid, x_bc, y_bc, data);
+
+  FILE *fout = fopen ("2dspline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("2dspline.dat");
+    free (data);
+    return;
+  }
+  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
+    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
+      complex_double val, grad[2], hess[4];
+      eval_UBspline_2d_z_vgh (spline, x, y, &val, grad, hess);
+      fprintf (fout, "%20.14f %20.14f ", creal(val), cimag(val));
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+
+  // Compare the spline against the input sample at one grid point.
+  int ix=5;
+  int iy=7;
+  complex_double exval = data[ix*y_grid.num+iy];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta;
+  complex_double spval, grad[2], hess[4];
+  eval_UBspline_2d_z_vgh (spline, x, y, &spval, grad, hess);
+  fprintf (stderr, "exval = (%20.15f + %20.15fi)   spval = (%20.15f + %20.15fi)\n",
+           creal(exval), cimag(exval), creal(spval), cimag(spval));
+
+  free (data);
+}
+
+// Times eval_UBspline_2d_z_vgh on a 300x300 complex double-precision spline
+// with PERIODIC boundary codes.  The cost of drawing random positions alone
+// is measured first and subtracted from the timed evaluation loop; the net
+// time is reported on stderr.
+void
+Speed_2d_z()
+{
+  Ugrid x_grid, y_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 300;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 300;
+
+  complex_double *data = malloc (x_grid.num * y_grid.num * sizeof(complex_double));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_2d_z: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++) {
+      // Separate statements fix the order of the two drand48() calls, which
+      // is unspecified inside a single expression.
+      double re = -1.0 + 2.0*drand48();
+      double im = -1.0 + 2.0*drand48();
+      data[ix*y_grid.num + iy] = re + 1.0I*im;
+    }
+  BCtype_z x_bc, y_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+
+  UBspline_2d_z *spline = (UBspline_2d_z*) create_UBspline_2d_z (x_grid, y_grid, x_bc, y_bc, data);
+  complex_double val, grad[2], hess[4];
+  clock_t start, end, rstart, rend;
+  // Baseline: random-number generation only.
+  rstart = clock();
+  for (int i=0; i<100000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+  }
+  rend = clock();
+  // Measured loop: random-number generation plus spline evaluation.
+  start = clock();
+  for (int i=0; i<100000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    eval_UBspline_2d_z_vgh (spline, x, y, &val, grad, hess);
+  }
+  end = clock();
+  fprintf (stderr, "100,000,000 evalations in %f seconds.\n",
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+  free (data);
+}
+
+
+
+// Builds a 30x30x30 single-precision 3D spline with PERIODIC boundary codes
+// from random samples, writes its values on a 0.005-spaced x/y raster at
+// z = 1.92341 to "3dspline.dat", and prints the input sample and the spline
+// value near grid point (9,19,24) to stderr.
+void
+Test_3d_s()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0001;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.0001;  y_grid.num = 30;
+  z_grid.start = 1.0;  z_grid.end   = 3.0001;  z_grid.num = 30;
+
+  float *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(float));
+  if (data == NULL) {
+    fprintf (stderr, "Test_3d_s: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++)
+        data[((ix*y_grid.num) + iy)*z_grid.num + iz] = -1.0 + 2.0*drand48();
+  BCtype_s x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
+
+  UBspline_3d_s *spline = (UBspline_3d_s*) create_UBspline_3d_s
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
+
+  double z = 1.92341;
+  FILE *fout = fopen ("3dspline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("3dspline.dat");
+    free (data);
+    return;
+  }
+  for (double x=x_grid.start; x<x_grid.end; x+=0.005) {
+    for (double y=y_grid.start; y<y_grid.end; y+=0.005) {
+      float val, grad[3], hess[9];
+      eval_UBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
+      fprintf (fout, "%20.14f ", val);
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+
+  // Compare the spline against the input sample; each coordinate is offset
+  // by 0.000001 from the grid point.
+  int ix=9;  int iy=19; int iz = 24;
+  float exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta + 0.000001;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta + 0.000001;
+  z =        z_grid.start + (double)iz * spline->z_grid.delta + 0.000001;
+  float spval, grad[3], hess[9];
+  eval_UBspline_3d_s_vgh (spline, x, y, z, &spval, grad, hess);
+  fprintf (stderr, "exval = %20.15f   spval = %20.15f\n", exval, spval);
+
+  free (data);
+}
+
+
+// Times eval_UBspline_3d_s_vgh on a 200x200x200 single-precision spline with
+// PERIODIC boundary codes.  The cost of drawing random positions alone is
+// measured first and subtracted from the timed evaluation loop; the net time
+// is reported on stderr.
+void
+Speed_3d_s()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 200;
+  y_grid.start = 1.0;  y_grid.end   = 5.0;  y_grid.num = 200;
+  z_grid.start = 1.0;  z_grid.end   = 7.0;  z_grid.num = 200;
+
+  // 200^3 floats is a 32 MB request, so the allocation is checked.
+  float *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(float));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_3d_s: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++)
+        data[((ix*y_grid.num) + iy)*z_grid.num + iz] = -1.0 + 2.0*drand48();
+  BCtype_s x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
+
+  UBspline_3d_s *spline = (UBspline_3d_s*) create_UBspline_3d_s
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
+
+  float val, grad[3], hess[9];
+  clock_t start, end, rstart, rend;
+  // Baseline: random-number generation only.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+  }
+  rend = clock();
+  // Measured loop: random-number generation plus spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+    eval_UBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+  free (data);
+}
+
+
+// Builds a 30x30x30 double-precision 3D spline with PERIODIC boundary codes
+// from random samples, writes its values on a 0.005-spaced x/y raster at
+// z = 1.92341 to "3dspline.dat", and prints the input sample and the spline
+// value at grid point (9,19,24) to stderr.
+void
+Test_3d_d()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 30;
+  z_grid.start = 1.0;  z_grid.end   = 3.0;  z_grid.num = 30;
+
+  double *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(double));
+  if (data == NULL) {
+    fprintf (stderr, "Test_3d_d: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++)
+        data[((ix*y_grid.num) + iy)*z_grid.num + iz] = -1.0 + 2.0*drand48();
+  BCtype_d x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
+
+  UBspline_3d_d *spline = (UBspline_3d_d*) create_UBspline_3d_d
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
+
+  double z = 1.92341;
+  FILE *fout = fopen ("3dspline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("3dspline.dat");
+    free (data);
+    return;
+  }
+  for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
+    for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
+      double val, grad[3], hess[9];
+      eval_UBspline_3d_d_vgh (spline, x, y, z, &val, grad, hess);
+      fprintf (fout, "%23.17f ", val);
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+
+  // Compare the spline against the input sample at one grid point.
+  int ix=9;  int iy=19; int iz = 24;
+  double exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta;
+  z =        z_grid.start + (double)iz * spline->z_grid.delta;
+  double spval, grad[3], hess[9];
+  eval_UBspline_3d_d_vgh (spline, x, y, z, &spval, grad, hess);
+  fprintf (stderr, "exval = %23.17f   spval = %23.17f\n", exval, spval);
+
+  free (data);
+}
+
+
+// Times eval_UBspline_3d_d_vgh on a 200x200x200 double-precision spline with
+// PERIODIC boundary codes.  The cost of drawing random positions alone is
+// measured first and subtracted from the timed evaluation loop; the net time
+// is reported on stderr.
+void
+Speed_3d_d()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 200;
+  y_grid.start = 1.0;  y_grid.end   = 5.0;  y_grid.num = 200;
+  z_grid.start = 1.0;  z_grid.end   = 7.0;  z_grid.num = 200;
+
+  // 200^3 doubles is a 64 MB request, so the allocation is checked.
+  double *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(double));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_3d_d: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++)
+        data[((ix*y_grid.num) + iy)*z_grid.num + iz] = -1.0 + 2.0*drand48();
+  BCtype_d x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
+
+  UBspline_3d_d *spline = (UBspline_3d_d*) create_UBspline_3d_d
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
+
+  double val, grad[3], hess[9];
+  clock_t start, end, rstart, rend;
+  // Baseline: random-number generation only.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+  }
+  rend = clock();
+  // Measured loop: random-number generation plus spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+    eval_UBspline_3d_d_vgh (spline, x, y, z, &val, grad, hess);
+    // Value-only alternative: eval_UBspline_3d_d (spline, x, y, z, &val);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n",
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+  free (data);
+}
+
+
+// Builds a 30x30x30 complex single-precision 3D spline with PERIODIC boundary
+// codes from random samples, writes the real and imaginary parts of its
+// values on a 0.005-spaced x/y raster at z = 1.92341 to "3dspline.dat", and
+// prints the input sample and the spline value at grid point (9,18,24) to
+// stderr.
+void
+Test_3d_c()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.0;  y_grid.num = 30;
+  z_grid.start = 1.0;  z_grid.end   = 3.0;  z_grid.num = 30;
+
+  complex_float *data =
+    malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_float));
+  if (data == NULL) {
+    fprintf (stderr, "Test_3d_c: out of memory\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++) {
+        // Separate statements fix the order of the two drand48() calls,
+        // which is unspecified inside a single expression.
+        double re = -1.0 + 2.0*drand48();
+        double im = -1.0 + 2.0*drand48();
+        data[((ix*y_grid.num) + iy)*z_grid.num + iz] = re + im*1.0fI;
+      }
+  BCtype_c x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC;
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC;
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC;
+
+  UBspline_3d_c *spline = create_UBspline_3d_c
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data);
+
+  double z = 1.92341;
+  FILE *fout = fopen ("3dspline.dat", "w");
+  // fopen returns NULL on failure; writing through it would crash.
+  if (fout == NULL) {
+    perror ("3dspline.dat");
+    free (data);
+    return;
+  }
+  for (double x=x_grid.start; x<0.99999*x_grid.end; x+=0.005) {
+    for (double y=y_grid.start; y<y_grid.end; y+=0.005) {
+      complex_float val, grad[3], hess[9];
+      eval_UBspline_3d_c_vgh (spline, x, y, z, &val, grad, hess);
+      fprintf (fout, "%23.17f %23.17f ", crealf(val), cimagf(val));
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+
+  // Compare the spline against the input sample at one grid point.
+  int ix=9;  int iy=18; int iz = 24;
+  complex_float exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta;
+  z =        z_grid.start + (double)iz * spline->z_grid.delta;
+  complex_float spval, grad[3], hess[9];
+  eval_UBspline_3d_c_vgh (spline, x, y, z, &spval, grad, hess);
+  fprintf (stderr, "exval = (%23.17f + %23.17fi)\nspval = (%23.17f + %23.17fi)\n",
+           crealf(exval), cimagf(exval), crealf(spval), cimagf(spval));
+
+  free (data);
+}
+
+
+// Benchmarks eval_UBspline_3d_c_vgh on a 200x200x200 periodic spline built
+// from random single-precision complex data.
+//
+// The cost of generating the random evaluation points is timed separately
+// and subtracted, so the figure printed to stderr covers the 10,000,000
+// spline evaluations only.
+void
+Speed_3d_c()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 200;
+  y_grid.start = 1.0;  y_grid.end   = 5.0;  y_grid.num = 200;
+  z_grid.start = 1.0;  z_grid.end   = 7.0;  z_grid.num = 200;
+  
+  // 200^3 complex floats is a large request; make sure it succeeded.
+  complex_float *data = malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_float));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_3d_c:  failed to allocate the input data.\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++)
+        data[(ix*y_grid.num + iy)*z_grid.num + iz] = 
+          (-1.0 + 2.0*drand48()) + (-1.0 + 2.0*drand48())*1.0fI;
+  BCtype_c x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC; 
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC; 
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC; 
+  
+  UBspline_3d_c *spline = (UBspline_3d_c*) create_UBspline_3d_c 
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data); 
+
+  complex_float val, grad[3], hess[9];
+  clock_t start, end, rstart, rend;
+
+  // Baseline: the random-point generation on its own.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+  }
+  rend = clock();
+
+  // Same point generation plus the spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+    eval_UBspline_3d_c_vgh (spline, x, y, z, &val, grad, hess);
+    //eval_UBspline_3d_c     (spline, x, y, z, &val);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n", 
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+
+  free (data);
+}
+
+
+// Exercises the double-precision complex 3D uniform B-spline.
+//
+// Fills a 30x30x30 grid with random complex samples, builds a spline with
+// periodic boundary codes in all three directions, writes the real and
+// imaginary parts of hess[4] on the z = 1.92341 plane to "3dspline.dat",
+// and prints the input sample (exval) and the spline value (spval) at grid
+// index (9,19,25) to stderr.
+//
+// The values are complex_double, so the double-precision accessors
+// creal/cimag are used; the float variants would round them to single
+// precision before they are printed with 17-19 digits.
+void
+Test_3d_z()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.4;  x_grid.num = 30;
+  y_grid.start = 1.0;  y_grid.end   = 3.7;  y_grid.num = 30;
+  z_grid.start = 1.0;  z_grid.end   = 3.9;  z_grid.num = 30;
+  
+  complex_double *data = 
+    malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_double));
+  if (data == NULL) {
+    fprintf (stderr, "Test_3d_z:  failed to allocate the input data.\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++)
+        data[(ix*y_grid.num + iy)*z_grid.num + iz] = 
+          (-1.0 + 2.0*drand48()) + (-1.0 + 2.0*drand48())*1.0fI;
+  BCtype_z x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC; 
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC; 
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC; 
+  
+  UBspline_3d_z *spline = create_UBspline_3d_z 
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data); 
+
+  // Dump one constant-z slice; skip the dump (but keep testing) if the
+  // output file cannot be created.
+  double z = 1.92341;
+  FILE *fout = fopen ("3dspline.dat", "w");
+  if (fout != NULL) {
+    for (double x=x_grid.start; x<=x_grid.end; x+=0.005) {
+      for (double y=y_grid.start; y<=y_grid.end; y+=0.005) {
+        complex_double val, grad[3], hess[9];
+        eval_UBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
+        fprintf (fout, "%23.19f %23.19f ", creal(hess[4]), cimag(hess[4]));
+      }
+      fprintf (fout, "\n");
+    }
+    fclose (fout);
+  }
+  else
+    fprintf (stderr, "Test_3d_z:  could not open 3dspline.dat for writing.\n");
+
+  // Compare the spline against the raw sample at a single grid point.
+  int ix=9;  int iy=19; int iz = 25;
+  complex_double exval = data[(ix*y_grid.num+iy)*z_grid.num+iz];
+  double x = x_grid.start + (double)ix * spline->x_grid.delta;
+  double y = y_grid.start + (double)iy * spline->y_grid.delta;
+  z =        z_grid.start + (double)iz * spline->z_grid.delta;
+  complex_double spval, grad[3], hess[9];
+  eval_UBspline_3d_z_vgh (spline, x, y, z, &spval, grad, hess);
+  fprintf (stderr, "exval = (%23.19f + %23.19fi)\nspval = (%23.17f + %23.17fi)\n", 
+           creal(exval), cimag(exval), creal(spval), cimag(spval));
+
+  // Nothing below reads the input samples any more.
+  free (data);
+}
+
+
+// Benchmarks eval_UBspline_3d_z_vgh on a 200x200x200 periodic spline built
+// from random double-precision complex data.
+//
+// The cost of generating the random evaluation points is timed separately
+// and subtracted, so the figure printed to stderr covers the 10,000,000
+// spline evaluations only.
+void
+Speed_3d_z()
+{
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 1.0;  x_grid.end   = 3.0;  x_grid.num = 200;
+  y_grid.start = 1.0;  y_grid.end   = 5.0;  y_grid.num = 200;
+  z_grid.start = 1.0;  z_grid.end   = 7.0;  z_grid.num = 200;
+  
+  // 200^3 complex doubles is a large request; make sure it succeeded.
+  complex_double *data = 
+    malloc (x_grid.num * y_grid.num * z_grid.num * sizeof(complex_double));
+  if (data == NULL) {
+    fprintf (stderr, "Speed_3d_z:  failed to allocate the input data.\n");
+    return;
+  }
+  for (int ix=0; ix<x_grid.num; ix++)
+    for (int iy=0; iy<y_grid.num; iy++)
+      for (int iz=0; iz<z_grid.num; iz++)
+        data[(ix*y_grid.num + iy)*z_grid.num + iz] = 
+          (-1.0 + 2.0*drand48()) + (-1.0 + 2.0*drand48())*1.0fI;
+  BCtype_z x_bc, y_bc, z_bc;
+  x_bc.lCode = PERIODIC; x_bc.rCode = PERIODIC; 
+  y_bc.lCode = PERIODIC; y_bc.rCode = PERIODIC; 
+  z_bc.lCode = PERIODIC; z_bc.rCode = PERIODIC; 
+  
+  UBspline_3d_z *spline = (UBspline_3d_z*) create_UBspline_3d_z 
+    (x_grid, y_grid, z_grid, x_bc, y_bc, z_bc, data); 
+
+  complex_double val, grad[3], hess[9];
+  clock_t start, end, rstart, rend;
+
+  // Baseline: the random-point generation on its own.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+  }
+  rend = clock();
+
+  // Same point generation plus the spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid.start+ 0.9999*drand48()*(x_grid.end - x_grid.start);
+    double y = y_grid.start+ 0.9999*drand48()*(y_grid.end - y_grid.start);
+    double z = z_grid.start+ 0.9999*drand48()*(z_grid.end - z_grid.start);
+    eval_UBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n", 
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+
+  free (data);
+}
+
+// When the macro F77_DUMMY_MAIN is defined, provide a function of that
+// name that simply returns 1 (given C linkage when compiled as C++).
+// NOTE(review): presumably the macro comes from the build configuration
+// for toolchains whose Fortran runtime expects such a symbol -- confirm.
+#ifdef F77_DUMMY_MAIN
+#  ifdef __cplusplus
+     extern "C"
+#  endif
+   int F77_DUMMY_MAIN() { return 1; }
+#endif
+
+// Test driver for the uniform B-spline routines.  Runs every Test_* check
+// in order of dimension and data type; of the Speed_* benchmarks only
+// Speed_3d_z is enabled, the others are kept as commented-out toggles.
+int main()
+{
+  // --- 1D ---
+  Test_1d_s();
+  Test_1d_d();
+  Test_1d_d_antiperiodic();
+  // Speed_1d_s();
+
+  // --- 2D ---
+  Test_2d_s();
+  // Speed_2d_s();
+  Test_2d_c();
+  // Speed_2d_c();
+  Test_2d_d();
+  // Speed_2d_d();
+  Test_2d_z();
+  // Speed_2d_z();
+
+  // --- 3D ---
+  Test_3d_s();
+  // Speed_3d_s();
+  Test_3d_d();
+  // Speed_3d_d();
+  Test_3d_c();
+  // Speed_3d_c();
+  Test_3d_z();
+  Speed_3d_z();
+
+  return 0;
+}
--- a/src/einspline/TestNUBspline.c
+++ b/src/einspline/TestNUBspline.c
@ -0,0 +1,686 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "nubspline.h"
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <string.h>
+
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433
+#endif
+
+double drand48();
+
+// Writes a coloured verdict to stderr using ANSI escape sequences
+// (0x1B is the ESC character): green "Passed" when pass is true, red
+// "Failed" otherwise, followed by a newline.
+void
+PrintPassFail(bool pass)
+{
+  const char *verdict_fmt = pass
+    ? "%c[32mPassed%c[0m\n"    // green
+    : "%c[31mFailed%c[0m\n";   // red
+  fprintf (stderr, verdict_fmt, 0x1B, 0x1B);
+}
+
+// Prints "<name>:" to stderr, pads the label with spaces out to 57
+// characters (no padding when the name is already that long), and then
+// prints the coloured pass/fail verdict via PrintPassFail.
+void PrintTest (char *name, bool pass)
+{
+  int label_len = strlen (name);
+  fprintf (stderr, "%s:", name);
+  for (int pad = 57 - label_len; pad > 0; pad--)
+    fputc (' ', stderr);
+  PrintPassFail (pass);
+}
+
+
+// Checks the reverse map of a 200-point center grid on [-5,7]: for 10000
+// random x values, the interval index returned by reverse_map must satisfy
+// points[lo] <= x <= points[lo+1].
+//
+// Failures are accumulated in `passed` (as TestGeneralGrid does) rather
+// than checked with assert(), so a bad lookup is reported as "Failed"
+// instead of aborting, and the check is still performed when NDEBUG is
+// defined.
+//
+// Returns true when every lookup bracketed its x value.
+bool
+TestCenterGrid()
+{
+  fprintf (stderr, "Testing CenterGrid:   ");
+  bool passed = true;
+  NUgrid* grid = create_center_grid (-5.0, 7.0, 6.0, 200);
+
+  for (int i=0; i<10000; i++) {
+    double x = -5.0+12.0*drand48();
+    int lo = (*grid->reverse_map)(grid, x);
+    passed = passed && (x >= grid->points[lo]);
+    passed = passed && (x <= grid->points[lo+1]);
+  }
+  PrintPassFail (passed);
+  return passed;
+}
+
+
+// Checks the reverse map of a general grid built from the points of a
+// 200-point center grid on [-5,7]: for 10000 random x values, the interval
+// index returned by reverse_map must satisfy
+// points[lo] <= x <= points[lo+1].  Prints the verdict and returns it.
+bool
+TestGeneralGrid()
+{
+  fprintf (stderr, "Testing GeneralGrid:  ");
+  bool passed = true;
+  NUgrid* centgrid = create_center_grid (-5.0, 7.0, 6.0, 200);
+  NUgrid* grid = create_general_grid (centgrid->points, 200);
+  for (int trial=0; trial<10000; trial++) {
+    double x = -5.0+12.0*drand48();
+    int lo = (*grid->reverse_map)(grid, x);
+    // Once a lookup has failed the grid points are no longer inspected.
+    if (passed && !((x >= grid->points[lo]) && (x <= grid->points[lo+1])))
+      passed = false;
+  }
+  PrintPassFail (passed);
+  return passed;
+}
+
+// Returns true when x and y agree to a relative tolerance of 1e-5.
+//
+// The difference is scaled by the larger magnitude of the two values.
+// Scaling by the plain maximum would give a negative ratio whenever both
+// values are negative, making the comparison succeed unconditionally.
+// Two exact zeros compare as close.
+bool
+close_float (float x, float y)
+{
+  float scale = fmaxf (fabsf(x), fabsf(y));
+  if (scale == 0.0f)
+    return true;   // both values are zero; avoid 0/0
+  return (fabs(x-y)/scale < 1.0e-5);
+}
+
+// Tests single-precision 1D non-uniform B-splines on a 200-point center
+// grid over [-5,7] filled with random data, for three kinds of boundary
+// conditions:
+//   * periodic:  value, first and second derivative must match at the two
+//     ends of the domain;
+//   * fixed first derivative (1.5 on the left, -0.3 on the right);
+//   * fixed second derivative (1.5 on the left, -0.3 on the right).
+// For each spline the value at grid point 26 must also reproduce data[26].
+// Each interpolation check evaluates the spline that was just created.
+//
+// Returns true when every individual check passed.
+bool
+TestNUB_1d_s()
+{
+  double start = -5.0;
+  double end = 7.0;
+  int N  = 200;
+  NUgrid* grid = create_center_grid (start, end, 6.0, N);
+  bool passed = true;
+  float data[N];
+  for (int i=0; i<N; i++) 
+    data[i] = -1.0 + 2.0*drand48();
+  BCtype_s bc;
+
+  // Create spline with PBC
+  fprintf (stderr, "Testing 1D single-precision periodic boundary conditions:\n");
+  bc.lCode = PERIODIC; bc.rCode = PERIODIC;
+  NUBspline_1d_s *periodic = create_NUBspline_1d_s (grid, bc, data);
+  float sval, sgrad, slapl, eval, egrad, elapl;
+  eval_NUBspline_1d_s_vgl (periodic, start, &sval, &sgrad, &slapl);
+  eval_NUBspline_1d_s_vgl (periodic, end  , &eval, &egrad, &elapl);
+  bool v_passed, grad_passed, lapl_passed;
+  v_passed    = close_float (sval, eval);
+  grad_passed = close_float (sgrad, egrad);
+  lapl_passed = close_float (slapl, elapl);
+  PrintTest ("Value", v_passed);
+  PrintTest ("First derivative", grad_passed);
+  PrintTest ("Second derivative", lapl_passed);
+  passed = passed && v_passed && grad_passed && lapl_passed;
+
+  double x = grid->points[26];
+  float val;
+  eval_NUBspline_1d_s (periodic, x, &val);
+  bool interp_passed = close_float (val, data[26]);
+  PrintTest ("Interpolation", interp_passed);
+  passed = passed && interp_passed;
+
+  // Create spline with fixed first derivative:
+  bc.lCode = DERIV1; bc.lVal = 1.5;
+  bc.rCode = DERIV1; bc.rVal = -0.3;
+  NUBspline_1d_s *fixed_first = create_NUBspline_1d_s (grid, bc, data);
+  fprintf (stderr, "Testing 1D single-precsion fixed first derivative boundary conditions:  \n");
+  eval_NUBspline_1d_s_vg (fixed_first, start, &sval, &sgrad);
+  eval_NUBspline_1d_s_vg (fixed_first,   end, &eval, &egrad);
+  bool bc_passed = close_float (sgrad, 1.5) && close_float (egrad, -0.3);
+  PrintTest ("Boundary conditions", bc_passed);
+  x = grid->points[26];
+  // Evaluate the spline under test, not the periodic one built earlier.
+  eval_NUBspline_1d_s (fixed_first, x, &val);
+  interp_passed = close_float (val, data[26]);
+  PrintTest ("Interpolation", interp_passed);
+  passed = passed && interp_passed && bc_passed;
+
+  // Create spline with fixed second derivative:
+  bc.lCode = DERIV2; bc.lVal = 1.5;
+  bc.rCode = DERIV2; bc.rVal = -0.3;
+  NUBspline_1d_s *fixed_second = create_NUBspline_1d_s (grid, bc, data);
+  fprintf (stderr, "Testing 1d_s fixed second derivative boundary conditions:  \n");
+  eval_NUBspline_1d_s_vgl (fixed_second, start, &sval, &sgrad, &slapl);
+  eval_NUBspline_1d_s_vgl (fixed_second,   end, &eval, &egrad, &elapl);
+  bc_passed = close_float (slapl, 1.5) && close_float (elapl, -0.3);
+  fprintf (stderr, "slapl = %1.8f  elapl = %1.8f\n", slapl, elapl);
+  PrintTest ("Boundary conditions", bc_passed);
+  x = grid->points[26];
+  // Evaluate the spline under test, not the periodic one built earlier.
+  eval_NUBspline_1d_s (fixed_second, x, &val);
+  interp_passed = close_float (val, data[26]);
+  PrintTest ("Interpolation", interp_passed);
+  passed = passed && interp_passed && bc_passed;
+
+  return passed;
+}
+
+// Compares the speed of the reverse map (x -> interval index) of a
+// 2000-point center grid against that of a general grid holding the same
+// points, using 100,000,000 random lookups each.
+//
+// The time spent generating the random numbers is measured on its own and
+// subtracted from both results.  The returned indices are summed and
+// printed so the lookups have an observable result; the sums are kept in
+// long long because 1e8 indices from a 2000-point grid can add up to more
+// than an int can hold.
+void
+GridSpeedTest()
+{
+  NUgrid* centgrid = create_center_grid (-5.0, 7.0, 6.0, 2000);
+  NUgrid* gengrid = create_general_grid (centgrid->points, 2000);
+  long long centsum=0, gensum=0;
+  
+  clock_t rstart, rend, cstart, cend, gstart, gend;
+  
+  // Baseline: random number generation only.
+  rstart = clock();
+  for (int i=0; i<100000000; i++) {
+    double x = -5.0 + 12.0*drand48();
+  }
+  rend = clock();
+
+  // Center grid lookups.
+  cstart = clock();
+  for (int i=0; i<100000000; i++) {
+    double x = -5.0 + 12.0*drand48();
+    centsum += (*centgrid->reverse_map)(centgrid, x);
+  }
+  cend = clock();
+
+  // General grid lookups.
+  gstart = clock();
+  for (int i=0; i<100000000; i++) {
+    double x = -5.0 + 12.0*drand48();
+    gensum += (*gengrid->reverse_map)(gengrid, x);
+  }
+  gend = clock();
+  
+  // (rstart-rend) is the negated baseline, i.e. it is subtracted here.
+  double cent_time = (double)(cend-cstart+rstart-rend)/(double)CLOCKS_PER_SEC;
+  double gen_time  = (double)(gend-gstart+rstart-rend)/(double)CLOCKS_PER_SEC;
+  fprintf (stderr, "%lld %lld\n", centsum, gensum);
+  fprintf (stderr, "center_grid  time = %1.3f s.\n", cent_time);
+  fprintf (stderr, "general_grid time = %1.3f s.\n", gen_time);
+}
+
+// Prints the four basis-function values returned by get_NUBasis_funcs_d
+// for a 20-point center grid on [-5,7], sampled in steps of 0.001.
+// Each stderr line holds x followed by the four values.
+void
+TestNUBasis()
+{
+  NUgrid* centgrid = create_center_grid (-5.0, 7.0, 10.0, 20);
+  NUBasis* basis = create_NUBasis (centgrid, true);
+
+  double bfuncs[4];
+  double x = -5.0;
+  while (x <= 7.0) {
+    get_NUBasis_funcs_d (basis, x, bfuncs);
+    fprintf (stderr, "%1.12f %1.12f %1.12f %1.12f %1.12f\n",
+             x, bfuncs[0], bfuncs[1], bfuncs[2], bfuncs[3]);
+    x += 0.001;
+  }
+}
+
+// Fits a single-precision 1D non-uniform spline to one period of a sine
+// sampled on a 20-point center grid over [-5,7], with the first derivative
+// fixed to 2*pi/12 at both ends, and prints x, the spline value, the exact
+// sine and the spline derivative for x in steps of 0.001.
+void
+TestNUBspline()
+{
+  NUgrid* centgrid = create_center_grid (-5.0, 7.0, 10.0, 20);
+  // Created but not otherwise used in this function.
+  NUBasis* basis = create_NUBasis (centgrid, true);
+
+  float data[20];
+  for (int k=0; k<20; k++) {
+    double xk = centgrid->points[k];
+    double phase = (xk+5.0)/12.0 * 2.0*M_PI;
+    data[k] = sin(phase);
+  }
+
+  //  Alternatives tried:  PERIODIC/PERIODIC,  NATURAL/FLAT
+  const double end_slope = 2.0*M_PI/12.0;
+  BCtype_s bc;
+  bc.lCode = DERIV1; bc.lVal = end_slope;
+  bc.rCode = DERIV1; bc.rVal = end_slope;
+  NUBspline_1d_s *spline = create_NUBspline_1d_s (centgrid, bc, data);
+
+  double x = -5.0;
+  while (x <= 7.0) {
+    float val, deriv;
+    eval_NUBspline_1d_s_vg (spline, x, &val, &deriv);
+    double phase = (x+5.0)/12.0 * 2.0*M_PI;
+    double exact = sin(phase);
+    fprintf (stderr, "%1.16e %1.16e %1.16e %1.16e\n", x, val, 
+             exact, deriv);
+    x += 0.001;
+  }
+}
+
+
+// Fits a double-precision 1D non-uniform spline to one period of a sine
+// sampled on a 20-point center grid over [-5,7], with the first derivative
+// fixed to 2*pi/12 at both ends, and prints x, the spline value, the exact
+// sine and the spline derivative for x in steps of 0.001.
+void
+TestNUBspline_d()
+{
+  NUgrid* centgrid = create_center_grid (-5.0, 7.0, 10.0, 20);
+  // Created but not otherwise used in this function.
+  NUBasis* basis = create_NUBasis (centgrid, true);
+
+  double data[20];
+  for (int k=0; k<20; k++) {
+    double xk = centgrid->points[k];
+    double phase = (xk+5.0)/12.0 * 2.0*M_PI;
+    data[k] = sin(phase);
+  }
+
+  //  Alternatives tried:  PERIODIC/PERIODIC,  NATURAL/FLAT
+  const double end_slope = 2.0*M_PI/12.0;
+  BCtype_d bc;
+  bc.lCode = DERIV1; bc.lVal = end_slope;
+  bc.rCode = DERIV1; bc.rVal = end_slope;
+  NUBspline_1d_d *spline = create_NUBspline_1d_d (centgrid, bc, data);
+
+  double x = -5.0;
+  while (x <= 7.0) {
+    double val, deriv;
+    eval_NUBspline_1d_d_vg (spline, x, &val, &deriv);
+    double phase = (x+5.0)/12.0 * 2.0*M_PI;
+    double exact = sin(phase);
+    fprintf (stderr, "%1.16e %1.16e %1.16e %1.16e\n", x, val, 
+             exact, deriv);
+    x += 0.001;
+  }
+}
+
+
+// Builds a periodic single-precision 2D non-uniform spline from random
+// data on a 30x35 center grid and writes its value on a 400x400 sampling
+// of the domain to "2d_s.dat" (one line per x sample).
+//
+// Both the left and right boundary codes are set, so no field of the
+// BCtype_s structs passed by value to the constructor is left
+// uninitialised (the 3D tests in this file already set both).
+void
+TestNUB_2d_s()
+{
+  int Mx=30, My=35;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
+  float data[Mx*My];
+  for (int ix=0; ix<Mx; ix++)
+    for (int iy=0; iy<My; iy++)
+      data[ix*My+iy] = -1.0+2.0*drand48();
+  
+  BCtype_s xBC, yBC;
+  xBC.lCode = PERIODIC;  xBC.rCode = PERIODIC;
+  yBC.lCode = PERIODIC;  yBC.rCode = PERIODIC;
+//   xBC.lCode = FLAT;  xBC.rCode = FLAT;
+//   yBC.lCode = FLAT;  yBC.rCode = FLAT;
+
+  NUBspline_2d_s *spline = create_NUBspline_2d_s (x_grid, y_grid, xBC, yBC, data);
+  
+  int xFine = 400;
+  int yFine = 400;
+  FILE *fout = fopen ("2d_s.dat", "w");
+  if (fout == NULL) {
+    fprintf (stderr, "TestNUB_2d_s:  could not open 2d_s.dat for writing.\n");
+    return;
+  }
+  double xi = x_grid->start;
+  double xf = x_grid->end;// + x_grid->points[1] - x_grid->points[0];
+  double yi = y_grid->start;
+  double yf = y_grid->end;// + y_grid->points[1] - y_grid->points[0];
+  for (int ix=0; ix<xFine; ix++) {
+    double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
+    for (int iy=0; iy<yFine; iy++) {
+      double y = yi + (double)iy/(double)(yFine)*(yf-yi);
+      float val;
+      eval_NUBspline_2d_s (spline, x, y, &val);
+      fprintf (fout, "%1.16e ", val);
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+}
+
+
+// Builds a periodic single-precision complex 2D non-uniform spline from
+// random data on a 30x35 center grid and writes the real and imaginary
+// parts of its value on a 400x400 sampling of the domain to "2d_r.dat"
+// and "2d_i.dat" respectively (one line per x sample).
+//
+// Both the left and right boundary codes are set, so no field of the
+// BCtype_c structs passed by value to the constructor is left
+// uninitialised (the 3D tests in this file already set both).
+void
+TestNUB_2d_c()
+{
+  int Mx=30, My=35;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
+  complex_float data[Mx*My];
+  for (int ix=0; ix<Mx; ix++)
+    for (int iy=0; iy<My; iy++)
+      data[ix*My+iy] = -1.0+2.0*drand48() + 1.0fi*(-1.0+2.0*drand48());
+  
+  BCtype_c xBC, yBC;
+  xBC.lCode = PERIODIC;  xBC.rCode = PERIODIC;
+  yBC.lCode = PERIODIC;  yBC.rCode = PERIODIC;
+//   xBC.lCode = FLAT;  xBC.rCode = FLAT;
+//   yBC.lCode = FLAT;  yBC.rCode = FLAT;
+
+  NUBspline_2d_c *spline = create_NUBspline_2d_c (x_grid, y_grid, xBC, yBC, data);
+  
+  int xFine = 400;
+  int yFine = 400;
+  FILE *rout = fopen ("2d_r.dat", "w");
+  FILE *iout = fopen ("2d_i.dat", "w");
+  if (rout == NULL || iout == NULL) {
+    fprintf (stderr, "TestNUB_2d_c:  could not open the output files for writing.\n");
+    if (rout != NULL)  fclose (rout);
+    if (iout != NULL)  fclose (iout);
+    return;
+  }
+  double xi = x_grid->start;
+  double xf = x_grid->end;// + x_grid->points[1] - x_grid->points[0];
+  double yi = y_grid->start;
+  double yf = y_grid->end;// + y_grid->points[1] - y_grid->points[0];
+  for (int ix=0; ix<xFine; ix++) {
+    double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
+    for (int iy=0; iy<yFine; iy++) {
+      double y = yi + (double)iy/(double)(yFine)*(yf-yi);
+      complex_float val, grad[2], hess[4];
+      eval_NUBspline_2d_c_vgh (spline, x, y, &val, grad, hess);
+      fprintf (rout, "%1.16e ", crealf(val));
+      fprintf (iout, "%1.16e ", cimagf(val));
+    }
+    fprintf (rout, "\n");
+    fprintf (iout, "\n");
+  }
+  fclose (rout);
+  fclose (iout);
+}
+
+// Builds a periodic single-precision 3D non-uniform spline from random
+// data on a 20x27x23 center grid, writes its value on a 200x200x200
+// sampling of the domain to "3d_s.dat" (one line per x sample), prints the
+// spline's sp_code to stderr and destroys the spline.
+void
+TestNUB_3d_s()
+{
+  int Mx=20, My=27, Mz=23;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0,  7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0,  3.5, My);
+  NUgrid *z_grid = create_center_grid (-1.8, 2.0,  2.8, Mz);
+
+  // The (ix,iy,iz) sample lives at (ix*My+iy)*Mz+iz, so filling the array
+  // front to back visits the samples in the same order as nested loops.
+  float data[Mx*My*Mz];
+  int num_samples = Mx*My*Mz;
+  for (int n=0; n<num_samples; n++)
+    data[n] = -1.0+2.0*drand48();
+  
+  BCtype_s xBC, yBC, zBC;
+  xBC.lCode = xBC.rCode = PERIODIC;
+  yBC.lCode = yBC.rCode = PERIODIC;
+  zBC.lCode = zBC.rCode = PERIODIC;
+
+  NUBspline_3d_s *spline = create_NUBspline_3d_s (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
+  
+  int xFine = 200, yFine = 200, zFine=200;
+  FILE *fout = fopen ("3d_s.dat", "w");
+  double xi = x_grid->start,  xf = x_grid->end;
+  double yi = y_grid->start,  yf = y_grid->end;
+  double zi = z_grid->start,  zf = z_grid->end;
+  for (int jx=0; jx<xFine; jx++) {
+    double x_frac = (double)jx/(double)(xFine);
+    double x = xi + x_frac*(xf-xi);
+    for (int jy=0; jy<yFine; jy++) {
+      double y_frac = (double)jy/(double)(yFine);
+      double y = yi + y_frac*(yf-yi);
+      for (int jz=0; jz<zFine; jz++) {
+        double z_frac = (double)jz/(double)(zFine);
+        double z = zi + z_frac*(zf-zi);
+        float val, grad[3], hess[9];
+        eval_NUBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
+        fprintf (fout, "%1.16e ", val);
+      }
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+  fprintf (stderr, "spline->sp_code = %d\n", spline->sp_code);
+  destroy_Bspline (spline);
+}
+
+
+// Builds a periodic double-precision 3D non-uniform spline from random
+// data on a 20x27x23 center grid, writes its value on a 200x200x200
+// sampling of the domain to "3d_d.dat" (one line per x sample), prints the
+// spline's sp_code to stderr and destroys the spline.
+void
+TestNUB_3d_d()
+{
+  int Mx=20, My=27, Mz=23;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0,  7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0,  3.5, My);
+  NUgrid *z_grid = create_center_grid (-1.8, 2.0,  2.8, Mz);
+
+  // The (ix,iy,iz) sample lives at (ix*My+iy)*Mz+iz, so filling the array
+  // front to back visits the samples in the same order as nested loops.
+  double data[Mx*My*Mz];
+  int num_samples = Mx*My*Mz;
+  for (int n=0; n<num_samples; n++)
+    data[n] = -1.0+2.0*drand48();
+  
+  BCtype_d xBC, yBC, zBC;
+  xBC.lCode = xBC.rCode = PERIODIC;
+  yBC.lCode = yBC.rCode = PERIODIC;
+  zBC.lCode = zBC.rCode = PERIODIC;
+
+  NUBspline_3d_d *spline = create_NUBspline_3d_d (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
+  
+  int xFine = 200, yFine = 200, zFine=200;
+  FILE *fout = fopen ("3d_d.dat", "w");
+  double xi = x_grid->start,  xf = x_grid->end;
+  double yi = y_grid->start,  yf = y_grid->end;
+  double zi = z_grid->start,  zf = z_grid->end;
+  for (int jx=0; jx<xFine; jx++) {
+    double x_frac = (double)jx/(double)(xFine);
+    double x = xi + x_frac*(xf-xi);
+    for (int jy=0; jy<yFine; jy++) {
+      double y_frac = (double)jy/(double)(yFine);
+      double y = yi + y_frac*(yf-yi);
+      for (int jz=0; jz<zFine; jz++) {
+        double z_frac = (double)jz/(double)(zFine);
+        double z = zi + z_frac*(zf-zi);
+        double val, grad[3], hess[9];
+        eval_NUBspline_3d_d_vgh (spline, x, y, z, &val, grad, hess);
+        fprintf (fout, "%1.16e ", val);
+      }
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+  fprintf (stderr, "spline->sp_code = %d\n", spline->sp_code);
+  destroy_Bspline (spline);
+}
+
+// Builds a periodic single-precision complex 3D non-uniform spline from
+// random data on a 20x27x23 center grid and writes the real and imaginary
+// parts of its value on a 200x200x200 sampling of the domain to
+// "3d_r.dat" and "3d_i.dat" respectively (one line per x sample).
+void
+TestNUB_3d_c()
+{
+  int Mx=20, My=27, Mz=23;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0,  7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0,  3.5, My);
+  NUgrid *z_grid = create_center_grid (-1.8, 2.0,  2.8, Mz);
+
+  // The (ix,iy,iz) sample lives at (ix*My+iy)*Mz+iz, so filling the array
+  // front to back visits the samples in the same order as nested loops.
+  complex_float data[Mx*My*Mz];
+  int num_samples = Mx*My*Mz;
+  for (int n=0; n<num_samples; n++)
+    data[n] = -1.0+2.0*drand48() + 1.0if*(-1.0+2.0*drand48());
+  
+  BCtype_c xBC, yBC, zBC;
+  xBC.lCode = xBC.rCode = PERIODIC;
+  yBC.lCode = yBC.rCode = PERIODIC;
+  zBC.lCode = zBC.rCode = PERIODIC;
+
+  NUBspline_3d_c *spline = create_NUBspline_3d_c (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
+  
+  int xFine = 200, yFine = 200, zFine=200;
+  FILE *rout = fopen ("3d_r.dat", "w");
+  FILE *iout = fopen ("3d_i.dat", "w");
+  double xi = x_grid->start,  xf = x_grid->end;
+  double yi = y_grid->start,  yf = y_grid->end;
+  double zi = z_grid->start,  zf = z_grid->end;
+  for (int jx=0; jx<xFine; jx++) {
+    double x_frac = (double)jx/(double)(xFine);
+    double x = xi + x_frac*(xf-xi);
+    for (int jy=0; jy<yFine; jy++) {
+      double y_frac = (double)jy/(double)(yFine);
+      double y = yi + y_frac*(yf-yi);
+      for (int jz=0; jz<zFine; jz++) {
+        double z_frac = (double)jz/(double)(zFine);
+        double z = zi + z_frac*(zf-zi);
+        complex_float val, grad[3], hess[9];
+        eval_NUBspline_3d_c_vgh (spline, x, y, z, &val, grad, hess);
+        fprintf (rout, "%1.16e ", crealf(val));
+        fprintf (iout, "%1.16e ", cimagf(val));
+      }
+    }
+    fprintf (rout, "\n");
+    fprintf (iout, "\n");
+  }
+  fclose (rout);
+  fclose (iout);
+}
+
+
+// Builds a periodic double-precision complex 3D non-uniform spline from
+// random data on a 20x27x23 center grid and writes the real and imaginary
+// parts of its value on a 200x200x200 sampling of the domain to
+// "3d_r.dat" and "3d_i.dat" respectively (one line per x sample).
+//
+// The spline values are complex_double, so the double-precision accessors
+// creal/cimag are used; the float variants would round the result to
+// single precision before it is written with 16 significant digits.
+void
+TestNUB_3d_z()
+{
+  int Mx=20, My=27, Mz=23;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0,  7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0,  3.5, My);
+  NUgrid *z_grid = create_center_grid (-1.8, 2.0,  2.8, Mz);
+  complex_double data[Mx*My*Mz];
+  for (int ix=0; ix<Mx; ix++)
+    for (int iy=0; iy<My; iy++)
+      for (int iz=0; iz<Mz; iz++)
+        data[(ix*My+iy)*Mz+iz] = -1.0+2.0*drand48() + 1.0if*(-1.0+2.0*drand48());
+  
+  BCtype_z xBC, yBC, zBC;
+  xBC.lCode = PERIODIC;  xBC.rCode = PERIODIC;
+  yBC.lCode = PERIODIC;  yBC.rCode = PERIODIC;
+  zBC.lCode = PERIODIC;  zBC.rCode = PERIODIC;
+
+  NUBspline_3d_z *spline = create_NUBspline_3d_z (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
+  
+  int xFine = 200, yFine = 200, zFine=200;
+  FILE *rout = fopen ("3d_r.dat", "w");
+  FILE *iout = fopen ("3d_i.dat", "w");
+  double xi = x_grid->start;  double xf = x_grid->end;
+  double yi = y_grid->start;  double yf = y_grid->end;
+  double zi = z_grid->start;  double zf = z_grid->end;
+  for (int ix=0; ix<xFine; ix++) {
+    double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
+    for (int iy=0; iy<yFine; iy++) {
+      double y = yi + (double)iy/(double)(yFine)*(yf-yi);
+      for (int iz=0; iz<zFine; iz++) {
+        double z = zi + (double)iz/(double)(zFine)*(zf-zi);
+        complex_double val, grad[3], hess[9];
+        eval_NUBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
+        fprintf (rout, "%1.16e ", creal(val));
+        fprintf (iout, "%1.16e ", cimag(val));
+      }
+    }
+    fprintf (rout, "\n");
+    fprintf (iout, "\n");
+  }
+  fclose (rout);
+  fclose (iout);
+}
+
+// Benchmarks eval_NUBspline_3d_s_vgh on a 200x200x200 periodic
+// single-precision non-uniform spline built from random data.
+//
+// The cost of generating the random evaluation points is timed separately
+// and subtracted, so the figure printed to stderr covers the 10,000,000
+// spline evaluations only.
+void
+SpeedNUB_3d_s()
+{
+  int Mx=200, My=200, Mz=200;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0,  1.0001, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0,  1.0001, My);
+  NUgrid *z_grid = create_center_grid (-1.8, 2.0,  1.0001, Mz);
+
+  // 200^3 floats is a large request; make sure it succeeded.
+  float *data;
+  data = malloc (sizeof(float)*Mx*My*Mz);
+  if (data == NULL) {
+    fprintf (stderr, "SpeedNUB_3d_s:  failed to allocate the input data.\n");
+    return;
+  }
+  for (int ix=0; ix<Mx; ix++)
+    for (int iy=0; iy<My; iy++)
+      for (int iz=0; iz<Mz; iz++)
+        data[(ix*My+iy)*Mz+iz] = -1.0+2.0*drand48();
+  
+  BCtype_s xBC, yBC, zBC;
+  xBC.lCode = PERIODIC;  xBC.rCode = PERIODIC;
+  yBC.lCode = PERIODIC;  yBC.rCode = PERIODIC;
+  zBC.lCode = PERIODIC;  zBC.rCode = PERIODIC;
+
+  NUBspline_3d_s *spline = create_NUBspline_3d_s (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
+ 
+  float val, grad[3], hess[9];
+  clock_t start, end, rstart, rend;
+
+  // Baseline: the random-point generation on its own.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
+    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
+    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
+  }
+  rend = clock();
+
+  // Same point generation plus the spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
+    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
+    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
+    eval_NUBspline_3d_s_vgh (spline, x, y, z, &val, grad, hess);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n", 
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+
+  free (data);
+}
+
+
+// Benchmarks eval_NUBspline_3d_z_vgh on a 200x200x200 periodic
+// double-precision complex non-uniform spline built from random data.
+//
+// The cost of generating the random evaluation points is timed separately
+// and subtracted, so the figure printed to stderr covers the 10,000,000
+// spline evaluations only.
+void
+SpeedNUB_3d_z()
+{
+  int Mx=200, My=200, Mz=200;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0,  7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0,  3.5, My);
+  NUgrid *z_grid = create_center_grid (-1.8, 2.0,  2.8, Mz);
+
+  // 200^3 complex doubles is a large request; make sure it succeeded.
+  complex_double *data = malloc (sizeof(complex_double)*Mx*My*Mz);
+  if (data == NULL) {
+    fprintf (stderr, "SpeedNUB_3d_z:  failed to allocate the input data.\n");
+    return;
+  }
+  for (int ix=0; ix<Mx; ix++)
+    for (int iy=0; iy<My; iy++)
+      for (int iz=0; iz<Mz; iz++)
+        data[(ix*My+iy)*Mz+iz] = -1.0+2.0*drand48() + 1.0if*(-1.0+2.0*drand48());
+  
+  BCtype_z xBC, yBC, zBC;
+  xBC.lCode = PERIODIC;  xBC.rCode = PERIODIC;
+  yBC.lCode = PERIODIC;  yBC.rCode = PERIODIC;
+  zBC.lCode = PERIODIC;  zBC.rCode = PERIODIC;
+
+  NUBspline_3d_z *spline = create_NUBspline_3d_z (x_grid, y_grid, z_grid, xBC, yBC, zBC, data);
+  complex_double val, grad[3], hess[9];
+  clock_t start, end, rstart, rend;
+
+  // Baseline: the random-point generation on its own.
+  rstart = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
+    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
+    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
+  }
+  rend = clock();
+
+  // Same point generation plus the spline evaluation.
+  start = clock();
+  for (int i=0; i<10000000; i++) {
+    double x = x_grid->start+ 0.9999*drand48()*(x_grid->end - x_grid->start);
+    double y = y_grid->start+ 0.9999*drand48()*(y_grid->end - y_grid->start);
+    double z = z_grid->start+ 0.9999*drand48()*(z_grid->end - z_grid->start);
+    eval_NUBspline_3d_z_vgh (spline, x, y, z, &val, grad, hess);
+  }
+  end = clock();
+  fprintf (stderr, "10,000,000 evalations in %f seconds.\n", 
+           (double)(end-start-(rend-rstart))/(double)CLOCKS_PER_SEC);
+
+  free (data);
+}
+
+
+// Builds a periodic double-precision 2D non-uniform spline from random
+// data on a 30x35 center grid and writes its value on a 400x400 sampling
+// of the domain to "2d_d.dat" (one line per x sample).
+//
+// Both the left and right boundary codes are set, so no field of the
+// BCtype_d structs passed by value to the constructor is left
+// uninitialised (the 3D tests in this file already set both).
+void
+TestNUB_2d_d()
+{
+  int Mx=30, My=35;
+  NUgrid *x_grid = create_center_grid (-3.0, 4.0, 7.5, Mx);
+  NUgrid *y_grid = create_center_grid (-1.0, 9.0, 3.5, My);
+  double data[Mx*My];
+  for (int ix=0; ix<Mx; ix++)
+    for (int iy=0; iy<My; iy++)
+      data[ix*My+iy] = -1.0+2.0*drand48();
+  
+  BCtype_d xBC, yBC;
+  xBC.lCode = PERIODIC;  xBC.rCode = PERIODIC;
+  yBC.lCode = PERIODIC;  yBC.rCode = PERIODIC;
+//   xBC.lCode = FLAT;  xBC.rCode = FLAT;
+//   yBC.lCode = FLAT;  yBC.rCode = FLAT;
+
+  NUBspline_2d_d *spline = create_NUBspline_2d_d (x_grid, y_grid, xBC, yBC, data);
+  
+  int xFine = 400;
+  int yFine = 400;
+  FILE *fout = fopen ("2d_d.dat", "w");
+  if (fout == NULL) {
+    fprintf (stderr, "TestNUB_2d_d:  could not open 2d_d.dat for writing.\n");
+    return;
+  }
+  double xi = x_grid->start;
+  double xf = x_grid->end;// + x_grid->points[1] - x_grid->points[0];
+  double yi = y_grid->start;
+  double yf = y_grid->end;// + y_grid->points[1] - y_grid->points[0];
+  for (int ix=0; ix<xFine; ix++) {
+    double x = xi+ (double)ix/(double)(xFine)*(xf-xi);
+    for (int iy=0; iy<yFine; iy++) {
+      double y = yi + (double)iy/(double)(yFine)*(yf-yi);
+      double val;
+      eval_NUBspline_2d_d (spline, x, y, &val);
+      fprintf (fout, "%1.16e ", val);
+    }
+    fprintf (fout, "\n");
+  }
+  fclose (fout);
+}
+
+// Test driver for the non-uniform B-spline routines.  Only the
+// double-precision 1D spline test is enabled; the remaining tests and
+// benchmarks are listed below as commented-out toggles.
+int main()
+{
+  TestNUBspline_d();
+
+  // Disabled grid and basis checks:
+  //   TestCenterGrid();
+  //   TestGeneralGrid();
+  //   GridSpeedTest();
+  //   TestNUBasis();
+  //
+  // Disabled spline tests and benchmarks:
+  //   TestNUB_2d_s();
+  //   TestNUB_2d_c();
+  //   TestNUB_3d_c();
+  //   SpeedNUB_3d_s();
+  //   TestNUB_2d_d();
+  //   TestNUB_3d_d();
+  //   TestNUB_3d_z();
+  //   SpeedNUB_3d_z();
+  //   bool passed = TestNUB_1d_s();
+
+  return 0;
+}
+
--- a/src/einspline/aligned_alloc.h
+++ b/src/einspline/aligned_alloc.h
@ -0,0 +1,49 @@
+#ifndef ALIGNED_ALLOC_H
+#define ALIGNED_ALLOC_H
+
+#include <stdlib.h>
+#include "config.h"
+
+#ifdef HAVE_POSIX_MEMALIGN
+// Allocates `size` bytes whose address is a multiple of `alignment`,
+// using posix_memalign.
+//
+// Returns the allocated block, or NULL when posix_memalign reports an
+// error (for example an alignment it does not accept, or lack of memory).
+// Release the block with aligned_free().
+//
+// NOTE(review): C11 <stdlib.h> declares its own aligned_alloc with the
+// arguments in the opposite order (alignment, size) -- confirm that this
+// header is never compiled in a mode where the two declarations clash.
+inline void *
+aligned_alloc (size_t size, size_t alignment)
+{
+  void *ptr = NULL;
+  // On failure the pointer must not be trusted; report NULL instead.
+  if (posix_memalign (&ptr, alignment, size) != 0)
+    return NULL;
+  return ptr;
+}
+
+// Releases a block obtained from the aligned_alloc() variant above by
+// handing the pointer straight to free().
+inline void
+aligned_free (void *ptr)
+{
+  free (ptr);
+}
+
+#else
+
+// Fallback aligned allocator built on malloc.
+//
+// Over-allocates by (alignment-1) + sizeof(void*) bytes, advances past
+// room for one pointer, rounds up to the next multiple of `alignment`,
+// and stores the address returned by malloc in the pointer-sized slot
+// immediately before the aligned block so that aligned_free() can
+// recover it.
+//
+// The round-up distance is taken modulo `alignment`: when the shifted
+// address is already aligned it is 0, not `alignment`, otherwise the
+// aligned block would extend one byte past the end of the allocation.
+//
+// Returns the aligned block, or NULL when malloc fails.
+inline void *
+aligned_alloc (size_t size, size_t alignment)
+{
+  size_t total = size + (alignment-1) + sizeof(void*);
+  char *raw = (char*) malloc (total);
+  if (raw == NULL)
+    return NULL;
+
+  // Leave space for the bookkeeping pointer, then round up.
+  char *shifted = raw + sizeof(void*);
+  size_t offset = (alignment - (size_t)shifted%alignment) % alignment;
+  char *aligned = shifted + offset;
+
+  // Remember the original malloc address just below the aligned block.
+  *((void**)aligned-1) = raw;
+  return aligned;
+}
+
+// Releases a block obtained from the malloc-based aligned_alloc().
+//
+// The address originally returned by malloc is read back from the
+// pointer-sized slot immediately before the aligned block.  A NULL
+// argument is ignored, matching free() and the posix_memalign-based
+// variant of this function.
+inline void 
+aligned_free (void *aligned)
+{
+  if (aligned == NULL)
+    return;
+  void *ptr = *((void**)aligned-1);
+  free (ptr);
+}
+#endif
+
+
+#endif
--- a/src/einspline/blip_create.c
+++ b/src/einspline/blip_create.c
@ -0,0 +1,176 @@
+#include "blip_create.h"
+#include <math.h>
+#include <complex.h>
+#include <fftw3.h>
+#include "config.h"
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+
+#define _XOPEN_SOURCE 600
+#include <stdlib.h>
+#include <math.h>
+#include <aligned_alloc.h>
+
+void init_sse_data();
+
+// Round `ptr` up to the next 16-byte boundary.  An address that is already
+// a multiple of 16 is returned unchanged.
+inline 
+void* FFTAlign (void* ptr)
+{
+  size_t addr = (size_t)ptr;
+  // Binary '-' binds tighter than '&', so the padding is (16 - addr) mod 16.
+  size_t pad = (16 - addr) & 0x0f;
+  return (void*) (addr + pad);
+}
+
+// Inner product of two 3-component vectors.
+inline double dot (double a[3], double b[3])
+{
+  double xx = a[0]*b[0];
+  double yy = a[1]*b[1];
+  double zz = a[2]*b[2];
+  return (xx + yy + zz);
+}
+
+// Per-axis factor by which each plane-wave coefficient is divided before
+// the FFT.  Returns 3/G^4 * (3 - 4 cos(G) + cos(2G)); for |G| <= 1e-10 the
+// G -> 0 limit of that expression, 1.5, is returned instead so that the
+// closed form is never evaluated with a vanishing denominator.
+static double
+blip_gamma_1d (double G)
+{
+  if (fabs(G) > 1.0e-10)
+    return (3.0/(G*G*G*G)*(3.0 - 4.0*cos(G) + cos(2.0*G)));
+  else
+    return 1.5;
+}
+
+// This function creates a single-precision real blip function from a
+// set of plane-wave coefficients.  lattice is a 3x3 array specifying
+// the lattice vectors.  The first lattice vector is given
+// contiguously at lattice[0], the second at lattice[3], and the third
+// at lattice[6].  The next is a list of 3D G-vectors in the format:
+// G_x[0] G_y[0] G_z[0], G_x[1], G_y[1], G_z[1],...
+// Next, complex plane-wave coefficients are given, one for each
+// G-vector.  Next, the number of G-vectors is given, followed by
+// a factor which increases the density of the real-space grid.  A
+// factor of 1.0 uses the minimum density to avoid aliasing.  Finally,
+// the last parameter specifies whether to take the real or imaginary part.
+// The spline is constructed to have domain [0,1) for x, y, and z coordinates. 
+//
+// Returns a newly allocated spline owned by the caller, or NULL (after
+// printing a message to stderr) if the resulting grid size is not positive
+// or if an allocation or the FFTW plan creation fails.
+UBspline_3d_s*
+create_blip_3d_s (double *lattice, double *Gvecs, 
+                  complex_float *coefs, int numG,
+                  double factor, bool useReal)
+{
+  int max_ix=0, max_iy=0, max_iz=0;
+  int Mx, My, Mz;
+  double twoPiInv = 1.0/(2.0*M_PI);
+  // Find the largest integer index magnitude along each lattice direction.
+  // The magnitude (not the signed index) must be recorded: storing a
+  // negative index here would shrink or even negate the FFT box size.
+  for (int i=0; i<numG; i++) {
+    double *G = Gvecs+3*i;
+    int ix = abs ((int) round (twoPiInv * dot (lattice+0, G)));
+    int iy = abs ((int) round (twoPiInv * dot (lattice+3, G)));
+    int iz = abs ((int) round (twoPiInv * dot (lattice+6, G)));
+    if (ix > max_ix)   max_ix = ix;
+    if (iy > max_iy)   max_iy = iy;
+    if (iz > max_iz)   max_iz = iz;
+  }
+  Mx = 4*max_ix + 1;
+  My = 4*max_iy + 1;
+  Mz = 4*max_iz + 1;
+  Mx = (int) ceil(factor*Mx);
+  My = (int) ceil(factor*My);
+  Mz = (int) ceil(factor*Mz);
+
+  // FFTs are a little faster with even dimensions.
+  if ((Mx%2)==1) Mx++;
+  if ((My%2)==1) My++;
+  if ((Mz%2)==1) Mz++;
+
+  fprintf (stderr, "(Mx, My, Mz) = (%d, %d, %d)\n", Mx, My, Mz);
+
+  // A non-positive factor yields an empty or negative box; every modulo
+  // and allocation below requires strictly positive dimensions.
+  if (Mx <= 0 || My <= 0 || Mz <= 0) {
+    fprintf (stderr, "create_blip_3d_s: invalid FFT box dimensions.\n");
+    return NULL;
+  }
+
+  // Now allocate space for FFT box
+  size_t box_size = (size_t)Mx*My*Mz;
+  complex_float *fft_box;
+  fft_box = aligned_alloc (sizeof(complex_float)*box_size, 16);
+  if (fft_box == NULL) {
+    fprintf (stderr, "create_blip_3d_s: failed to allocate FFT box.\n");
+    return NULL;
+  }
+
+  // Create FFTW plan
+  fftwf_plan plan = 
+  fftwf_plan_dft_3d (Mx, My, Mz, (fftwf_complex*)fft_box, (fftwf_complex*)fft_box, 1,
+                     FFTW_ESTIMATE);
+  if (plan == NULL) {
+    fprintf (stderr, "create_blip_3d_s: failed to create FFTW plan.\n");
+    aligned_free (fft_box);
+    return NULL;
+  }
+  
+  // Zero-out fft-box
+  for (size_t i=0; i<box_size; i++)
+    fft_box[i] = (complex_float)0.0f;
+  
+  // Now fill in fft box with coefficients in the right places
+  double MxInv = 1.0/(double)Mx;
+  double MyInv = 1.0/(double)My;
+  double MzInv = 1.0/(double)Mz;
+  double scale = 1.0/3.375;
+  for (int i=0; i<numG; i++) {
+    double *g = Gvecs+3*i;
+    double G[3];
+    G[0] = MxInv*(lattice[0]*g[0] + lattice[3]*g[1] + lattice[6]*g[2]);
+    G[1] = MyInv*(lattice[1]*g[0] + lattice[4]*g[1] + lattice[7]*g[2]);
+    G[2] = MzInv*(lattice[2]*g[0] + lattice[5]*g[1] + lattice[8]*g[2]);
+    int ix = round (twoPiInv * dot (lattice+0, g));
+    int iy = round (twoPiInv * dot (lattice+3, g));
+    int iz = round (twoPiInv * dot (lattice+6, g));
+    // Wrap into [0, M).  The double modulo keeps the result non-negative
+    // even when |index| exceeds M (possible when factor < 1).
+    ix = ((ix % Mx) + Mx)%Mx;
+    iy = ((iy % My) + My)%My;
+    iz = ((iz % Mz) + Mz)%Mz;
+    double gamma = 1.0;
+    gamma *= blip_gamma_1d (G[0]);
+    gamma *= blip_gamma_1d (G[1]);
+    gamma *= blip_gamma_1d (G[2]);
+    gamma *= scale;
+    fft_box[(ix*My+iy)*Mz+iz] = coefs[i]/gamma;
+  }
+  
+  // Execute the FFTW plan
+  fftwf_execute (plan);
+  // Destroy plan
+  fftwf_destroy_plan (plan);
+
+  // Now we have the coefficients in the FFT box.  We must allocate a
+  // little bit larger box to hold the B-spline coefficients
+  UBspline_3d_s* restrict spline = malloc (sizeof (UBspline_3d_s));
+  if (spline == NULL) {
+    fprintf (stderr, "create_blip_3d_s: failed to allocate spline.\n");
+    aligned_free (fft_box);
+    return NULL;
+  }
+  spline->spcode = U3D;
+  spline->tcode  = SINGLE_REAL;
+  Ugrid x_grid, y_grid, z_grid;
+  int Nx = Mx + 3;
+  int Ny = My + 3;
+  int Nz = Mz + 3;
+  x_grid.start = 0.0;  x_grid.end = 1.0;  x_grid.num = Mx;  
+  x_grid.delta = 1.0/(double)Mx;    x_grid.delta_inv = 1.0/x_grid.delta;
+  y_grid.start = 0.0;  y_grid.end = 1.0;  y_grid.num = My;  
+  y_grid.delta = 1.0/(double)My;    y_grid.delta_inv = 1.0/y_grid.delta;
+  z_grid.start = 0.0;  z_grid.end = 1.0;  z_grid.num = Mz;  
+  z_grid.delta = 1.0/(double)Mz;      z_grid.delta_inv = 1.0/z_grid.delta;
+  spline->x_grid = x_grid;
+  spline->y_grid = y_grid;
+  spline->z_grid = z_grid;
+  spline->x_stride = Ny*Nz;
+  spline->y_stride = Nz;
+  spline->xBC.lCode = PERIODIC;  spline->xBC.rCode = PERIODIC;
+  spline->yBC.lCode = PERIODIC;  spline->yBC.rCode = PERIODIC;
+  spline->zBC.lCode = PERIODIC;  spline->zBC.rCode = PERIODIC;
+  
+#ifndef HAVE_SSE2
+  spline->coefs      = malloc (sizeof(float)*Nx*Ny*Nz);
+#else
+  // posix_memalign() reports failure via its return value only.
+  if (posix_memalign ((void**)&spline->coefs, 16, sizeof(float)*Nx*Ny*Nz) != 0)
+    spline->coefs = NULL;
+#endif 
+  if (spline->coefs == NULL) {
+    fprintf (stderr, "create_blip_3d_s: failed to allocate coefficients.\n");
+    free (spline);
+    aligned_free (fft_box);
+    return NULL;
+  }
+
+  // Now copy data into spline coefficients, observing periodic boundary conditions
+  for (int ix=0; ix<Nx; ix++) {
+    int jx = (ix-1 + Mx)%Mx;
+    for (int iy=0; iy < Ny; iy++) {
+      int jy = (iy-1 + My)%My;
+      for (int iz=0; iz < Nz; iz++) {
+        int jz = (iz-1 + Mz)%Mz;
+        if (useReal)
+          spline->coefs[(ix*Ny+iy)*Nz+iz] = 
+            crealf (fft_box[(jx*My+jy)*Mz+jz]);
+        else
+          spline->coefs[(ix*Ny+iy)*Nz+iz] = 
+            cimagf (fft_box[(jx*My+jy)*Mz+jz]);
+      }
+    }
+  }
+      
+  aligned_free (fft_box);
+
+  init_sse_data();
+  return spline;
+}
--- a/src/einspline/blip_create.h
+++ b/src/einspline/blip_create.h
@ -0,0 +1,56 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BLIP_CREATE_H
+#define BLIP_CREATE_H
+
+#include "bspline_base.h"
+#include "bspline_structs.h"
+#include <stdbool.h>
+
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////              Blip creation functions               ////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+
+// Blip constructors.  All four take the lattice (3x3, row-contiguous),
+// a packed list of 3D G-vectors, one complex plane-wave coefficient per
+// G-vector, the number of G-vectors, and a grid-density factor.
+
+// Single-precision real blip; useReal selects the real (true) or
+// imaginary (false) part of the transformed data.
+UBspline_3d_s*
+create_blip_3d_s (double *lattice, double *Gvecs, 
+		  complex_float *coefs, int numG,
+		  double factor, bool useReal);
+
+// Double-precision real counterpart of create_blip_3d_s (definition not
+// visible here -- presumably the same semantics; verify).
+UBspline_3d_d*
+create_blip_3d_d (double *lattice, double *Gvecs, 
+		  complex_double *coefs, int numG,
+		  double factor, bool useReal);
+
+// Single-precision complex blip; no useReal selector is taken.
+UBspline_3d_c*
+create_blip_3d_c (double *lattice, double *Gvecs, 
+		  complex_float *coefs, int numG,
+		  double factor);
+
+// Double-precision complex blip; no useReal selector is taken.
+UBspline_3d_z*
+create_blip_3d_z (double *lattice, double *Gvecs, 
+		  complex_double *coefs, int numG,
+		  double factor);
+
+
+
+#endif
--- a/src/einspline/bspline.h
+++ b/src/einspline/bspline.h
@ -0,0 +1,58 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_H
+#define BSPLINE_H
+
+#include "bspline_base.h"
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////           Bspline structure definitions            ////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+#include "bspline_structs.h"
+#include "multi_bspline_structs.h"
+
+// Currently, some of the single-precision routines use SSE2 instructions
+#ifdef HAVE_SSE2
+#include "bspline_eval_sse_s.h"
+#include "bspline_eval_sse_c.h"
+#include "bspline_eval_sse_d.h"
+#include "bspline_eval_sse_z.h"
+#elif defined HAVE_SSE
+#include "bspline_eval_sse_s.h"
+#include "bspline_eval_sse_c.h"
+#include "bspline_eval_std_d.h"
+#include "bspline_eval_std_z.h"
+#elif defined USE_ALTIVEC
+#include "bspline_eval_altivec_s.h"
+#include "bspline_eval_std_c.h"
+#include "bspline_eval_std_d.h"
+#include "bspline_eval_std_z.h"
+#else
+#include "bspline_eval_std_s.h"
+#include "bspline_eval_std_c.h"
+#include "bspline_eval_std_d.h"
+#include "bspline_eval_std_z.h"
+#endif
+
+#include "bspline_create.h"
+#include "multi_bspline_create.h"
+#endif
--- a/src/einspline/bspline_base.h
+++ b/src/einspline/bspline_base.h
@ -0,0 +1,104 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_BASE_H
+#define BSPLINE_BASE_H
+
+#include "config.h"
+
+// Complex scalar types used by the library: std::complex<T> when this
+// header is compiled as C++, C99 <complex.h> types when compiled as C.
+#ifdef __cplusplus
+#include <complex>
+typedef std::complex<float>  complex_float;
+typedef std::complex<double> complex_double;
+#else
+#include <complex.h>
+typedef complex float  complex_float;
+typedef complex double complex_double;
+#endif
+
+// Conventions:
+// Postfixes:  
+// s:  single precision real
+// d:  double precision real
+// c:  single precision complex
+// z:  double precision complex
+
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////              Basic type declarations               ////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+
+// bc_code:     boundary-condition selector stored in the BCtype_* structs.
+// spline_code: spline kind -- uniform (U*) or non-uniform (NU*) grid,
+//              1/2/3 dimensions, single or MULTI_ variant.
+// type_code:   scalar type of the coefficients (matches the s/d/c/z
+//              postfix convention above).
+typedef enum { PERIODIC, DERIV1, DERIV2, FLAT, NATURAL, ANTIPERIODIC } bc_code;
+typedef enum { U1D       , U2D       , U3D      , 
+	       NU1D      , NU2D      , NU3D     ,
+               MULTI_U1D , MULTI_U2D , MULTI_U3D,
+               MULTI_NU1D, MULTI_NU2D, MULTI_NU3D } spline_code;
+typedef enum { SINGLE_REAL, DOUBLE_REAL, SINGLE_COMPLEX, DOUBLE_COMPLEX }
+  type_code;
+
+// Boundary-condition descriptors, one per scalar type.  lCode/rCode select
+// the condition at the left/right end of a grid; the *Val fields carry an
+// associated value (presumably the prescribed derivative for DERIV1 /
+// DERIV2 -- confirm against bspline_create.c).
+
+// Single-precision real.
+typedef struct 
+{
+  bc_code lCode, rCode;
+  float lVal, rVal;
+} BCtype_s;
+
+// Double-precision real.
+typedef struct 
+{
+  bc_code lCode, rCode;
+  double lVal, rVal;
+} BCtype_d;
+
+// Single-precision complex: values split into real (_r) and imaginary (_i).
+typedef struct 
+{
+  bc_code lCode, rCode;
+  float lVal_r, lVal_i, rVal_r, rVal_i;
+} BCtype_c;
+
+// Double-precision complex: values split into real (_r) and imaginary (_i).
+typedef struct 
+{
+  bc_code lCode, rCode;
+  double lVal_r, lVal_i, rVal_r, rVal_i;
+} BCtype_z;
+
+
+// Uniform 1D grid: `num` points spanning [start, end].
+// delta is the grid spacing and delta_inv its reciprocal; both are marked
+// private (presumably filled in by the creation routines -- callers
+// normally set only start, end and num; verify).
+typedef struct
+{
+  double start, end;
+  int num;
+
+  // private
+  double delta, delta_inv;
+} Ugrid;
+
+// Generic spline header: spline kind, scalar type and coefficient pointer.
+// NOTE(review): the concrete spline structs live in bspline_structs.h
+// (not shown); code in this library accesses fields named spcode/tcode
+// on them, so confirm how this header struct relates to their layout.
+typedef struct
+{
+  spline_code sp_code;
+  type_code   t_code;
+  void *restrict coefs;
+} Bspline;
+
+// Destroy a spline of any type; declared with C linkage when this header
+// is included from C++.  (Definition not shown here -- presumably frees
+// the coefficient storage and the spline object itself; verify.)
+#ifdef __cplusplus 
+extern "C" 
+#endif
+void
+destroy_Bspline (void *spline);
+
+#endif
--- a/src/einspline/bspline_base_cuda.h
+++ b/src/einspline/bspline_base_cuda.h
@ -0,0 +1,38 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007-2010 Kenneth P. Esler, Jr.                          //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_BASE_CUDA_H
+#define BSPLINE_BASE_CUDA_H
+
+#include <cuda.h>
+
+// Fallback definitions of the double3 / double4 vector types, compiled
+// only when CUDA_VERSION is below 3000 (presumably because older toolkits
+// do not supply them -- confirm).
+#if CUDA_VERSION < 3000 /* 3.0 */
+typedef struct
+{
+  double x,y,z;
+} double3;
+
+typedef struct
+{
+  double x,y,z,w;
+} double4;
+#endif
+
+#endif
--- a/src/einspline/bspline_create.c
+++ b/src/einspline/bspline_create.c
--- a/src/einspline/bspline_create.h
+++ b/src/einspline/bspline_create.h
@ -0,0 +1,153 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_CREATE_H
+#define BSPLINE_CREATE_H
+
+#include "bspline_base.h"
+#include "bspline_structs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////              Spline creation functions             ////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+
+/////////////////////////////////////
+// Uniform, single precision, real //
+/////////////////////////////////////
+// Create 1D uniform single-precision, real Bspline
+UBspline_1d_s *
+create_UBspline_1d_s (Ugrid x_grid, BCtype_s xBC, float *data);
+
+// Create 2D uniform single-precision, real Bspline
+UBspline_2d_s *
+create_UBspline_2d_s (Ugrid x_grid,   Ugrid y_grid,
+		      BCtype_s   xBC, BCtype_s   yBC,
+		      float *data);
+
+// Create 3D uniform single-precision, real Bspline
+UBspline_3d_s *
+create_UBspline_3d_s (Ugrid x_grid,   Ugrid y_grid,   Ugrid z_grid,
+		      BCtype_s  xBC,  BCtype_s   yBC, BCtype_s   zBC,
+		      float *data);
+
+// Recompute an existing 1D single-precision, real Bspline from new data
+// (presumably reusing the grid and boundary conditions stored in the
+// spline -- definition not shown here).
+void
+recompute_UBspline_1d_s (UBspline_1d_s* spline, float *data);
+
+// Recompute an existing 2D single-precision, real Bspline from new data
+void
+recompute_UBspline_2d_s (UBspline_2d_s* spline, float *data);
+
+// Recompute an existing 3D single-precision, real Bspline from new data
+void
+recompute_UBspline_3d_s (UBspline_3d_s* spline, float *data);
+
+/////////////////////////////////////
+// Uniform, double precision, real //
+/////////////////////////////////////
+// Create 1D uniform double-precision, real Bspline
+UBspline_1d_d *
+create_UBspline_1d_d (Ugrid x_grid, BCtype_d xBC, double *data);
+
+// Create 2D uniform double-precision, real Bspline
+UBspline_2d_d *
+create_UBspline_2d_d (Ugrid x_grid,   Ugrid y_grid,
+		      BCtype_d   xBC, BCtype_d   yBC,
+		      double *data);
+
+// Create 3D uniform double-precision, real Bspline
+UBspline_3d_d *
+create_UBspline_3d_d (Ugrid x_grid,   Ugrid   y_grid,   Ugrid z_grid,
+		      BCtype_d  xBC,  BCtype_d   yBC, BCtype_d   zBC,
+		      double *data);
+
+// Recompute an existing 1D double-precision, real Bspline from new data
+// (presumably reusing the grid and boundary conditions stored in the
+// spline -- definition not shown here).
+void
+recompute_UBspline_1d_d (UBspline_1d_d* spline, double *data);
+
+// Recompute an existing 2D double-precision, real Bspline from new data
+void
+recompute_UBspline_2d_d (UBspline_2d_d* spline, double *data);
+
+// Recompute an existing 3D double-precision, real Bspline from new data
+void
+recompute_UBspline_3d_d (UBspline_3d_d* spline, double *data);
+
+///////////////////////////////////////
+// Uniform, single precision, complex//
+///////////////////////////////////////
+// Create 1D uniform single-precision, complex Bspline
+UBspline_1d_c *
+create_UBspline_1d_c (Ugrid x_grid, BCtype_c xBC, complex_float *data);
+
+// Create 2D uniform single-precision, complex Bspline
+UBspline_2d_c *
+create_UBspline_2d_c (Ugrid   x_grid, Ugrid   y_grid,
+		      BCtype_c   xBC, BCtype_c   yBC,
+		      complex_float *data);
+
+// Create 3D uniform single-precision, complex Bspline
+UBspline_3d_c *
+create_UBspline_3d_c (Ugrid  x_grid, Ugrid y_grid, Ugrid z_grid,
+		      BCtype_c  xBC, BCtype_c yBC, BCtype_c zBC,
+		      complex_float *data);
+
+// Recompute an existing 1D single-precision, complex Bspline from new data
+// (presumably reusing the grid and boundary conditions stored in the
+// spline -- definition not shown here).
+void
+recompute_UBspline_1d_c (UBspline_1d_c* spline, complex_float *data);
+
+// Recompute an existing 2D single-precision, complex Bspline from new data
+void
+recompute_UBspline_2d_c (UBspline_2d_c* spline, complex_float *data);
+
+// Recompute an existing 3D single-precision, complex Bspline from new data
+void
+recompute_UBspline_3d_c (UBspline_3d_c* spline, complex_float *data);
+ 
+///////////////////////////////////////
+// Uniform, double precision, complex//
+///////////////////////////////////////
+// Create 1D uniform double-precision, complex Bspline
+UBspline_1d_z *
+create_UBspline_1d_z (Ugrid x_grid, BCtype_z xBC, complex_double *data);
+
+// Create 2D uniform double-precision, complex Bspline
+UBspline_2d_z *
+create_UBspline_2d_z (Ugrid x_grid, Ugrid y_grid,
+		      BCtype_z   xBC, BCtype_z   yBC,
+		      complex_double *data);
+
+// Create 3D uniform double-precision, complex Bspline
+UBspline_3d_z *
+create_UBspline_3d_z (Ugrid  x_grid, Ugrid   y_grid, Ugrid z_grid,
+		      BCtype_z  xBC, BCtype_z   yBC, BCtype_z zBC,
+		      complex_double *data);
+
+// Recompute an existing 1D double-precision, complex Bspline from new data
+// (presumably reusing the grid and boundary conditions stored in the
+// spline -- definition not shown here).
+void
+recompute_UBspline_1d_z (UBspline_1d_z* spline, complex_double *data);
+
+// Recompute an existing 2D double-precision, complex Bspline from new data
+void
+recompute_UBspline_2d_z (UBspline_2d_z* spline, complex_double *data);
+
+// Recompute an existing 3D double-precision, complex Bspline from new data
+void
+recompute_UBspline_3d_z (UBspline_3d_z* spline, complex_double *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/src/einspline/bspline_create_cuda.cu
+++ b/src/einspline/bspline_create_cuda.cu
@ -0,0 +1,403 @@
+#include <stdio.h>
+
+#include "bspline_base.h"
+#include "bspline_structs.h"
+#include "bspline_structs_cuda.h"
+
+// Device-side copies of the 48-entry spline coefficient table, filled via
+// cudaMemcpyToSymbol by the create_*_cuda functions in this file: Bcuda
+// receives the double-precision table, Acuda the single-precision one.
+__device__ double Bcuda[48];
+__constant__ float  Acuda[48];
+
+// #include "bspline_cuda_s_impl.h"
+// #include "bspline_cuda_c_impl.h"
+// #include "bspline_cuda_d_impl.h"
+// #include "bspline_cuda_z_impl.h"
+
+// Build a device-side copy of a single-precision complex 3D uniform spline.
+// Uploads the 48-entry coefficient table to Acuda, allocates device storage
+// with the z extent padded up to a multiple of 32 elements, and copies the
+// host coefficients into it with the padding zero-filled.
+// The returned descriptor is malloc()ed.  Its x/y strides are stored
+// doubled (2*Ny*N, 2*N) after the copy -- presumably so they count float
+// components rather than complex elements; confirm against the kernels.
+// Prints a message to stderr and aborts if a host allocation, the device
+// allocation or the host-to-device copy fails.
+extern "C" UBspline_3d_c_cuda*
+create_UBspline_3d_c_cuda (UBspline_3d_c* spline)
+{
+  // Rows 0-3: cubic basis polynomials expressed in (t^3, t^2, t, 1);
+  // rows 4-7 and 8-11: their first and second derivatives with respect to t.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  UBspline_3d_c_cuda *cuda_spline =
+    (UBspline_3d_c_cuda*) malloc (sizeof (UBspline_3d_c_cuda));
+  if (cuda_spline == NULL) {
+    fprintf (stderr, "Failed to allocate host memory for GPU spline descriptor.\n");
+    abort();
+  }
+  
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Pad the fastest (z) dimension up to a multiple of 32 elements.
+  int N = ((Nz+31)/32)*32;
+
+  // Strides in complex elements while the staging buffer is being filled.
+  cuda_spline->stride.x = Ny*N;
+  cuda_spline->stride.y = N;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen to size_t before multiplying so large grids cannot overflow int.
+  size_t size = (size_t)Nx*Ny*N*sizeof(std::complex<float>);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+  
+  std::complex<float> *spline_buff = (std::complex<float>*)malloc(size);
+  if (spline_buff == NULL) {
+    fprintf (stderr, "Failed to allocate %lu bytes of host staging memory.\n",
+             (unsigned long)size);
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++) {
+      for (int iz=0; iz<Nz; iz++) 
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + iz] =
+          spline->coefs[ix*spline->x_stride +
+                        iy*spline->y_stride +iz]; 
+      // Zero the padding at the end of each z-row.
+      for (int isp=Nz; isp < N; isp++) {
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + isp] = 0.0;
+      }
+    }
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+  free(spline_buff);
+  
+  // Must happen after the copy loop above, which indexes with the
+  // un-doubled strides.
+  cuda_spline->stride.x = 2*Ny*N;
+  cuda_spline->stride.y = 2*N;
+
+  return cuda_spline;
+}
+
+
+// Build a single-precision complex device spline from a double-precision
+// complex host spline, narrowing every coefficient to std::complex<float>.
+// Uploads the 48-entry coefficient table to Acuda, allocates device storage
+// with the z extent padded up to a multiple of 32 elements, and copies the
+// converted coefficients into it with the padding zero-filled.
+// The returned descriptor is malloc()ed.  Its x/y strides are stored
+// doubled (2*Ny*N, 2*N) after the copy -- presumably so they count float
+// components rather than complex elements; confirm against the kernels.
+// Prints a message to stderr and aborts if a host allocation, the device
+// allocation or the host-to-device copy fails.
+extern "C" UBspline_3d_c_cuda*
+create_UBspline_3d_c_cuda_conv (UBspline_3d_z* spline)
+{
+  // Rows 0-3: cubic basis polynomials expressed in (t^3, t^2, t, 1);
+  // rows 4-7 and 8-11: their first and second derivatives with respect to t.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  UBspline_3d_c_cuda *cuda_spline =
+    (UBspline_3d_c_cuda*) malloc (sizeof (UBspline_3d_c_cuda));
+  if (cuda_spline == NULL) {
+    fprintf (stderr, "Failed to allocate host memory for GPU spline descriptor.\n");
+    abort();
+  }
+  
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Pad the fastest (z) dimension up to a multiple of 32 elements.
+  int N = ((Nz+31)/32) * 32;
+  // Strides in complex elements while the staging buffer is being filled.
+  cuda_spline->stride.x = Ny*N;
+  cuda_spline->stride.y = N;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen to size_t before multiplying so large grids cannot overflow int.
+  size_t size = (size_t)Nx*Ny*N*sizeof(std::complex<float>);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+  
+  std::complex<float> *spline_buff = (std::complex<float>*)malloc(size);
+  if (spline_buff == NULL) {
+    fprintf (stderr, "Failed to allocate %lu bytes of host staging memory.\n",
+             (unsigned long)size);
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++) {
+      for (int iz=0; iz<Nz; iz++) {
+        std::complex<double> z = spline->coefs[ix*spline->x_stride +
+                                               iy*spline->y_stride + iz];
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + iz] = std::complex<float>(z.real(), z.imag());
+      }
+      // Zero the padding once per z-row.  This loop used to sit inside the
+      // iz loop (shadowing iz), so it was repeated Nz times per row.
+      for (int isp=Nz; isp < N; isp++) 
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + isp] = 0.0;
+    }
+
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  // Must happen after the copy loop above, which indexes with the
+  // un-doubled strides.
+  cuda_spline->stride.x = 2*Ny*N;
+  cuda_spline->stride.y = 2*N;
+
+  return cuda_spline;
+}
+
+
+
+
+// Build a device-side copy of a single-precision real 3D uniform spline.
+// Uploads the 48-entry coefficient table to Acuda, allocates device storage
+// with the z extent padded up to a multiple of 32 elements, and copies the
+// host coefficients into it with the padding zero-filled.
+// The returned descriptor is malloc()ed.
+// Prints a message to stderr and aborts if a host allocation, the device
+// allocation or the host-to-device copy fails.
+extern "C" UBspline_3d_s_cuda*
+create_UBspline_3d_s_cuda (UBspline_3d_s* spline)
+{
+  // Rows 0-3: cubic basis polynomials expressed in (t^3, t^2, t, 1);
+  // rows 4-7 and 8-11: their first and second derivatives with respect to t.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  UBspline_3d_s_cuda *cuda_spline =
+    (UBspline_3d_s_cuda*) malloc (sizeof (UBspline_3d_s_cuda));
+  if (cuda_spline == NULL) {
+    fprintf (stderr, "Failed to allocate host memory for GPU spline descriptor.\n");
+    abort();
+  }
+  
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Pad the fastest (z) dimension up to a multiple of 32 elements.
+  int N = ((Nz+31)/32)*32;
+
+  cuda_spline->stride.x = Ny*N;
+  cuda_spline->stride.y = N;
+  cuda_spline->stride.z = 1;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen to size_t before multiplying so large grids cannot overflow int.
+  size_t size = (size_t)Nx*Ny*N*sizeof(float);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+  
+  float *spline_buff = (float*)malloc(size);
+  if (spline_buff == NULL) {
+    fprintf (stderr, "Failed to allocate %lu bytes of host staging memory.\n",
+             (unsigned long)size);
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++) {
+      for (int iz=0; iz<Nz; iz++) 
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + iz] = 
+            spline->coefs[ix*spline->x_stride +
+                          iy*spline->y_stride + iz];
+      // Zero the padding so no uninitialized host memory is uploaded.
+      for (int isp=Nz; isp < N; isp++)
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + isp] = 0.0f;
+    }
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+// Build a single-precision real device spline from a double-precision real
+// host spline; each coefficient is narrowed when stored into the float
+// staging buffer.
+// Uploads the 48-entry coefficient table to Acuda, allocates device storage
+// with the z extent padded up to a multiple of 32 elements, and copies the
+// converted coefficients into it with the padding zero-filled.
+// The returned descriptor is malloc()ed.
+// Prints a message to stderr and aborts if a host allocation fails or if
+// CUDA reports an error after the device allocation or the copy.
+extern "C" UBspline_3d_s_cuda*
+create_UBspline_3d_s_cuda_conv (UBspline_3d_d* spline)
+{
+  // Rows 0-3: cubic basis polynomials expressed in (t^3, t^2, t, 1);
+  // rows 4-7 and 8-11: their first and second derivatives with respect to t.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  UBspline_3d_s_cuda *cuda_spline =
+    (UBspline_3d_s_cuda*) malloc (sizeof (UBspline_3d_s_cuda));
+  if (cuda_spline == NULL) {
+    fprintf (stderr, "Failed to allocate host memory for GPU spline descriptor.\n");
+    abort();
+  }
+  
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Pad the fastest (z) dimension up to a multiple of 32 elements.
+  int N = ((Nz+31)/32)*32;
+  cuda_spline->stride.x = Ny*N;
+  cuda_spline->stride.y = N;
+  cuda_spline->stride.z = 1;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen to size_t before multiplying so large grids cannot overflow int.
+  size_t size = (size_t)Nx*Ny*N*sizeof(float);
+
+  cudaMalloc((void**)&(cuda_spline->coefs), size);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // size_t is printed through an explicit unsigned long cast; passing it
+    // to %ld directly mismatches the format specifier.
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+  
+  float *spline_buff = (float*)malloc(size);
+  if (spline_buff == NULL) {
+    fprintf (stderr, "Failed to allocate %lu bytes of host staging memory.\n",
+             (unsigned long)size);
+    abort();
+  }
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++) {
+      for (int iz=0; iz<Nz; iz++) 
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + iz] = 
+          spline->coefs[ix*spline->x_stride +
+                        iy*spline->y_stride + iz];
+      // Zero the padding so no uninitialized host memory is uploaded.
+      for (int isp=Nz; isp < N; isp++)
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + isp] = 0.0f;
+    }
+  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+
+// Build a device-side copy of a double-precision real 3D uniform spline.
+// Uploads the 48-entry coefficient table to Bcuda, allocates device storage
+// with the z extent padded up to a multiple of 32 elements, and copies the
+// host coefficients into it with the padding zero-filled.
+// The returned descriptor is malloc()ed.
+// Prints a message to stderr and aborts if a host allocation, the device
+// allocation or the host-to-device copy fails.
+extern "C" UBspline_3d_d_cuda*
+create_UBspline_3d_d_cuda (UBspline_3d_d* spline)
+{
+  // Rows 0-3: cubic basis polynomials expressed in (t^3, t^2, t, 1);
+  // rows 4-7 and 8-11: their first and second derivatives with respect to t.
+  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                          0.0,     -0.5,      1.0,    -0.5,
+                          0.0,      1.5,     -2.0,     0.0,
+                          0.0,     -1.5,      1.0,     0.5,
+                          0.0,      0.5,      0.0,     0.0,
+                          0.0,      0.0,     -1.0,     1.0,
+                          0.0,      0.0,      3.0,    -2.0,
+                          0.0,      0.0,     -3.0,     1.0,
+                          0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);
+
+  UBspline_3d_d_cuda *cuda_spline =
+    (UBspline_3d_d_cuda*) malloc (sizeof (UBspline_3d_d_cuda));
+  if (cuda_spline == NULL) {
+    fprintf (stderr, "Failed to allocate host memory for GPU spline descriptor.\n");
+    abort();
+  }
+  
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Pad the fastest (z) dimension up to a multiple of 32 elements.
+  int N = ((Nz+31)/32)*32;
+  cuda_spline->stride.x = Ny*N;
+  cuda_spline->stride.y = N;
+  cuda_spline->stride.z = 1;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen to size_t before multiplying so large grids cannot overflow int.
+  size_t size = (size_t)Nx*Ny*N*sizeof(double);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+  
+  double *spline_buff = (double*)malloc(size);
+  if (spline_buff == NULL) {
+    fprintf (stderr, "Failed to allocate %lu bytes of host staging memory.\n",
+             (unsigned long)size);
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++) {
+      for (int iz=0; iz<Nz; iz++) 
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + iz] = 
+          spline->coefs[ix*spline->x_stride +
+                        iy*spline->y_stride + iz];
+      // Zero the padding so no uninitialized host memory is uploaded.
+      for (int isp=Nz; isp < N; isp++)
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + isp] = 0.0;
+    }
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+// Build a device-side copy of a double-precision complex 3D uniform spline.
+// Uploads the 48-entry coefficient table to Bcuda, allocates device storage
+// with the z extent padded up to a multiple of 32 elements, and copies the
+// host coefficients into it with the padding zero-filled.
+// The returned descriptor is malloc()ed.  Its strides are stored doubled
+// (2*Ny*N, 2*N, 2) after the copy -- presumably so they count double
+// components rather than complex elements; confirm against the kernels.
+// Prints a message to stderr and aborts if a host allocation, the device
+// allocation or the host-to-device copy fails.
+extern "C" UBspline_3d_z_cuda*
+create_UBspline_3d_z_cuda (UBspline_3d_z* spline)
+{
+  // Rows 0-3: cubic basis polynomials expressed in (t^3, t^2, t, 1);
+  // rows 4-7 and 8-11: their first and second derivatives with respect to t.
+  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                          0.0,     -0.5,      1.0,    -0.5,
+                          0.0,      1.5,     -2.0,     0.0,
+                          0.0,     -1.5,      1.0,     0.5,
+                          0.0,      0.5,      0.0,     0.0,
+                          0.0,      0.0,     -1.0,     1.0,
+                          0.0,      0.0,      3.0,    -2.0,
+                          0.0,      0.0,     -3.0,     1.0,
+                          0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);
+
+  UBspline_3d_z_cuda *cuda_spline =
+    (UBspline_3d_z_cuda*) malloc (sizeof (UBspline_3d_z_cuda));
+  if (cuda_spline == NULL) {
+    fprintf (stderr, "Failed to allocate host memory for GPU spline descriptor.\n");
+    abort();
+  }
+  
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Pad the fastest (z) dimension up to a multiple of 32 elements.
+  int N = ((Nz+31)/32)*32;
+  // Strides in complex elements while the staging buffer is being filled.
+  cuda_spline->stride.x = Ny*N;
+  cuda_spline->stride.y = N;  
+  cuda_spline->stride.z = 1;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen to size_t before multiplying so large grids cannot overflow int.
+  size_t size = (size_t)Nx*Ny*N*sizeof(std::complex<double>);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu bytes for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+  
+  std::complex<double> *spline_buff = (std::complex<double>*)malloc(size);
+  if (spline_buff == NULL) {
+    fprintf (stderr, "Failed to allocate %lu bytes of host staging memory.\n",
+             (unsigned long)size);
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++) {
+      for (int iz=0; iz<Nz; iz++) 
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + iz] = 
+          spline->coefs[ix*spline->x_stride +
+                        iy*spline->y_stride + iz];
+      // Zero the padding so no uninitialized host memory is uploaded.
+      for (int isp=Nz; isp < N; isp++)
+        spline_buff[ix*cuda_spline->stride.x +
+                    iy*cuda_spline->stride.y + isp] = 0.0;
+    }
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  // Must happen after the copy loop above, which indexes with the
+  // un-doubled strides.
+  cuda_spline->stride.x = 2*Ny*N;
+  cuda_spline->stride.y = 2*N;
+  cuda_spline->stride.z = 2;
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
--- a/src/einspline/bspline_create_cuda.h
+++ b/src/einspline/bspline_create_cuda.h
@ -0,0 +1,26 @@
+#ifndef BSPLINE_CREATE_CUDA_H
+#define BSPLINE_CREATE_CUDA_H
+
+#include "bspline_structs_cuda.h"
+
+// Builders that take a host-side 3D uniform B-spline and return a pointer
+// to the matching *_cuda descriptor.  All of them have C linkage.
+//
+// Type suffixes follow the spline structs: s = float, d = double,
+// c = complex float, z = complex double.
+
+// Single-precision real spline -> single-precision CUDA descriptor.
+extern "C" UBspline_3d_s_cuda*
+create_UBspline_3d_s_cuda (UBspline_3d_s* spline);
+
+// Takes a double-precision host spline but returns the single-precision
+// CUDA descriptor -- presumably converts the coefficients to float;
+// confirm against the implementation.
+extern "C" UBspline_3d_s_cuda*
+create_UBspline_3d_s_cuda_conv (UBspline_3d_d* spline);
+
+
+// Single-precision complex spline -> single-precision complex descriptor.
+extern "C" UBspline_3d_c_cuda*
+create_UBspline_3d_c_cuda (UBspline_3d_c* spline);
+
+// Takes a double-precision complex host spline but returns the
+// single-precision complex descriptor -- presumably a precision
+// conversion; confirm against the implementation.
+extern "C" UBspline_3d_c_cuda*
+create_UBspline_3d_c_cuda_conv (UBspline_3d_z* spline);
+
+
+// Double-precision real spline -> double-precision CUDA descriptor.
+extern "C" UBspline_3d_d_cuda*
+create_UBspline_3d_d_cuda (UBspline_3d_d* spline);
+
+// Double-precision complex spline -> double-precision complex descriptor.
+extern "C" UBspline_3d_z_cuda*
+create_UBspline_3d_z_cuda (UBspline_3d_z* spline);
+
+#endif
--- a/src/einspline/bspline_cuda_s_impl.h
+++ b/src/einspline/bspline_cuda_s_impl.h
@ -0,0 +1,742 @@
+#ifndef BSPLINE_CUDA_S_IMPL_H
+#define BSPLINE_CUDA_S_IMPL_H
+
+//#include <stdio.h>
+#include "bspline.h"
+#include "bspline_create_cuda.h"
+
+// Kernel: evaluate N spline values at the point selected by blockIdx.y.
+//
+// Launch layout:
+//   blockIdx.y  selects the evaluation point ir,
+//   blockIdx.x  selects a chunk of SPLINE_BLOCK_SIZE consecutive outputs,
+//   threadIdx.x selects one output `off` inside that chunk.
+//
+// Parameters:
+//   pos     - positions, three floats (x,y,z) per point: pos[3*ir+0..2]
+//   drInv   - per-axis scale applied to the position before floor()
+//   coefs   - coefficient array, addressed through `strides`
+//   vals    - vals[ir][off] receives the result for point ir, output off
+//   dim     - per-axis bound; the cell index is clamped to [0, dim-1]
+//   strides - element strides of `coefs` along x, y and z.  This must be a
+//             three-component vector because the body reads strides.z.
+//   N       - number of outputs per point; threads with off >= N only help
+//             to fill shared memory and store nothing
+//
+// NOTE(review): the 64 tensor-product weights are written by threads 0..63,
+// so this presumably requires SPLINE_BLOCK_SIZE >= 64 -- confirm.
+__global__ static void
+eval_multi_UBspline_3d_s_kernel 
+(float *pos, float3 drInv, float *coefs, float *vals[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ float *myval;
+  __shared__ float abc[64];
+
+  // Thread 0 fetches the point and the output row for the whole block.
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval = vals[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: scale to grid units, split into a clamped integer cell index
+  // and the fractional offset t inside that cell.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) for each axis.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // Four 1D weights per axis: row `thr` of Acuda dotted with the powers.
+  __shared__ float a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Threads 0..63 each build one of the 4x4x4 product weights,
+  // stored at abc[16*i + 4*j + k].
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+  
+  if (thr < 64)
+    abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+  if (off < N) {
+    // Weighted sum over the 4x4x4 coefficient neighbourhood of the cell.
+    float val = 0.0;
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+        float *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+        for (int k=0; k<4; k++) 
+          val += abc[16*i+4*j+k] * base[off+k*strides.z];
+      }
+    }
+    myval[off] = val;
+  }
+}
+
+
+
+// Kernel: evaluate N spline values at the point selected by blockIdx.y and
+// multiply each stored result by the per-point factor sign[ir].
+//
+//   pos     - positions, three floats (x,y,z) per point: pos[3*ir+0..2]
+//   sign    - one multiplier per point, applied to every stored value
+//   drInv   - per-axis scale applied to the position before floor()
+//   coefs   - coefficient array, addressed through `strides`
+//   vals    - vals[ir][off] receives sign[ir] * value
+//   dim     - per-axis bound; the cell index is clamped to [0, dim-1]
+//   strides - element strides of `coefs` along x, y and z
+//   N       - number of outputs per point; threads with off >= N store nothing
+//
+// blockIdx.x tiles the outputs in chunks of SPLINE_BLOCK_SIZE.
+// NOTE(review): abc[] is filled by threads 0..63, so this presumably needs
+// SPLINE_BLOCK_SIZE >= 64 -- confirm.
+__global__ static void
+eval_multi_UBspline_3d_s_sign_kernel 
+(float *pos, float *sign, float3 drInv, float *coefs, float *vals[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ float *myval;
+  __shared__ float abc[64];
+  __shared__ float mysign;
+
+  // Thread 0 loads the point, output row and sign for the whole block.
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval = vals[ir];
+    mysign = sign[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: scale to grid units, clamp the integer cell index to
+  // [0, dim-1] and keep the fractional part t = s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  // Unclamped alternative: index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  // Unclamped alternative: index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  // Unclamped alternative: index.z = (int)sf;
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) for each axis.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // Four 1D weights per axis: row `thr` of Acuda dotted with the powers.
+  __shared__ float a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Decode thr into (i,j,k) so that thr == 16*i + 4*j + k for thr < 64.
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+  
+  // One product weight per thread for the 4x4x4 neighbourhood.
+  if (thr < 64)
+    abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+  if (off < N) {
+    // Weighted sum over the 4x4x4 coefficients; the loop indices shadow
+    // the per-thread i/j/k above.
+    float val = 0.0;
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	float *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+	for (int k=0; k<4; k++) 
+	  val += abc[16*i+4*j+k] * base[off+k*strides.z];
+      }
+    }
+    myval[off] = mysign*val;
+  }
+}
+
+
+
+
+// Kernel: value, gradient and Hessian of N splines at the point selected
+// by blockIdx.y.
+//
+//   pos     - positions, three floats (x,y,z) per point: pos[3*ir+0..2]
+//   drInv   - per-axis scale applied to the position before floor(); also
+//             used to scale the first and second derivatives
+//   coefs   - coefficient array, addressed through `strides`
+//   vals    - vals[ir][off]            receives the value
+//   grads   - grads[ir][3*off + 0..2]  receives (d/dx, d/dy, d/dz)
+//   hess    - hess[ir][6*off + 0..5]   receives (xx, xy, xz, yy, yz, zz)
+//   dim     - per-axis bound; the cell index is clamped to [0, dim-1]
+//   strides - element strides of `coefs` along x, y and z
+//   N       - number of outputs per point
+//
+// blockIdx.x tiles the outputs in chunks of SPLINE_BLOCK_SIZE.
+//
+// The shared array abc[640] first holds the 10x64 product weights and is
+// then reused as a staging buffer for gradients (3 per thread) and
+// Hessians (6 per thread).
+// NOTE(review): that reuse needs 6*SPLINE_BLOCK_SIZE <= 640, and filling
+// all 64 weights presumably needs SPLINE_BLOCK_SIZE >= 64 -- confirm.
+__global__ static void
+eval_multi_UBspline_3d_s_vgh_kernel 
+(float *pos, float3 drInv,  float *coefs, 
+ float *vals[], float *grads[], float *hess[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 loads the point and the three output rows for the block.
+  __shared__ float *myval, *mygrad, *myhess;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: scale to grid units, clamp the integer cell index to
+  // [0, dim-1] and keep the fractional part t = s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) for each axis.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten banks of 64 product weights; i, j, k are masked to 0..3, so every
+  // thread writes inside 0..63 of each bank.
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  int n = 0;
+  float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < N) {
+    // n walks the 64 weights in the same (i,j,k) order they were stored.
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+        float *base = b0 + i*strides.x + j*strides.y;
+        for (int k=0; k<4; k++) {
+          float c  = base[k*strides.z];
+          v   += abc[n+0] * c;
+          g0  += abc[n+64] * c;
+          g1  += abc[n+128] * c;
+          g2  += abc[n+192] * c;
+          h00 += abc[n+256] * c;
+          h01 += abc[n+320] * c;
+          h02 += abc[n+384] * c;
+          h11 += abc[n+448] * c;
+          h12 += abc[n+512] * c;
+          h22 += abc[n+576] * c;
+          n += 1;
+        }
+      }
+    }
+    // Derivatives were taken with respect to t; rescale with drInv.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+
+    myval[off] = v;
+  }
+
+  // abc is reused below as the gradient/Hessian staging buffer.  Every
+  // thread must have finished reading the weights before any thread
+  // overwrites them, hence this barrier (placed outside the `off < N`
+  // branch so the whole block reaches it).
+  __syncthreads();
+
+  // Stage gradients as (g0,g1,g2) per thread, then write them out with
+  // consecutive threads hitting consecutive global addresses.
+  abc[3*thr+0] = g0; 
+  abc[3*thr+1] = g1; 
+  abc[3*thr+2] = g2; 
+  __syncthreads();
+  for (int i=0; i<3; i++) {
+    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 3*N)
+      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr]; 
+  }
+  __syncthreads();
+
+  // Write Hessians
+  abc[6*thr+0]  = h00;
+  abc[6*thr+1]  = h01;
+  abc[6*thr+2]  = h02;
+  abc[6*thr+3]  = h11;
+  abc[6*thr+4]  = h12;
+  abc[6*thr+5]  = h22;
+  __syncthreads();
+  for (int i=0; i<6; i++) {
+    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 6*N)
+      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
+  }
+}
+
+
+// Host entry point: evaluates spline->num_splines values at each of `num`
+// positions.  pos_d and vals_d are handed straight to the kernel.  The
+// call blocks until the kernel has finished and aborts the process if
+// CUDA reports an error.
+extern "C" void
+eval_multi_UBspline_3d_s_cuda (UBspline_3d_s_cuda *spline,
+                               float *pos_d, float *vals_d[], int num)
+{
+  // Grid: x covers the splines in chunks of SPLINE_BLOCK_SIZE (rounded
+  // up), y has one row per position.
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+  blocks.x += (spline->num_splines % SPLINE_BLOCK_SIZE) ? 1 : 0;
+
+  eval_multi_UBspline_3d_s_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+// Host entry point: evaluates spline->num_splines values at each of `num`
+// positions and scales the results of point ir by sign_d[ir].  pos_d,
+// sign_d and vals_d are handed straight to the kernel.  The call blocks
+// until the kernel has finished and aborts the process if CUDA reports an
+// error.
+extern "C" void
+eval_multi_UBspline_3d_s_sign_cuda (UBspline_3d_s_cuda *spline,
+                                    float *pos_d, float *sign_d, 
+                                    float *vals_d[], int num)
+{
+  // Grid: x covers the splines in chunks of SPLINE_BLOCK_SIZE (rounded
+  // up), y has one row per position.
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  if (spline->num_splines % SPLINE_BLOCK_SIZE)
+    dimGrid.x++;
+
+  eval_multi_UBspline_3d_s_sign_kernel<<<dimGrid,dimBlock>>>
+    (pos_d, sign_d, spline->gridInv, spline->coefs, 
+     vals_d, spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // Name this function in the report so failures can be told apart
+    // from those of eval_multi_UBspline_3d_s_cuda.
+    fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_sign_cuda:\n  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+}
+
+
+
+// Host entry point: values, gradients and Hessians of
+// spline->num_splines splines at each of `num` positions.  All pointer
+// arguments are handed straight to the kernel.  The call blocks until the
+// kernel has finished and aborts the process if CUDA reports an error.
+extern "C" void
+eval_multi_UBspline_3d_s_vgh_cuda (UBspline_3d_s_cuda *spline,
+                                   float *pos_d, float *vals_d[], float *grads_d[],
+                                   float *hess_d[], int num)
+{
+  // Grid: x covers the splines in chunks of SPLINE_BLOCK_SIZE (rounded
+  // up), y has one row per position.
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+  blocks.x += (spline->num_splines % SPLINE_BLOCK_SIZE) ? 1 : 0;
+
+  eval_multi_UBspline_3d_s_vgh_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, hess_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_vgh_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+
+// Kernel: value, transformed gradient and Laplacian of N splines at the
+// point selected by blockIdx.y.
+//
+//   pos        - positions, three floats (x,y,z) per point
+//   drInv      - per-axis scale applied to the position before floor();
+//                also used to scale the derivatives
+//   coefs      - coefficient array, addressed through `strides`
+//   Linv       - nine floats, read as a row-major 3x3 matrix G
+//   vals       - vals[ir][off] receives the value
+//   grad_lapl  - grad_lapl[ir][off + c*row_stride] receives, for c = 0..2,
+//                component c of G*(g0,g1,g2) and, for c = 3, the Laplacian
+//                computed as Trace(G*G^T * Hessian)
+//   dim        - per-axis bound; the cell index is clamped to [0, dim-1]
+//   strides    - element strides of `coefs` along x, y and z
+//   N          - number of outputs per point
+//   row_stride - distance in floats between the four output rows
+//
+// blockIdx.x tiles the outputs in chunks of SPLINE_BLOCK_SIZE.
+// NOTE(review): all 64 entries of each abc bank are only written if the
+// block has at least 64 threads -- confirm SPLINE_BLOCK_SIZE >= 64.
+__global__ static void
+eval_multi_UBspline_3d_s_vgl_kernel 
+(float *pos, float3 drInv,  float *coefs,  float Linv[],
+ float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
+ int N, int row_stride)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 loads the point and the output rows for the whole block.
+  __shared__ float *myval, *mygrad_lapl;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad_lapl = grad_lapl[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: scale to grid units, clamp the integer cell index to
+  // [0, dim-1] and keep the fractional part t = s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) for each axis.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten banks of 64 product weights; i, j, k are masked to 0..3, so every
+  // thread writes inside 0..63 of each bank.
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  int n = 0;
+  float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < N) {
+    // n walks the 64 weights in the same (i,j,k) order they were stored.
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	float *base = b0 + i*strides.x + j*strides.y;
+	for (int k=0; k<4; k++) {
+	  float c  = base[k*strides.z];
+	  v   += abc[n+  0] * c;
+	  g0  += abc[n+ 64] * c;
+	  g1  += abc[n+128] * c;
+	  g2  += abc[n+192] * c;
+	  h00 += abc[n+256] * c;
+	  h01 += abc[n+320] * c;
+	  h02 += abc[n+384] * c;
+	  h11 += abc[n+448] * c;
+	  h12 += abc[n+512] * c;
+	  h22 += abc[n+576] * c;
+	  n += 1;
+	}
+      }
+    }
+    // Derivatives were taken with respect to t; rescale with drInv.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+  
+    
+    myval[off] = v;
+  }
+
+  // Threads 0..8 load G from Linv (row-major) and then form G*G^T.
+  __shared__ float G[3][3], GGt[3][3];
+  int i0 = threadIdx.x/3;
+  int i1 = threadIdx.x - 3*i0;
+  if (threadIdx.x < 9) 
+    G[i0][i1] = Linv[threadIdx.x];
+  __syncthreads();
+  if (threadIdx.x < 9)   
+    GGt[i0][i1] = (G[i0][0]*G[i1][0] + 
+		   G[i0][1]*G[i1][1] + 
+		   G[i0][2]*G[i1][2]);
+  __syncthreads();
+  if (off < N) {
+    // Store gradients back to global memory
+    mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
+    mygrad_lapl[off+1*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
+    mygrad_lapl[off+2*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
+    
+    // Store laplacians back to global memory
+    // Hessian = H00 H01 H02 H11 H12 H22
+    // Matrix = [0 1 2]
+    //          [1 3 4]
+    //          [2 4 5]
+    // laplacian = Trace(GGt*Hessian)
+    mygrad_lapl[off+3*row_stride] = 
+      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
+       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
+       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
+  }
+}
+
+
+// Host entry point: values, transformed gradients and Laplacians of
+// spline->num_splines splines at each of `num` positions.  All pointer
+// arguments and row_stride are handed straight to the kernel.  The call
+// blocks until the kernel has finished and aborts the process if CUDA
+// reports an error.
+extern "C" void
+eval_multi_UBspline_3d_s_vgl_cuda 
+(UBspline_3d_s_cuda *spline, float *pos_d, float *Linv_d, 
+ float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
+{
+  // Grid: x covers the splines in chunks of SPLINE_BLOCK_SIZE (rounded
+  // up), y has one row per position.
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+  blocks.x += (spline->num_splines % SPLINE_BLOCK_SIZE) ? 1 : 0;
+
+  eval_multi_UBspline_3d_s_vgl_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
+     grad_lapl_d, spline->dim, spline->stride, spline->num_splines,
+     row_stride);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_vgl_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+
+
+// Kernel: value, transformed gradient and Laplacian of N splines at the
+// point selected by blockIdx.y, with every stored output multiplied by
+// the per-point factor sign[ir].
+//
+//   pos        - positions, three floats (x,y,z) per point
+//   sign       - one multiplier per point
+//   drInv      - per-axis scale applied to the position before floor();
+//                also used to scale the derivatives
+//   coefs      - coefficient array, addressed through `strides`
+//   Linv       - nine floats, read as a row-major 3x3 matrix G
+//   vals       - vals[ir][off] receives sign * value
+//   grad_lapl  - grad_lapl[ir][off + c*row_stride] receives, for c = 0..2,
+//                sign * component c of G*(g0,g1,g2) and, for c = 3,
+//                sign * Trace(G*G^T * Hessian)
+//   dim        - per-axis bound; the cell index is clamped to [0, dim-1]
+//   strides    - element strides of `coefs` along x, y and z
+//   N          - number of outputs per point
+//   row_stride - distance in floats between the four output rows
+//
+// blockIdx.x tiles the outputs in chunks of SPLINE_BLOCK_SIZE.
+// NOTE(review): all 64 entries of each abc bank are only written if the
+// block has at least 64 threads -- confirm SPLINE_BLOCK_SIZE >= 64.
+__global__ static void
+eval_multi_UBspline_3d_s_vgl_sign_kernel 
+(float *pos, float sign[], float3 drInv,  float *coefs, float Linv[],
+   float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
+   int N, int row_stride)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 loads the point, output rows and sign for the whole block.
+  __shared__ float *myval, *mygrad_lapl, mysign;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad_lapl = grad_lapl[ir];
+    mysign = sign[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: scale to grid units, clamp the integer cell index to
+  // [0, dim-1] and keep the fractional part t = s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) for each axis.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten banks of 64 product weights; i, j, k are masked to 0..3, so every
+  // thread writes inside 0..63 of each bank.
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  int n = 0;
+  float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < N) {
+    // n walks the 64 weights in the same (i,j,k) order they were stored.
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	float *base = b0 + i*strides.x + j*strides.y;
+	for (int k=0; k<4; k++) {
+	  float c  = base[k*strides.z];
+	  v   += abc[n+  0] * c;
+	  g0  += abc[n+ 64] * c;
+	  g1  += abc[n+128] * c;
+	  g2  += abc[n+192] * c;
+	  h00 += abc[n+256] * c;
+	  h01 += abc[n+320] * c;
+	  h02 += abc[n+384] * c;
+	  h11 += abc[n+448] * c;
+	  h12 += abc[n+512] * c;
+	  h22 += abc[n+576] * c;
+	  n += 1;
+	}
+      }
+    }
+    // Derivatives were taken with respect to t; rescale with drInv.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+  
+    
+    myval[off] = mysign * v;
+  }
+
+  // Threads 0..8 load G from Linv (row-major) and then form G*G^T.
+  __shared__ float G[3][3], GGt[3][3];
+  int i0 = threadIdx.x/3;
+  int i1 = threadIdx.x - 3*i0;
+  if (threadIdx.x < 9) 
+    G[i0][i1] = Linv[threadIdx.x];
+  __syncthreads();
+  if (threadIdx.x < 9)   
+    GGt[i0][i1] = (G[i0][0]*G[i1][0] + 
+		   G[i0][1]*G[i1][1] + 
+		   G[i0][2]*G[i1][2]);
+  __syncthreads();
+  if (off < N) {
+    // Store gradients back to global memory
+    mygrad_lapl[off+0*row_stride] = mysign*(G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2);
+    mygrad_lapl[off+1*row_stride] = mysign*(G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2);
+    mygrad_lapl[off+2*row_stride] = mysign*(G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2);
+    
+    // Store laplacians back to global memory
+    // Hessian = H00 H01 H02 H11 H12 H22
+    // Matrix = [0 1 2]
+    //          [1 3 4]
+    //          [2 4 5]
+    // laplacian = Trace(GGt*Hessian)
+    mygrad_lapl[off+3*row_stride] = mysign * 
+      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
+       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
+       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
+  }
+}
+
+
+// Host entry point: values, transformed gradients and Laplacians of
+// spline->num_splines splines at each of `num` positions, with the
+// outputs of point ir scaled by sign_d[ir].  All pointer arguments and
+// row_stride are handed straight to the kernel.  The call blocks until
+// the kernel has finished and aborts the process if CUDA reports an error.
+extern "C" void
+eval_multi_UBspline_3d_s_vgl_sign_cuda 
+(UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, float *Linv_d, 
+ float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
+{
+  // Grid: x covers the splines in chunks of SPLINE_BLOCK_SIZE (rounded
+  // up), y has one row per position.
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  if (spline->num_splines % SPLINE_BLOCK_SIZE)
+    dimGrid.x++;
+
+  eval_multi_UBspline_3d_s_vgl_sign_kernel<<<dimGrid,dimBlock>>>
+    (pos_d, sign_d, spline->gridInv, spline->coefs, Linv_d, vals_d, 
+     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
+
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // Name this function in the report so failures can be told apart
+    // from those of eval_multi_UBspline_3d_s_vgl_cuda.
+    fprintf (stderr, "CUDA error in eval_multi_UBspline_3d_s_vgl_sign_cuda:\n  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+}
+
+
+
+
+#endif
--- a/src/einspline/bspline_data.c
+++ b/src/einspline/bspline_data.c
@ -0,0 +1,207 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "config.h"
+
+/*****************/
+/*   SSE Data    */
+/*****************/
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif 
+
+#define _XOPEN_SOURCE 600
+
+#ifndef __USE_XOPEN2K
+  #define __USE_XOPEN2K
+#endif
+#include <stdlib.h>
+
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+
+// Single-precision version of matrices
+// Starts out null; init_sse_data() allocates and fills the table.
+__m128 *restrict A_s = (__m128 *)0;
+// There is a problem with alignment of global variables in shared
+// libraries on 32-bit machines.
+// Former statically allocated form, kept for reference:
+// __m128  A0, A1, A2, A3, dA0, dA1, dA2, dA3, d2A0, d2A1, d2A2, d2A3;
+#endif
+
+#ifdef HAVE_SSE2
+// Double-precision version of matrices
+// Starts out null; init_sse_data() allocates and fills the table.
+#include <emmintrin.h>
+__m128d *restrict A_d = (__m128d *)0;
+
+// There is a problem with alignment of global variables in shared
+// libraries on 32-bit machines.
+// Former statically allocated form, kept for reference:
+//__m128d A0_01, A0_23, A1_01, A1_23, A2_01, A2_23, A3_01, A3_23,
+//  dA0_01, dA0_23, dA1_01, dA1_23, dA2_01, dA2_23, dA3_01, dA3_23,
+//  d2A0_01, d2A0_23, d2A1_01, d2A1_23, d2A2_01, d2A2_23, d2A3_01, d2A3_23;
+#endif 
+
+// Allocates (once) and fills the 16-byte-aligned SSE coefficient tables.
+//
+//   A_s (HAVE_SSE)  : 12 __m128 entries  -- value, first and second
+//                     derivative rows, four floats each.
+//   A_d (HAVE_SSE2) : 32 __m128d entries -- each four-element row is split
+//                     into two entries: the even index holds elements 2,3
+//                     and the odd index holds elements 0,1 of the row.
+//                     Entries 24..31 hold the third-derivative rows.
+//
+// Each table is only built while its pointer is still null, so repeated
+// calls do nothing.  If posix_memalign() fails the process is aborted,
+// since the table would otherwise be written through an invalid pointer.
+void init_sse_data()
+{
+#ifdef HAVE_SSE
+  if (A_s == 0) {
+    if (posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12)) != 0)
+      abort();
+    A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
+    A_s[1]  = _mm_setr_ps ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0 );
+    A_s[2]  = _mm_setr_ps ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0 );
+    A_s[3]  = _mm_setr_ps ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0 );
+    A_s[4]  = _mm_setr_ps ( -0.5,  1.0, -0.5, 0.0  );
+    A_s[5]  = _mm_setr_ps (  0.0, -2.0,  1.5, 0.0  );
+    A_s[6]  = _mm_setr_ps (  0.5,  1.0, -1.5, 0.0  );
+    A_s[7]  = _mm_setr_ps (  0.0,  0.0,  0.5, 0.0  );
+    A_s[8]  = _mm_setr_ps (  1.0, -1.0,  0.0, 0.0  );
+    A_s[9]  = _mm_setr_ps ( -2.0,  3.0,  0.0, 0.0  );
+    A_s[10] = _mm_setr_ps (  1.0, -3.0,  0.0, 0.0  );
+    A_s[11] = _mm_setr_ps (  0.0,  1.0,  0.0, 0.0  );
+  }
+
+#endif
+#ifdef HAVE_SSE2
+  if (A_d == 0) {
+    if (posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*32)) != 0)
+      abort();
+    // Value rows.
+    A_d[ 0] = _mm_setr_pd (  3.0/6.0, -1.0/6.0 );
+    A_d[ 1] = _mm_setr_pd (  1.0/6.0, -3.0/6.0 );
+    A_d[ 2] = _mm_setr_pd ( -6.0/6.0,  3.0/6.0 );
+    A_d[ 3] = _mm_setr_pd (  4.0/6.0,  0.0/6.0 );
+    A_d[ 4] = _mm_setr_pd (  3.0/6.0, -3.0/6.0 );
+    A_d[ 5] = _mm_setr_pd (  1.0/6.0,  3.0/6.0 );
+    A_d[ 6] = _mm_setr_pd (  0.0/6.0,  1.0/6.0 );
+    A_d[ 7] = _mm_setr_pd (  0.0/6.0,  0.0/6.0 );
+    // First-derivative rows.
+    A_d[ 8] = _mm_setr_pd ( -0.5,  0.0 );
+    A_d[ 9] = _mm_setr_pd ( -0.5,  1.0 );
+    A_d[10] = _mm_setr_pd (  1.5,  0.0 );
+    A_d[11] = _mm_setr_pd (  0.0, -2.0 );
+    A_d[12] = _mm_setr_pd ( -1.5,  0.0 );
+    A_d[13] = _mm_setr_pd (  0.5,  1.0 );
+    A_d[14] = _mm_setr_pd (  0.5,  0.0 );
+    A_d[15] = _mm_setr_pd (  0.0,  0.0 );
+    // Second-derivative rows.
+    A_d[16] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[17] = _mm_setr_pd (  1.0, -1.0 );
+    A_d[18] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[19] = _mm_setr_pd ( -2.0,  3.0 );
+    A_d[20] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[21] = _mm_setr_pd (  1.0, -3.0 );
+    A_d[22] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[23] = _mm_setr_pd (  0.0,  1.0 );
+    // Third-derivative rows.  A_d[24] is the all-zero upper half of the
+    // first row; it must be written explicitly because posix_memalign()
+    // returns uninitialised memory.
+    A_d[24] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[25] = _mm_setr_pd ( -1.0,  0.0 );
+    A_d[26] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[27] = _mm_setr_pd (  3.0,  0.0 );
+    A_d[28] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[29] = _mm_setr_pd ( -3.0,  0.0 );
+    A_d[30] = _mm_setr_pd (  0.0,  0.0 );
+    A_d[31] = _mm_setr_pd (  1.0,  0.0 );
+  }
+#endif
+}
+
+
+#ifdef USE_ALTIVEC
+// AltiVec coefficient vectors, one per table row.  The active values use
+// the same element order as the scalar A44f / dA44f / d2A44f tables in
+// this file.
+vector float A0   = (vector float) ( -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0);
+vector float A1   = (vector float) (  3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0);
+vector float A2   = (vector float) ( -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0);
+vector float A3   = (vector float) (  1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0);
+/* Disabled alternative: the transpose of the active A0..A3. */
+/* vector float A0   = (vector float) ( -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0); */
+/* vector float A1   = (vector float) (  3.0/6.0, -6.0/6.0,  3.0/6.0, 0.0/6.0); */
+/* vector float A2   = (vector float) ( -3.0/6.0,  0.0/6.0,  3.0/6.0, 0.0/6.0); */
+/* vector float A3   = (vector float) (  1.0/6.0,  4.0/6.0,  1.0/6.0, 0.0/6.0); */
+/* Disabled alternative: each active row with its elements reversed. */
+/* vector float A0   = (vector float) ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0); */
+/* vector float A1   = (vector float) ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0); */
+/* vector float A2   = (vector float) ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0); */
+/* vector float A3   = (vector float) ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0); */
+vector float dA0  = (vector float) ( 0.0, -0.5,  1.0, -0.5 );
+vector float dA1  = (vector float) ( 0.0,  1.5, -2.0,  0.0 );
+vector float dA2  = (vector float) ( 0.0, -1.5,  1.0,  0.5 );
+vector float dA3  = (vector float) ( 0.0,  0.5,  0.0,  0.0 );
+vector float d2A0 = (vector float) ( 0.0,  0.0, -1.0,  1.0 );
+vector float d2A1 = (vector float) ( 0.0,  0.0,  3.0, -2.0 );
+vector float d2A2 = (vector float) ( 0.0,  0.0, -3.0,  1.0 );
+vector float d2A3 = (vector float) ( 0.0,  0.0,  1.0,  0.0 );
+#endif
+
+/*****************/
+/* Standard Data */
+/*****************/
+
+//////////////////////
+// Single precision //
+//////////////////////
+// Scalar single-precision coefficient tables, stored as flat 4x4 arrays
+// (four rows of four).  Reading each row of A44f as the coefficients of
+// (t^3, t^2, t, 1), the matching rows of dA44f, d2A44f and d3A44f are its
+// first, second and third derivatives with respect to t.
+// Af, dAf, d2Af and d3Af are pointers to the first element of each table.
+const float A44f[16] = 
+  { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0 };
+const float* restrict Af = A44f;
+
+// First derivative of the A44f rows.
+const float dA44f[16] =
+  {  0.0, -0.5,  1.0, -0.5,
+     0.0,  1.5, -2.0,  0.0,
+     0.0, -1.5,  1.0,  0.5,
+     0.0,  0.5,  0.0,  0.0 };
+const float* restrict dAf = dA44f;
+
+// Second derivative of the A44f rows.
+const float d2A44f[16] = 
+  {  0.0, 0.0, -1.0,  1.0,
+     0.0, 0.0,  3.0, -2.0,
+     0.0, 0.0, -3.0,  1.0,
+     0.0, 0.0,  1.0,  0.0 };
+const float* restrict d2Af = d2A44f;
+
+// Third derivative of the A44f rows (a constant per row).
+const float d3A44f[16] =
+  {  0.0, 0.0,  0.0, -1.0,
+     0.0, 0.0,  0.0,  3.0,
+     0.0, 0.0,  0.0, -3.0,
+     0.0, 0.0,  0.0,  1.0};
+const float* restrict d3Af = d3A44f;
+
+//////////////////////
+// Double precision //
+//////////////////////
+// Scalar double-precision coefficient tables, stored as flat 4x4 arrays
+// (four rows of four).  Reading each row of A44d as the coefficients of
+// (t^3, t^2, t, 1), the matching rows of dA44d, d2A44d and d3A44d are its
+// first, second and third derivatives with respect to t.
+// Ad, dAd, d2Ad and d3Ad are pointers to the first element of each table.
+const double A44d[16] = 
+  { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0 };
+const double* restrict Ad = A44d;
+
+// First derivative of the A44d rows.
+const double dA44d[16] =
+  {  0.0, -0.5,  1.0, -0.5,
+     0.0,  1.5, -2.0,  0.0,
+     0.0, -1.5,  1.0,  0.5,
+     0.0,  0.5,  0.0,  0.0 };
+const double* restrict dAd = dA44d;
+
+// Second derivative of the A44d rows.
+const double d2A44d[16] = 
+  {  0.0, 0.0, -1.0,  1.0,
+     0.0, 0.0,  3.0, -2.0,
+     0.0, 0.0, -3.0,  1.0,
+     0.0, 0.0,  1.0,  0.0 };
+const double* restrict d2Ad = d2A44d;
+
+// Third derivative of the A44d rows (a constant per row).
+const double d3A44d[16] =
+  {  0.0, 0.0,  0.0, -1.0,
+     0.0, 0.0,  0.0,  3.0,
+     0.0, 0.0,  0.0, -3.0,
+     0.0, 0.0,  0.0,  1.0};
+const double* restrict d3Ad = d3A44d;
--- a/src/einspline/bspline_eval_altivec_s.h
+++ b/src/einspline/bspline_eval_altivec_s.h
@ -0,0 +1,498 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+// Include guard named after this AltiVec header, so it cannot be mistaken
+// for (or collide with) the guard of the SSE evaluation header.
+#ifndef BSPLINE_EVAL_ALTIVEC_S_H
+#define BSPLINE_EVAL_ALTIVEC_S_H
+
+#include <stdio.h>
+#include <math.h>
+#include <ppc_intrinsics.h>
+
+// Vector-typed tables defined outside this header.  The 3D routine below
+// passes each group of four as the rows of a 4x4 matrix to _MM_MATVEC4_PS:
+// A* for values, dA* for first derivatives, d2A* for second derivatives.
+extern vector float  A0,   A1,   A2,   A3;
+extern vector float  dA0,  dA1,  dA2,  dA3;
+extern vector float d2A0, d2A1, d2A2, d2A3;
+
+// Flat 16-element scalar tables used by the 1D routines below, indexed as
+// [4*row + column] against the power vector (t^3, t^2, t, 1).
+extern const float* restrict   Af;
+extern const float* restrict  dAf;
+extern const float* restrict d2Af;
+
+/* Pack four scalars into one float vector: a goes to element 0, b to
+   element 1, c to element 2 and d to element 3.  The double arguments are
+   narrowed to float by the assignments. */
+inline vector float
+MakeVec (double a, double b, double c, double d)
+{
+  /* The union lets the elements be written one by one as plain floats. */
+  union
+  {
+    vector float packed;
+    float lane[vec_step(vector float)];
+  } u;
+
+  u.lane[0] = a;
+  u.lane[1] = b;
+  u.lane[2] = c;
+  u.lane[3] = d;
+
+  return u.packed;
+}
+
+/* Copy the four elements of an unsigned-int vector into four ints:
+   element 0 -> *i0, element 1 -> *i1, element 2 -> *i2, element 3 -> *i3.
+   The elements are read back through a union.
+
+   Declared static inline because this is a function definition inside a
+   header: with external linkage every translation unit that includes the
+   header would emit its own copy and the link would fail with duplicate
+   symbols. */
+static inline void
+GetVec (vector unsigned int i, int *i0, int *i1, int *i2, int *i3)
+{
+  union
+  {
+    /* Sized from the vector type actually stored in this union. */
+    unsigned int scalars[vec_step(vector unsigned int)];
+    vector unsigned int v;
+  } buffer;
+  buffer.v = i;
+  *i0 = buffer.scalars[0];
+  *i1 = buffer.scalars[1];
+  *i2 = buffer.scalars[2];
+  *i3 = buffer.scalars[3];
+}
+
+/* Permute masks used by _TRANSPOSE4, plus an all-zero vector used as the
+   addend of vec_madd.  Each mask entry is a byte index into the 32 bytes
+   formed by the two vec_perm source vectors (0-15 first, 16-31 second):
+     perm0: words 0 and 2 of each source, interleaved  -> a0 b0 a2 b2
+     perm1: words 1 and 3 of each source, interleaved  -> a1 b1 a3 b3
+     perm2: first two words of each source             -> a0 a1 b0 b1
+     perm3: last  two words of each source             -> a2 a3 b2 b3
+
+   These are definitions (they carry initializers) living in a header, so
+   they are given internal linkage; otherwise including the header from
+   more than one translation unit produces duplicate-symbol link errors. */
+static vector unsigned char perm0 = (vector unsigned char) 
+  ( 0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27 );
+static vector unsigned char perm1 = (vector unsigned char) 
+  ( 4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31 );
+static vector unsigned char perm2 = (vector unsigned char) 
+  ( 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 );
+static vector unsigned char perm3 = (vector unsigned char) 
+  ( 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 );
+static vector float zero = (vector float) (0.0, 0.0, 0.0, 0.0);
+
+/* Load a float vector starting at `target`, which is not required to be
+   vector-aligned.  Two vec_ld loads (byte offsets 0 and 15) fetch the
+   quadwords that can contain the data, and vec_perm stitches the wanted
+   bytes together using the mask produced by vec_lvsl. */
+inline
+vector float LoadUnaligned(float *target )
+{
+  vector float first  = vec_ld(0, target);    // most significant quadword
+  vector float second = vec_ld(15, target);   // least significant quadword
+  vector unsigned char shift = vec_lvsl(0, target);
+  return vec_perm(first, second, shift);
+}
+
+
+
+/// In-place transpose of the 4x4 float matrix whose rows are the four
+/// vector arguments.  Two rounds of vec_perm with the file-level masks
+/// perm0..perm3 do the shuffling: the first round interleaves pairs of
+/// rows, the second gathers the resulting halves into columns.
+/// Every argument is read and then assigned to, so pass plain vector
+/// variables.
+/// The expansion has no trailing semicolon: the ';' written at the call
+/// site terminates the do/while statement, which keeps the macro usable as
+/// the body of an unbraced if/else.
+#define _TRANSPOSE4(_v0, _v1, _v2, _v3)           \
+do {                                              \
+  vector float _t0 = vec_perm (_v0, _v1, perm0);  \
+  vector float _t1 = vec_perm (_v0, _v1, perm1);  \
+  vector float _t2 = vec_perm (_v2, _v3, perm0);  \
+  vector float _t3 = vec_perm (_v2, _v3, perm1);  \
+  _v0 = vec_perm (_t0, _t2, perm2);               \
+  _v1 = vec_perm (_t1, _t3, perm2);               \
+  _v2 = vec_perm (_t0, _t2, perm3);               \
+  _v3 = vec_perm (_t1, _t3, perm3);               \
+} while (0)
+
+/// 4x4 matrix times 4-vector: r[i] = sum_j Mi[j] * v[j].
+/// Each row is multiplied element-wise by v, the four products are
+/// transposed, and the transposed vectors are added so that element i of r
+/// holds the sum over row i.
+/// The temporaries are named r0..r3, so do not pass variables with those
+/// names as arguments.
+/// No trailing semicolon in the expansion; the call site supplies it.
+#define _MM_MATVEC4_PS(M0, M1, M2, M3, v, r)                        \
+do {                                                                \
+  vector float r0 = vec_madd (M0, v, zero);                         \
+  vector float r1 = vec_madd (M1, v, zero);                         \
+  vector float r2 = vec_madd (M2, v, zero);                         \
+  vector float r3 = vec_madd (M3, v, zero);                         \
+  _TRANSPOSE4 (r0, r1, r2, r3);                                     \
+  r = vec_add (vec_add(r0, r1), vec_add(r2, r3));                   \
+} while (0)
+/// Dot product of the float vectors A and B, stored to the float at p.
+/// The element-wise product is folded in two merge/add steps until every
+/// element holds the full sum; element 0 is then splatted and written with
+/// vec_ste.
+/// No trailing semicolon in the expansion; the call site supplies it.
+#define _MM_DOT4_PS(A, B, p)                                        \
+do {                                                                \
+  vector float _t    = vec_madd (A, B, zero);                       \
+  vector float _alo  = vec_mergel (_t, _t);                         \
+  vector float _ahi  = vec_mergeh (_t, _t);                         \
+  vector float _a    = vec_add (_alo, _ahi);                        \
+  vector float _rlo  = vec_mergel (_a, _a);                         \
+  vector float _rhi  = vec_mergeh (_a, _a);                         \
+  vector float _r    = vec_add (_rlo, _rhi);                        \
+  vector float _r2   = vec_splat (_r, 0);                           \
+  vec_ste (_r2, 0, (p));                                            \
+} while (0)
+
+/// Four dot products at once: element k of result is dot(_uk, _vk).
+/// The four element-wise products are transposed and summed, the same
+/// scheme _MM_MATVEC4_PS uses.
+/// No trailing semicolon in the expansion; the call site supplies it.
+#define _4DOTS(_u0, _v0, _u1, _v1, _u2, _v2, _u3, _v3, result)      \
+do {                                                                \
+  vector float _w0   = vec_madd (_u0, _v0, zero);                   \
+  vector float _w1   = vec_madd (_u1, _v1, zero);                   \
+  vector float _w2   = vec_madd (_u2, _v2, zero);                   \
+  vector float _w3   = vec_madd (_u3, _v3, zero);                   \
+  _TRANSPOSE4 (_w0, _w1, _w2, _w3);                                 \
+  result = vec_add (vec_add(_w0, _w1), vec_add(_w2, _w3));          \
+} while (0)
+
+
+
+/************************************************************/
+/* 1D single-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only: evaluate the 1D spline at x and store the result in *val. */
+inline void
+eval_UBspline_1d_s (UBspline_1d_s * restrict spline,
+                    double x, float* restrict val)
+{
+  // Convert x to grid units, then split into cell index and in-cell offset.
+  x -= spline->x_grid.start;
+  float u = x*spline->x_grid.delta_inv;
+  float cell, t;
+  t = modff (u, &cell);
+  int i = (int) cell;
+
+  // The four coefficients that contribute to this evaluation.
+  float* restrict c = spline->coefs + i;
+
+  // Powers of the in-cell offset.
+  float t3 = t*t*t, t2 = t*t, t1 = t, t0 = 1.0;
+
+  // Basis weights: row k of the 4x4 table Af against (t^3, t^2, t, 1).
+  float w0 = Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0;
+  float w1 = Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0;
+  float w2 = Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0;
+  float w3 = Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0;
+
+  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
+}
+
+/* Value and first derivative: *val receives the spline value at x and
+   *grad its derivative with respect to x. */
+inline void
+eval_UBspline_1d_s_vg (UBspline_1d_s * restrict spline, double x,
+                       float* restrict val, float* restrict grad)
+{
+  // Convert x to grid units, then split into cell index and in-cell offset.
+  x -= spline->x_grid.start;
+  float u = x*spline->x_grid.delta_inv;
+  float cell, t;
+  t = modff (u, &cell);
+  int i = (int) cell;
+
+  float* restrict c = spline->coefs + i;
+  float t3 = t*t*t, t2 = t*t, t1 = t, t0 = 1.0;
+
+  // Value weights: rows of Af against (t^3, t^2, t, 1).
+  float w0 = Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0;
+  float w1 = Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0;
+  float w2 = Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0;
+  float w3 = Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0;
+
+  // Derivative weights: rows of dAf; the t^3 column is not used.
+  float dw0 = dAf[ 1]*t2 + dAf[ 2]*t1 + dAf[ 3]*t0;
+  float dw1 = dAf[ 5]*t2 + dAf[ 6]*t1 + dAf[ 7]*t0;
+  float dw2 = dAf[ 9]*t2 + dAf[10]*t1 + dAf[11]*t0;
+  float dw3 = dAf[13]*t2 + dAf[14]*t1 + dAf[15]*t0;
+
+  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
+  // The factor delta_inv converts d/dt into d/dx.
+  *grad = spline->x_grid.delta_inv *
+    (c[0]*dw0 + c[1]*dw1 + c[2]*dw2 + c[3]*dw3);
+}
+/* Value, first derivative, and second derivative: *val, *grad and *lapl
+   receive the spline value at x and its first and second derivatives with
+   respect to x. */
+inline void
+eval_UBspline_1d_s_vgl (UBspline_1d_s * restrict spline, double x,
+                        float* restrict val, float* restrict grad,
+                        float* restrict lapl)
+{
+  // Convert x to grid units, then split into cell index and in-cell offset.
+  x -= spline->x_grid.start;
+  float u = x*spline->x_grid.delta_inv;
+  float cell, t;
+  t = modff (u, &cell);
+  int i = (int) cell;
+
+  float* restrict c = spline->coefs + i;
+  float t3 = t*t*t, t2 = t*t, t1 = t, t0 = 1.0;
+
+  // Value weights: rows of Af against (t^3, t^2, t, 1).
+  float w0 = Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0;
+  float w1 = Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0;
+  float w2 = Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0;
+  float w3 = Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0;
+
+  // First-derivative weights: rows of dAf; the t^3 column is not used.
+  float dw0 = dAf[ 1]*t2 + dAf[ 2]*t1 + dAf[ 3]*t0;
+  float dw1 = dAf[ 5]*t2 + dAf[ 6]*t1 + dAf[ 7]*t0;
+  float dw2 = dAf[ 9]*t2 + dAf[10]*t1 + dAf[11]*t0;
+  float dw3 = dAf[13]*t2 + dAf[14]*t1 + dAf[15]*t0;
+
+  // Second-derivative weights: rows of d2Af; only the t and 1 columns.
+  float d2w0 = d2Af[ 2]*t1 + d2Af[ 3]*t0;
+  float d2w1 = d2Af[ 6]*t1 + d2Af[ 7]*t0;
+  float d2w2 = d2Af[10]*t1 + d2Af[11]*t0;
+  float d2w3 = d2Af[14]*t1 + d2Af[15]*t0;
+
+  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
+  // One factor of delta_inv per derivative converts from t to x.
+  *grad = spline->x_grid.delta_inv *
+    (c[0]*dw0 + c[1]*dw1 + c[2]*dw2 + c[3]*dw3);
+  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (c[0]*d2w0 + c[1]*d2w1 + c[2]*d2w2 + c[3]*d2w3);
+}
+
+/************************************************************/
+/* 2D single-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only.
+ * NOTE(review): the body is empty in this AltiVec header, so *val is never
+ * written and a caller is left with whatever the output location already
+ * held.  Presumably a placeholder for an implementation that was never
+ * written -- confirm before relying on it. */
+inline void
+eval_UBspline_2d_s (UBspline_2d_s * restrict spline, 
+		    double x, double y, float* restrict val)
+{
+}
+
+
+/* Value and gradient.
+ * NOTE(review): the body is empty in this AltiVec header, so neither *val
+ * nor grad is written.  Presumably a placeholder for an implementation
+ * that was never written -- confirm before relying on it. */
+inline void
+eval_UBspline_2d_s_vg (UBspline_2d_s * restrict spline, 
+		       double x, double y, 
+		       float* restrict val, float* restrict grad)
+{
+}
+
+/* Value, gradient, and laplacian.
+ * NOTE(review): the body is empty in this AltiVec header, so none of *val,
+ * grad or *lapl is written.  Presumably a placeholder for an
+ * implementation that was never written -- confirm before relying on it. */
+inline void
+eval_UBspline_2d_s_vgl (UBspline_2d_s * restrict spline, 
+			double x, double y, float* restrict val, 
+			float* restrict grad, float* restrict lapl)
+{
+}
+
+/* Value, gradient, and Hessian.
+ * NOTE(review): the body is empty in this AltiVec header, so none of *val,
+ * grad or hess is written.  Presumably a placeholder for an implementation
+ * that was never written -- confirm before relying on it. */
+inline void
+eval_UBspline_2d_s_vgh (UBspline_2d_s * restrict spline, 
+			double x, double y, float* restrict val, 
+			float* restrict grad, float* restrict hess)
+{
+
+}
+
+
+
+/************************************************************/
+/* 3D single-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only.
+ * NOTE(review): the body is empty in this AltiVec header, so *val is never
+ * written.  Only eval_UBspline_3d_s_vgh below has an implementation;
+ * presumably this is a placeholder -- confirm before relying on it. */
+inline void
+eval_UBspline_3d_s (UBspline_3d_s * restrict spline, 
+		    double x, double y, double z,
+		    float* restrict val)
+{
+}
+
+/* Value and gradient.
+ * NOTE(review): the body is empty in this AltiVec header, so neither *val
+ * nor grad is written.  Only eval_UBspline_3d_s_vgh below has an
+ * implementation; presumably this is a placeholder -- confirm before
+ * relying on it. */
+inline void
+eval_UBspline_3d_s_vg (UBspline_3d_s * restrict spline, 
+			double x, double y, double z,
+			float* restrict val, float* restrict grad)
+{
+}
+
+
+
+/* Value, gradient, and laplacian.
+ * NOTE(review): the body is empty in this AltiVec header, so none of *val,
+ * grad or *lapl is written.  Only eval_UBspline_3d_s_vgh below has an
+ * implementation; presumably this is a placeholder -- confirm before
+ * relying on it. */
+inline void
+eval_UBspline_3d_s_vgl (UBspline_3d_s * restrict spline, 
+			double x, double y, double z,
+			float* restrict val, float* restrict grad, float* restrict lapl)
+{
+
+}
+
+
+/* Value, gradient, and Hessian of the 3D spline at (x, y, z).
+ *
+ * Outputs:
+ *   *val     spline value
+ *   grad[3]  first derivatives in x, y, z
+ *   hess[9]  second derivatives as a flat 3x3 array:
+ *            [0]=xx [1]=xy [2]=xz [4]=yy [5]=yz [8]=zz, with [3], [6], [7]
+ *            filled in as copies of [1], [2], [5].
+ *
+ * The 4x4x4 block of coefficients around the evaluation point is contracted
+ * first along z, then along y, then along x, once for each combination of
+ * value / first-derivative / second-derivative weights that is needed. */
+inline void
+eval_UBspline_3d_s_vgh (UBspline_3d_s * restrict spline, 
+			double x, double y, double z,
+			float* restrict val, float* restrict grad, 
+			float* restrict hess)
+{
+  // Data-stream prefetch hint starting at the A0 table.
+  // NOTE(review): the control-word encoding used here was not verified.
+  vec_dst (&A0, (12<<3) | (1<<8), 0);
+  /// Mesh point determination, done for x, y and z at once.
+  /// Element 0 carries x, element 1 carries y, element 2 carries z.
+  vector float xyz       = MakeVec (x, y, z, 0.0);
+  vector float x0y0z0    = MakeVec ( spline->x_grid.start,  spline->y_grid.start, 
+				     spline->z_grid.start, 0.0);
+  vector float delta_inv = MakeVec( spline->x_grid.delta_inv,
+				    spline->y_grid.delta_inv, 
+					    spline->z_grid.delta_inv, 0.0 );
+  xyz = vec_sub (xyz, x0y0z0);
+  // ux = (x - x0)/delta_x and same for y and z
+  vector float uxuyuz  = vec_madd (xyz, delta_inv, zero);
+  // intpart = floor (ux, uy, uz)
+  vector float intpart  = vec_floor (uxuyuz);
+  vector unsigned int ixiyiz     = vec_ctu   (intpart, 0);
+  // Copy the integer cell indices out to scalars for use in C expressions;
+  // the fourth element is not needed and lands in dummy.
+  int ix, iy, iz, dummy;
+  GetVec (ixiyiz, &ix, &iy, &iz, &dummy);
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+
+  // This macro is used to give the pointer to coefficient data.
+  // i and j should be in the range [0,3].  Coefficients are read four
+  // at a time, so no k value is needed.
+#define P(i,j) ((float*)spline->coefs+(ix+(i))*xs+(iy+(j))*ys+(iz))
+  // Prefetch the data from main memory into cache so it's available
+  // when we need to use it.
+  // NOTE(review): control_word and ptr are only referenced by the
+  // commented-out vec_dstt calls below; they are otherwise unused.
+  int control_word;
+  control_word = (2<<3) | (4<<8) | ((4*ys) << 16);
+  void *ptr = P(0,0);
+  // Touch every (i,j) row of the 4x4 block at byte offsets 0 and 12.
+  __dcbt (P(0,0), 0);  __dcbt (P(0,1), 0); __dcbt (P(0,2), 0); __dcbt (P(0,3), 0);  
+  __dcbt (P(0,0), 12); __dcbt (P(0,1),12); __dcbt (P(0,2),12); __dcbt (P(0,3),12);  
+  __dcbt (P(1,0), 0);  __dcbt (P(1,1), 0); __dcbt (P(1,2), 0); __dcbt (P(1,3), 0);  
+  __dcbt (P(1,0), 12); __dcbt (P(1,1),12); __dcbt (P(1,2),12); __dcbt (P(1,3),12);  
+  __dcbt (P(2,0), 0);  __dcbt (P(2,1), 0); __dcbt (P(2,2), 0); __dcbt (P(2,3), 0);  
+  __dcbt (P(2,0), 12); __dcbt (P(2,1),12); __dcbt (P(2,2),12); __dcbt (P(2,3),12);  
+  __dcbt (P(3,0), 0);  __dcbt (P(3,1), 0); __dcbt (P(3,2), 0); __dcbt (P(3,3), 0);  
+  __dcbt (P(3,0), 12); __dcbt (P(3,1),12); __dcbt (P(3,2),12); __dcbt (P(3,3),12);  
+  // Alternative prefetch using data streams, currently disabled:
+//   vec_dstt (P(0,0), control_word, 0);
+//   vec_dstt (P(1,0), control_word, 1);
+//   vec_dstt (P(2,0), control_word, 2);
+//   vec_dstt (P(3,0), control_word, 3);
+
+  // Now compute the vectors:
+  // tpx = [t_x^3 t_x^2 t_x 1]
+  // tpy = [t_y^3 t_y^2 t_y 1]
+  // tpz = [t_z^3 t_z^2 t_z 1]
+  // t3, t2, txtytz and one hold the cubes, squares, first powers and ones
+  // for all three axes; transposing those four rows yields one power
+  // vector per axis.  The fourth element of `one` is 0, so the fourth
+  // transposed vector (z2) is all zeros and is not used afterwards.
+  vector float txtytz = vec_sub (uxuyuz, intpart);
+  vector float one    = (vector float) ( 1.0, 1.0, 1.0, 0.0);
+  vector float t2     = vec_madd (txtytz, txtytz, zero);
+  vector float t3     = vec_madd (t2, txtytz, zero);
+  vector float tpx    = t3;
+  vector float tpy    = t2;
+  vector float tpz    = txtytz;
+  vector float z2     = one;
+  _TRANSPOSE4(tpx, tpy, tpz, z2);
+  // a  =  A * tpx,   b =  A * tpy,   c =  A * tpz
+  // da = dA * tpx,  db = dA * tpy,  dc = dA * tpz, etc.
+  // A is 4x4 matrix given by the rows A0, A1, A2, A3
+  // NOTE(review): tmp4..tmp7 are declared but never used in this function.
+  vector float a, b, c, da, db, dc, d2a, d2b, d2c,
+    cP[4], dcP[4], d2cP[4], bcP, dbcP, bdcP, d2bcP, dbdcP, bd2cP,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    // x-dependent vectors
+  _MM_MATVEC4_PS (  A0,   A1,   A2,   A3, tpx,   a);
+  _MM_MATVEC4_PS ( dA0,  dA1,  dA2,  dA3, tpx,  da);
+  _MM_MATVEC4_PS (d2A0, d2A1, d2A2, d2A3, tpx, d2a);
+  // y-dependent vectors
+  _MM_MATVEC4_PS (  A0,   A1,   A2,   A3, tpy,   b);
+  _MM_MATVEC4_PS ( dA0,  dA1,  dA2,  dA3, tpy,  db);
+  _MM_MATVEC4_PS (d2A0, d2A1, d2A2, d2A3, tpy, d2b);
+  // z-dependent vectors
+  _MM_MATVEC4_PS (  A0,   A1,   A2,   A3, tpz,   c);
+  _MM_MATVEC4_PS ( dA0,  dA1,  dA2,  dA3, tpz,  dc);
+  _MM_MATVEC4_PS (d2A0, d2A1, d2A2, d2A3, tpz, d2c);
+
+  // Compute cP, dcP, and d2cP products 1/4 at a time to maximize
+  // register reuse and avoid rereading from memory or cache.
+  // For each x offset i, tmp0..tmp3 hold the four z-contiguous coefficient
+  // rows at y offsets 0..3; multiplying by c, dc, d2c contracts along z.
+  // 1st quarter
+  tmp0 = LoadUnaligned (P(0,0));
+  tmp1 = LoadUnaligned (P(0,1));
+  tmp2 = LoadUnaligned (P(0,2));
+  tmp3 = LoadUnaligned (P(0,3));
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,   c,   cP[0]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,  dc,  dcP[0]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[0]);
+  // 2nd quarter
+  tmp0 = LoadUnaligned (P(1,0));
+  tmp1 = LoadUnaligned (P(1,1));
+  tmp2 = LoadUnaligned (P(1,2));
+  tmp3 = LoadUnaligned (P(1,3));
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,   c,   cP[1]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,  dc,  dcP[1]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[1]);
+  // 3rd quarter
+  tmp0 = LoadUnaligned (P(2,0));
+  tmp1 = LoadUnaligned (P(2,1));
+  tmp2 = LoadUnaligned (P(2,2));
+  tmp3 = LoadUnaligned (P(2,3));
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,   c,   cP[2]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,  dc,  dcP[2]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[2]);
+  // 4th quarter
+  tmp0 = LoadUnaligned (P(3,0));
+  tmp1 = LoadUnaligned (P(3,1));
+  tmp2 = LoadUnaligned (P(3,2));
+  tmp3 = LoadUnaligned (P(3,3));
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,   c,   cP[3]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3,  dc,  dcP[3]);
+  _MM_MATVEC4_PS (tmp0, tmp1, tmp2, tmp3, d2c, d2cP[3]);
+  
+  // Now compute bcP, dbcP, bdcP, d2bcP, bd2cP, and dbdc products
+  // (contraction along y; the results are indexed by the x offset).
+  _MM_MATVEC4_PS (  cP[0],   cP[1],   cP[2],   cP[3],   b,   bcP);
+  _MM_MATVEC4_PS (  cP[0],   cP[1],   cP[2],   cP[3],  db,  dbcP);
+  _MM_MATVEC4_PS ( dcP[0],  dcP[1],  dcP[2],  dcP[3],   b,  bdcP);
+  _MM_MATVEC4_PS (  cP[0],   cP[1],   cP[2],   cP[3], d2b, d2bcP);
+  _MM_MATVEC4_PS (d2cP[0], d2cP[1], d2cP[2], d2cP[3],   b, bd2cP);
+  _MM_MATVEC4_PS ( dcP[0],  dcP[1],  dcP[2],  dcP[3],  db, dbdcP);
+
+  // Final contraction along x, four dot products per _4DOTS call.  Each
+  // result element is splatted so that vec_ste writes the intended value
+  // to the scalar destination.
+  vector float valgrad, hess4;
+  // valgrad = [ value, d/dx, d/dy, d/dz ]
+  _4DOTS (a, bcP, da, bcP, a, dbcP, a, bdcP, valgrad);
+  tmp0 = vec_splat (valgrad, 0);  vec_ste (tmp0, 0, val);
+  tmp0 = vec_splat (valgrad, 1);  vec_ste (tmp0, 0, &(grad[0]));
+  tmp0 = vec_splat (valgrad, 2);  vec_ste (tmp0, 0, &(grad[1]));
+  tmp0 = vec_splat (valgrad, 3);  vec_ste (tmp0, 0, &(grad[2]));
+  // hess4 = [ xx, yy, zz, xy ]
+  _4DOTS (d2a, bcP, a, d2bcP, a, bd2cP, da, dbcP, hess4);
+  tmp0 = vec_splat (hess4, 0);  vec_ste (tmp0, 0, &(hess[0]));
+  tmp0 = vec_splat (hess4, 1);  vec_ste (tmp0, 0, &(hess[4]));
+  tmp0 = vec_splat (hess4, 2);  vec_ste (tmp0, 0, &(hess[8]));
+  tmp0 = vec_splat (hess4, 3);  vec_ste (tmp0, 0, &(hess[1]));
+  // hess4 = [ xz, yz, unused, unused ]; the last two (a, a) pairs only fill
+  // out the macro's argument list.
+  _4DOTS (da, bdcP, a, dbdcP, a, a, a, a, hess4);
+  tmp0 = vec_splat (hess4, 0);  vec_ste (tmp0, 0, &(hess[2]));
+  tmp0 = vec_splat (hess4, 1);  vec_ste (tmp0, 0, &(hess[5]));
+  // Equivalent formulation with one dot product per output, disabled:
+//   _MM_DOT4_PS (a, bcP, val);
+//     // Compute gradient
+//   _MM_DOT4_PS (da, bcP, &(grad[0]));
+//   _MM_DOT4_PS (a, dbcP, &(grad[1]));
+//   _MM_DOT4_PS (a, bdcP, &(grad[2]));
+//   // Compute hessian
+//   _MM_DOT4_PS (d2a, bcP, &(hess[0]));
+//   _MM_DOT4_PS (a, d2bcP, &(hess[4]));
+//   _MM_DOT4_PS (a, bd2cP, &(hess[8]));
+//   _MM_DOT4_PS (da, dbcP, &(hess[1]));
+//   _MM_DOT4_PS (da, bdcP, &(hess[2]));
+//   _MM_DOT4_PS (a, dbdcP, &(hess[5]));
+
+  // Multiply gradients and hessians by appropriate grid inverses
+  float dxInv = spline->x_grid.delta_inv;
+  float dyInv = spline->y_grid.delta_inv;
+  float dzInv = spline->z_grid.delta_inv;
+  grad[0] *= dxInv;
+  grad[1] *= dyInv;
+  grad[2] *= dzInv;
+  hess[0] *= dxInv*dxInv;
+  hess[4] *= dyInv*dyInv;
+  hess[8] *= dzInv*dzInv;
+  hess[1] *= dxInv*dyInv;
+  hess[2] *= dxInv*dzInv;
+  hess[5] *= dyInv*dzInv;
+  // Copy hessian elements into lower half of 3x3 matrix
+  hess[3] = hess[1];
+  hess[6] = hess[2];
+  hess[7] = hess[5];
+#undef P
+}
+
+// Drop every helper macro defined by this header so that none of them leaks
+// into the code that includes it.
+#undef _TRANSPOSE4
+#undef _MM_MATVEC4_PS
+#undef _MM_DOT4_PS
+#undef _4DOTS
+
+#endif
--- a/src/einspline/bspline_eval_cuda.h
+++ b/src/einspline/bspline_eval_cuda.h
@ -0,0 +1,32 @@
+#ifndef BSPLINE_EVAL_CUDA_H
+#define BSPLINE_EVAL_CUDA_H
+
+#include "bspline_structs_cuda.h"
+
+// Entry points for evaluating single-precision 3D splines through the CUDA
+// implementation.  All of them are declared with C linkage.
+// NOTE(review): the _d suffix on pointer parameters presumably marks device
+// memory -- confirm against the definitions.
+extern "C" {
+
+// Values only.
+void
+eval_multi_UBspline_3d_s_cuda (UBspline_3d_s_cuda *spline,
+                               float *pos_d, float *vals_d[], int num);
+
+// Values only, with an additional sign_d input.
+void
+eval_multi_UBspline_3d_s_sign_cuda (UBspline_3d_s_cuda *spline,
+                                    float *pos_d, float *sign_d,
+                                    float *vals_d[], int num);
+
+// Values, gradients and Hessians.
+void
+eval_multi_UBspline_3d_s_vgh_cuda (UBspline_3d_s_cuda *spline,
+                                   float *pos_d, float *vals_d[],
+                                   float *grads_d[], float *hess_d[],
+                                   int num);
+
+// Values plus a combined gradient/laplacian output.
+void
+eval_multi_UBspline_3d_s_vgl_cuda (UBspline_3d_s_cuda *spline,
+                                   float *pos_d, float *Linv_d,
+                                   float *vals_d[], float *grad_lapl_d[],
+                                   int num, int row_stride);
+
+// Same as above, with an additional sign_d input.
+void
+eval_multi_UBspline_3d_s_vgl_sign_cuda (UBspline_3d_s_cuda *spline,
+                                        float *pos_d, float *sign_d,
+                                        float *Linv_d,
+                                        float *vals_d[], float *grad_lapl_d[],
+                                        int num, int row_stride);
+
+}
+
+
+
+#endif
--- a/src/einspline/bspline_eval_sse_c.h
+++ b/src/einspline/bspline_eval_sse_c.h
--- a/src/einspline/bspline_eval_sse_d.h
+++ b/src/einspline/bspline_eval_sse_d.h
--- a/src/einspline/bspline_eval_sse_s.h
+++ b/src/einspline/bspline_eval_sse_s.h
--- a/src/einspline/bspline_eval_sse_z.h
+++ b/src/einspline/bspline_eval_sse_z.h
--- a/src/einspline/bspline_eval_std_c.h
+++ b/src/einspline/bspline_eval_std_c.h
@ -0,0 +1,950 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_EVAL_STD_C_H
+#define BSPLINE_EVAL_STD_C_H
+
+#include <math.h>
+#include <stdio.h>
+
+// Flat 16-element basis tables defined outside this header, indexed as
+// [4*row + column] against the power vector (t^3, t^2, t, 1):
+// Af for values, dAf for first derivatives, d2Af for second derivatives.
+extern const float* restrict   Af;
+extern const float* restrict  dAf;
+extern const float* restrict d2Af;
+
+/************************************************************/
+/* 1D single-precision, complex evaluation functions        */
+/************************************************************/
+
+/* Value only: evaluate the complex-valued 1D spline at x into *val. */
+inline void
+eval_UBspline_1d_c (UBspline_1d_c * restrict spline,
+                    double x, complex_float* restrict val)
+{
+  // Convert x to grid units, then split into cell index and in-cell offset.
+  x -= spline->x_grid.start;
+  float u = x*spline->x_grid.delta_inv;
+  float cell, t;
+  t = modff (u, &cell);
+  int i = (int) cell;
+
+  // The four complex coefficients that contribute to this evaluation.
+  complex_float* restrict c = spline->coefs + i;
+
+  // Powers of the in-cell offset.
+  float t3 = t*t*t, t2 = t*t, t1 = t, t0 = 1.0;
+
+  // Real basis weights: row k of Af against (t^3, t^2, t, 1).
+  float w0 = Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0;
+  float w1 = Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0;
+  float w2 = Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0;
+  float w3 = Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0;
+
+  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
+}
+
+/* Value and first derivative of the complex-valued 1D spline at x. */
+inline void
+eval_UBspline_1d_c_vg (UBspline_1d_c * restrict spline, double x,
+                       complex_float* restrict val,
+                       complex_float* restrict grad)
+{
+  // Convert x to grid units, then split into cell index and in-cell offset.
+  x -= spline->x_grid.start;
+  float u = x*spline->x_grid.delta_inv;
+  float cell, t;
+  t = modff (u, &cell);
+  int i = (int) cell;
+
+  complex_float* restrict c = spline->coefs + i;
+  float t3 = t*t*t, t2 = t*t, t1 = t, t0 = 1.0;
+
+  // One factor of the inverse grid spacing converts d/dt into d/dx.
+  float dxInv = spline->x_grid.delta_inv;
+
+  // Value weights: rows of Af against (t^3, t^2, t, 1).
+  float w0 = Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0;
+  float w1 = Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0;
+  float w2 = Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0;
+  float w3 = Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0;
+
+  // Derivative weights: rows of dAf; the t^3 column is not used.
+  float dw0 = dAf[ 1]*t2 + dAf[ 2]*t1 + dAf[ 3]*t0;
+  float dw1 = dAf[ 5]*t2 + dAf[ 6]*t1 + dAf[ 7]*t0;
+  float dw2 = dAf[ 9]*t2 + dAf[10]*t1 + dAf[11]*t0;
+  float dw3 = dAf[13]*t2 + dAf[14]*t1 + dAf[15]*t0;
+
+  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
+  *grad = dxInv *
+    (c[0]*dw0 + c[1]*dw1 + c[2]*dw2 + c[3]*dw3);
+}
+/* Value, first derivative, and second derivative of the complex-valued 1D
+   spline at x, written to *val, *grad and *lapl respectively. */
+inline void
+eval_UBspline_1d_c_vgl (UBspline_1d_c * restrict spline, double x,
+                        complex_float* restrict val,
+                        complex_float* restrict grad,
+                        complex_float* restrict lapl)
+{
+  // Convert x to grid units, then split into cell index and in-cell offset.
+  x -= spline->x_grid.start;
+  float u = x*spline->x_grid.delta_inv;
+  float cell, t;
+  t = modff (u, &cell);
+  int i = (int) cell;
+
+  complex_float* restrict c = spline->coefs + i;
+  float t3 = t*t*t, t2 = t*t, t1 = t, t0 = 1.0;
+
+  // One factor of the inverse grid spacing per derivative order.
+  float dxInv = spline->x_grid.delta_inv;
+
+  // Value weights: rows of Af against (t^3, t^2, t, 1).
+  float w0 = Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0;
+  float w1 = Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0;
+  float w2 = Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0;
+  float w3 = Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0;
+
+  // First-derivative weights: rows of dAf; the t^3 column is not used.
+  float dw0 = dAf[ 1]*t2 + dAf[ 2]*t1 + dAf[ 3]*t0;
+  float dw1 = dAf[ 5]*t2 + dAf[ 6]*t1 + dAf[ 7]*t0;
+  float dw2 = dAf[ 9]*t2 + dAf[10]*t1 + dAf[11]*t0;
+  float dw3 = dAf[13]*t2 + dAf[14]*t1 + dAf[15]*t0;
+
+  // Second-derivative weights: rows of d2Af; only the t and 1 columns.
+  float d2w0 = d2Af[ 2]*t1 + d2Af[ 3]*t0;
+  float d2w1 = d2Af[ 6]*t1 + d2Af[ 7]*t0;
+  float d2w2 = d2Af[10]*t1 + d2Af[11]*t0;
+  float d2w3 = d2Af[14]*t1 + d2Af[15]*t0;
+
+  *val = (c[0]*w0 + c[1]*w1 + c[2]*w2 + c[3]*w3);
+  *grad = dxInv *
+    (c[0]*dw0 + c[1]*dw1 + c[2]*dw2 + c[3]*dw3);
+  *lapl = dxInv * dxInv *
+    (c[0]*d2w0 + c[1]*d2w1 + c[2]*d2w2 + c[3]*d2w3);
+}
+/* Value, first derivative, and Hessian.  This forwards directly to
+   eval_UBspline_1d_c_vgl, passing hess where that routine takes lapl, so
+   *hess receives the same second derivative that the vgl routine writes. */
+inline void
+eval_UBspline_1d_c_vgh (UBspline_1d_c * restrict spline, double x, 
+			complex_float* restrict val, 
+			complex_float* restrict grad,
+			complex_float* restrict hess)
+{
+  eval_UBspline_1d_c_vgl (spline, x, val, grad, hess);
+}
+
+/************************************************************/
+/* 2D single-precision, complex evaluation functions        */
+/************************************************************/
+
+/* Value only: evaluate the complex-valued 2D spline at (x, y) into *val. */
+inline void
+eval_UBspline_2d_c (UBspline_2d_c * restrict spline,
+                    double x, double y, complex_float* restrict val)
+{
+  // Convert to grid units, then split each axis into cell index and offset.
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  float ux = x*spline->x_grid.delta_inv;
+  float uy = y*spline->y_grid.delta_inv;
+  float cellx, celly, tx, ty;
+  tx = modff (ux, &cellx);
+  ty = modff (uy, &celly);
+  int ix = (int) cellx;
+  int iy = (int) celly;
+
+  // Powers of the in-cell offsets.
+  float tpx[4], tpy[4], a[4], b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+
+  // Basis weights along x (a) and y (b): row r of Af against the powers.
+  int r;
+  for (r = 0; r < 4; r++) {
+    a[r] = (Af[4*r+0]*tpx[0] + Af[4*r+1]*tpx[1] + Af[4*r+2]*tpx[2] + Af[4*r+3]*tpx[3]);
+    b[r] = (Af[4*r+0]*tpy[0] + Af[4*r+1]*tpy[1] + Af[4*r+2]*tpy[2] + Af[4*r+3]*tpy[3]);
+  }
+
+  // Contract each of the four coefficient rows with the y weights.
+  int xs = spline->x_stride;
+  complex_float rowsum[4];
+  for (r = 0; r < 4; r++) {
+    int base = (ix+r)*xs+iy;
+    rowsum[r] = coefs[base+0]*b[0]+coefs[base+1]*b[1]+coefs[base+2]*b[2]+coefs[base+3]*b[3];
+  }
+
+  // Then contract the row results with the x weights.
+  *val = (a[0]*rowsum[0]+
+          a[1]*rowsum[1]+
+          a[2]*rowsum[2]+
+          a[3]*rowsum[3]);
+}
+
+
+/* Value and gradient of the complex-valued 2D spline at (x, y).
+   *val receives the value, grad[0] the x derivative, grad[1] the y
+   derivative. */
+inline void
+eval_UBspline_2d_c_vg (UBspline_2d_c * restrict spline,
+                       double x, double y,
+                       complex_float* restrict val,
+                       complex_float* restrict grad)
+{
+  // Convert to grid units, then split each axis into cell index and offset.
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  float ux = x*spline->x_grid.delta_inv;
+  float uy = y*spline->y_grid.delta_inv;
+  float cellx, celly, tx, ty;
+  tx = modff (ux, &cellx);
+  ty = modff (uy, &celly);
+  int ix = (int) cellx;
+  int iy = (int) celly;
+
+  // Powers of the in-cell offsets.
+  float tpx[4], tpy[4], a[4], b[4], da[4], db[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+
+  // Value weights (a, b) from Af and derivative weights (da, db) from dAf;
+  // the derivative rows do not use the t^3 column.
+  int r;
+  for (r = 0; r < 4; r++) {
+    a[r]  = (Af[4*r+0]*tpx[0] + Af[4*r+1]*tpx[1] + Af[4*r+2]*tpx[2] + Af[4*r+3]*tpx[3]);
+    da[r] = (dAf[4*r+1]*tpx[1] + dAf[4*r+2]*tpx[2] + dAf[4*r+3]*tpx[3]);
+    b[r]  = (Af[4*r+0]*tpy[0] + Af[4*r+1]*tpy[1] + Af[4*r+2]*tpy[2] + Af[4*r+3]*tpy[3]);
+    db[r] = (dAf[4*r+1]*tpy[1] + dAf[4*r+2]*tpy[2] + dAf[4*r+3]*tpy[3]);
+  }
+
+  int xs = spline->x_stride;
+  float dxInv = spline->x_grid.delta_inv;
+  float dyInv = spline->y_grid.delta_inv;
+
+  // Contract each coefficient row with the y value weights (rowb) and with
+  // the y derivative weights (rowdb).
+  complex_float rowb[4], rowdb[4];
+  for (r = 0; r < 4; r++) {
+    int base = (ix+r)*xs+iy;
+    rowb[r]  = coefs[base+0]*b[0]+coefs[base+1]*b[1]+coefs[base+2]*b[2]+coefs[base+3]*b[3];
+    rowdb[r] = coefs[base+0]*db[0]+coefs[base+1]*db[1]+coefs[base+2]*db[2]+coefs[base+3]*db[3];
+  }
+
+  *val =
+    (a[0]*rowb[0]+
+     a[1]*rowb[1]+
+     a[2]*rowb[2]+
+     a[3]*rowb[3]);
+  // Each derivative picks up the inverse grid spacing of its own axis.
+  grad[0] = dxInv *
+    (da[0]*rowb[0]+
+     da[1]*rowb[1]+
+     da[2]*rowb[2]+
+     da[3]*rowb[3]);
+  grad[1] = dyInv *
+    (a[0]*rowdb[0]+
+     a[1]*rowdb[1]+
+     a[2]*rowdb[2]+
+     a[3]*rowdb[3]);
+}
+
+/* Value, gradient, and laplacian of a 2D complex spline at (x,y); results go to *val, grad[0..1], *lapl */
+inline void
+eval_UBspline_2d_c_vgl (UBspline_2d_c * restrict spline, 
+			double x, double y, complex_float* restrict val, 
+			complex_float* restrict grad, complex_float* restrict lapl)
+{
+  x -= spline->x_grid.start;  /* coordinates relative to the grid start */
+  y -= spline->y_grid.start;
+  float ux = x*spline->x_grid.delta_inv;  /* scaled by delta_inv (presumably 1/grid spacing -- confirm) */
+  float uy = y*spline->y_grid.delta_inv;
+  float ipartx, iparty, tx, ty;
+  tx = modff (ux, &ipartx);  /* tx,ty: fractional parts; ipartx,iparty: integer parts */
+  ty = modff (uy, &iparty);
+  int ix = (int) ipartx;  /* ix,iy: base indices of the 4x4 block of coefficients read below */
+  int iy = (int) iparty;
+  /* NOTE(review): ix,iy are neither clamped nor range-checked here -- confirm callers keep (x,y) in range. */
+  float tpx[4], tpy[4], a[4], b[4], da[4], db[4], d2a[4], d2b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* {t^3, t^2, t, 1} */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+  /* x weights from the Af/dAf/d2Af tables (defined outside this function): a[i] = sum_j Af[4*i+j]*tpx[j]. */
+  a[0]   = (  Af[ 0]*tpx[0] +   Af[ 1]*tpx[1] +  Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
+  a[1]   = (  Af[ 4]*tpx[0] +   Af[ 5]*tpx[1] +  Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
+  a[2]   = (  Af[ 8]*tpx[0] +   Af[ 9]*tpx[1] +  Af[10]*tpx[2] + Af[11]*tpx[3]);
+  a[3]   = (  Af[12]*tpx[0] +   Af[13]*tpx[1] +  Af[14]*tpx[2] + Af[15]*tpx[3]);
+  da[0]  = ( dAf[ 1]*tpx[1] +  dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);  /* dAf entry 4*i+0 is not used (presumably zero) */
+  da[1]  = ( dAf[ 5]*tpx[1] +  dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
+  da[2]  = ( dAf[ 9]*tpx[1] +  dAf[10]*tpx[2] + dAf[11]*tpx[3]);
+  da[3]  = ( dAf[13]*tpx[1] +  dAf[14]*tpx[2] + dAf[15]*tpx[3]);
+  d2a[0] = (d2Af[ 2]*tpx[2] + d2Af[ 3]*tpx[3]);  /* d2Af entries 4*i+0 and 4*i+1 are not used (presumably zero) */
+  d2a[1] = (d2Af[ 6]*tpx[2] + d2Af[ 7]*tpx[3]);
+  d2a[2] = (d2Af[10]*tpx[2] + d2Af[11]*tpx[3]);
+  d2a[3] = (d2Af[14]*tpx[2] + d2Af[15]*tpx[3]);
+  /* y weights b, db, d2b: same formulas as above with tpy in place of tpx. */
+  b[0]  = ( Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
+  b[1]  = ( Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
+  b[2]  = ( Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
+  b[3]  = ( Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
+  db[0] = (dAf[ 1]*tpy[1] + dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
+  db[1] = (dAf[ 5]*tpy[1] + dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
+  db[2] = (dAf[ 9]*tpy[1] + dAf[10]*tpy[2] + dAf[11]*tpy[3]);
+  db[3] = (dAf[13]*tpy[1] + dAf[14]*tpy[2] + dAf[15]*tpy[3]);
+  d2b[0] = (d2Af[ 2]*tpy[2] + d2Af[ 3]*tpy[3]);
+  d2b[1] = (d2Af[ 6]*tpy[2] + d2Af[ 7]*tpy[3]);
+  d2b[2] = (d2Af[10]*tpy[2] + d2Af[11]*tpy[3]);
+  d2b[3] = (d2Af[14]*tpy[2] + d2Af[15]*tpy[3]);
+  /* xs: distance in coefs between consecutive x indices; y indices are contiguous (see C below). */
+  int xs = spline->x_stride;
+
+  float dxInv = spline->x_grid.delta_inv;  /* derivative scale factors applied to grad and lapl */
+  float dyInv = spline->y_grid.delta_inv;
+  /* C(i,j): coefficient at offset (i,j) from the base indices (ix,iy). */
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+  *val =      /* sum over i,j of a[i]*C(i,j)*b[j] */
+    (a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  grad[0] = dxInv *  /* x derivative: same sum with da in place of a */
+    (da[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     da[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     da[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     da[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  grad[1] = dyInv*  /* y derivative: same sum with db in place of b */
+    (a[0]*(C(0,0)*db[0]+C(0,1)*db[1]+C(0,2)*db[2]+C(0,3)*db[3])+
+     a[1]*(C(1,0)*db[0]+C(1,1)*db[1]+C(1,2)*db[2]+C(1,3)*db[3])+
+     a[2]*(C(2,0)*db[0]+C(2,1)*db[1]+C(2,2)*db[2]+C(2,3)*db[3])+
+     a[3]*(C(3,0)*db[0]+C(3,1)*db[1]+C(3,2)*db[2]+C(3,3)*db[3]));
+  *lapl   =   /* second y derivative (d2b) plus second x derivative (d2a), each scaled by its delta_inv squared */
+    dyInv * dyInv *
+    (a[0]*(C(0,0)*d2b[0]+C(0,1)*d2b[1]+C(0,2)*d2b[2]+C(0,3)*d2b[3])+
+      a[1]*(C(1,0)*d2b[0]+C(1,1)*d2b[1]+C(1,2)*d2b[2]+C(1,3)*d2b[3])+
+      a[2]*(C(2,0)*d2b[0]+C(2,1)*d2b[1]+C(2,2)*d2b[2]+C(2,3)*d2b[3])+
+     a[3]*(C(3,0)*d2b[0]+C(3,1)*d2b[1]+C(3,2)*d2b[2]+C(3,3)*d2b[3])) + 
+    dxInv * dxInv *
+     (d2a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+      d2a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+      d2a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+      d2a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  
+#undef C
+
+}
+
+/* Value, gradient, and Hessian of a 2D complex spline at (x,y); hess[0..3] is filled as xx, xy, yx, yy */
+inline void
+eval_UBspline_2d_c_vgh (UBspline_2d_c * restrict spline, 
+			double x, double y, complex_float* restrict val, 
+			complex_float* restrict grad, complex_float* restrict hess)
+{
+  x -= spline->x_grid.start;  /* coordinates relative to the grid start */
+  y -= spline->y_grid.start;
+  float ux = x*spline->x_grid.delta_inv;  /* scaled by delta_inv (presumably 1/grid spacing -- confirm) */
+  float uy = y*spline->y_grid.delta_inv;
+  float ipartx, iparty, tx, ty;
+  tx = modff (ux, &ipartx);  /* tx,ty: fractional parts; ipartx,iparty: integer parts */
+  ty = modff (uy, &iparty);
+  int ix = (int) ipartx;  /* ix,iy: base indices of the 4x4 block of coefficients read below */
+  int iy = (int) iparty;
+  /* NOTE(review): ix,iy are neither clamped nor range-checked here -- confirm callers keep (x,y) in range. */
+  float tpx[4], tpy[4], a[4], b[4], da[4], db[4], d2a[4], d2b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* {t^3, t^2, t, 1} */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+  /* x weights from the Af/dAf/d2Af tables (defined outside this function): a[i] = sum_j Af[4*i+j]*tpx[j]. */
+  a[0]   = (  Af[ 0]*tpx[0] +   Af[ 1]*tpx[1] +  Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
+  a[1]   = (  Af[ 4]*tpx[0] +   Af[ 5]*tpx[1] +  Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
+  a[2]   = (  Af[ 8]*tpx[0] +   Af[ 9]*tpx[1] +  Af[10]*tpx[2] + Af[11]*tpx[3]);
+  a[3]   = (  Af[12]*tpx[0] +   Af[13]*tpx[1] +  Af[14]*tpx[2] + Af[15]*tpx[3]);
+  da[0]  = ( dAf[ 1]*tpx[1] +  dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);  /* dAf entry 4*i+0 is not used (presumably zero) */
+  da[1]  = ( dAf[ 5]*tpx[1] +  dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
+  da[2]  = ( dAf[ 9]*tpx[1] +  dAf[10]*tpx[2] + dAf[11]*tpx[3]);
+  da[3]  = ( dAf[13]*tpx[1] +  dAf[14]*tpx[2] + dAf[15]*tpx[3]);
+  d2a[0] = (d2Af[ 2]*tpx[2] + d2Af[ 3]*tpx[3]);  /* d2Af entries 4*i+0 and 4*i+1 are not used (presumably zero) */
+  d2a[1] = (d2Af[ 6]*tpx[2] + d2Af[ 7]*tpx[3]);
+  d2a[2] = (d2Af[10]*tpx[2] + d2Af[11]*tpx[3]);
+  d2a[3] = (d2Af[14]*tpx[2] + d2Af[15]*tpx[3]);
+  /* y weights b, db, d2b: same formulas as above with tpy in place of tpx. */
+  b[0]   = (  Af[ 0]*tpy[0] +   Af[ 1]*tpy[1] +  Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
+  b[1]   = (  Af[ 4]*tpy[0] +   Af[ 5]*tpy[1] +  Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
+  b[2]   = (  Af[ 8]*tpy[0] +   Af[ 9]*tpy[1] +  Af[10]*tpy[2] + Af[11]*tpy[3]);
+  b[3]   = (  Af[12]*tpy[0] +   Af[13]*tpy[1] +  Af[14]*tpy[2] + Af[15]*tpy[3]);
+  db[0]  = ( dAf[ 1]*tpy[1] +  dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
+  db[1]  = ( dAf[ 5]*tpy[1] +  dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
+  db[2]  = ( dAf[ 9]*tpy[1] +  dAf[10]*tpy[2] + dAf[11]*tpy[3]);
+  db[3]  = ( dAf[13]*tpy[1] +  dAf[14]*tpy[2] + dAf[15]*tpy[3]);
+  d2b[0] = (d2Af[ 2]*tpy[2] + d2Af[ 3]*tpy[3]);
+  d2b[1] = (d2Af[ 6]*tpy[2] + d2Af[ 7]*tpy[3]);
+  d2b[2] = (d2Af[10]*tpy[2] + d2Af[11]*tpy[3]);
+  d2b[3] = (d2Af[14]*tpy[2] + d2Af[15]*tpy[3]);
+  /* xs: distance in coefs between consecutive x indices; y indices are contiguous (see C below). */
+  int xs = spline->x_stride;
+  float dxInv = spline->x_grid.delta_inv;  /* derivative scale factors applied to grad and hess */
+  float dyInv = spline->y_grid.delta_inv;
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+  *val =      /* sum over i,j of a[i]*C(i,j)*b[j] */
+    (  a[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+       a[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+       a[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+       a[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  grad[0] = dxInv *  /* x derivative: da in place of a */
+    ( da[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+      da[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+      da[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+      da[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  grad[1] = dyInv *  /* y derivative: db in place of b */
+    (  a[0]*(C(0,0)* db[0]+C(0,1)* db[1]+C(0,2)* db[2]+C(0,3)* db[3])+
+       a[1]*(C(1,0)* db[0]+C(1,1)* db[1]+C(1,2)* db[2]+C(1,3)* db[3])+
+       a[2]*(C(2,0)* db[0]+C(2,1)* db[1]+C(2,2)* db[2]+C(2,3)* db[3])+
+       a[3]*(C(3,0)* db[0]+C(3,1)* db[1]+C(3,2)* db[2]+C(3,3)* db[3]));
+  hess[0] = dxInv * dxInv *  /* xx: d2a with b */
+    (d2a[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+     d2a[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+     d2a[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+     d2a[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  hess[1] = dxInv * dyInv *  /* xy: da with db */
+    ( da[0]*(C(0,0)* db[0]+C(0,1)* db[1]+C(0,2)* db[2]+C(0,3)* db[3])+
+      da[1]*(C(1,0)* db[0]+C(1,1)* db[1]+C(1,2)* db[2]+C(1,3)* db[3])+
+      da[2]*(C(2,0)* db[0]+C(2,1)* db[1]+C(2,2)* db[2]+C(2,3)* db[3])+
+      da[3]*(C(3,0)* db[0]+C(3,1)* db[1]+C(3,2)* db[2]+C(3,3)* db[3]));
+  hess[3] = dyInv * dyInv *  /* yy: a with d2b */
+    (  a[0]*(C(0,0)*d2b[0]+C(0,1)*d2b[1]+C(0,2)*d2b[2]+C(0,3)*d2b[3])+
+       a[1]*(C(1,0)*d2b[0]+C(1,1)*d2b[1]+C(1,2)*d2b[2]+C(1,3)*d2b[3])+
+       a[2]*(C(2,0)*d2b[0]+C(2,1)*d2b[1]+C(2,2)*d2b[2]+C(2,3)*d2b[3])+
+       a[3]*(C(3,0)*d2b[0]+C(3,1)*d2b[1]+C(3,2)*d2b[2]+C(3,3)*d2b[3]));
+  hess[2] = hess[1];  /* yx entry copied from xy, so the stored 2x2 matrix is symmetric */
+  
+#undef C
+
+}
+
+
+/************************************************************/
+/* 3D single-precision, complex evaluation functions        */
+/************************************************************/
+
+/* Value only: evaluates a 3D complex spline at (x,y,z) and stores the result in *val */
+inline void
+eval_UBspline_3d_c (UBspline_3d_c * restrict spline, 
+		    double x, double y, double z,
+		    complex_float* restrict val)
+{
+  x -= spline->x_grid.start;  /* coordinates relative to the grid start */
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  float ux = x*spline->x_grid.delta_inv;  /* scaled by delta_inv (presumably 1/grid spacing -- confirm) */
+  float uy = y*spline->y_grid.delta_inv;
+  float uz = z*spline->z_grid.delta_inv;
+  float ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modff (ux, &ipartx);  int ix = (int) ipartx;  /* fractional part t and integer base index, per axis */
+  ty = modff (uy, &iparty);  int iy = (int) iparty;
+  tz = modff (uz, &ipartz);  int iz = (int) ipartz;
+  /* NOTE(review): ix,iy,iz are neither clamped nor range-checked here -- confirm callers keep the point in range. */
+
+
+  
+  float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* {t^3, t^2, t, 1} */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+  /* Per-axis weights from the Af table (defined outside this function): a[i] = sum_j Af[4*i+j]*tpx[j]. */
+  a[0] = (Af[ 0]*tpx[0] + Af[ 1]*tpx[1] + Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
+  a[1] = (Af[ 4]*tpx[0] + Af[ 5]*tpx[1] + Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
+  a[2] = (Af[ 8]*tpx[0] + Af[ 9]*tpx[1] + Af[10]*tpx[2] + Af[11]*tpx[3]);
+  a[3] = (Af[12]*tpx[0] + Af[13]*tpx[1] + Af[14]*tpx[2] + Af[15]*tpx[3]);
+  /* b: same formula with tpy. */
+  b[0] = (Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
+  b[1] = (Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
+  b[2] = (Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
+  b[3] = (Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
+  /* c: same formula with tpz. */
+  c[0] = (Af[ 0]*tpz[0] + Af[ 1]*tpz[1] + Af[ 2]*tpz[2] + Af[ 3]*tpz[3]);
+  c[1] = (Af[ 4]*tpz[0] + Af[ 5]*tpz[1] + Af[ 6]*tpz[2] + Af[ 7]*tpz[3]);
+  c[2] = (Af[ 8]*tpz[0] + Af[ 9]*tpz[1] + Af[10]*tpz[2] + Af[11]*tpz[3]);
+  c[3] = (Af[12]*tpz[0] + Af[13]*tpz[1] + Af[14]*tpz[2] + Af[15]*tpz[3]);
+  /* xs, ys: distances in coefs between consecutive x and y indices; z indices are contiguous (see P below). */
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  *val = (a[0]*(b[0]*(P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3])+  /* sum over i,j,k of a[i]*b[j]*P(i,j,k)*c[k] */
+		b[1]*(P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3])+
+		b[2]*(P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3])+
+		b[3]*(P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]))+
+	  a[1]*(b[0]*(P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3])+
+		b[1]*(P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3])+
+		b[2]*(P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3])+
+		b[3]*(P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]))+
+	  a[2]*(b[0]*(P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3])+
+		b[1]*(P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3])+
+		b[2]*(P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3])+
+		b[3]*(P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]))+
+	  a[3]*(b[0]*(P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3])+
+		b[1]*(P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3])+
+		b[2]*(P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3])+
+		b[3]*(P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3])));
+#undef P
+
+}
+
+/* Value and gradient of a 3D complex spline at (x,y,z); results go to *val and grad[0..2] */
+inline void
+eval_UBspline_3d_c_vg (UBspline_3d_c * restrict spline, 
+			double x, double y, double z,
+			complex_float* restrict val, complex_float* restrict grad)
+{
+  x -= spline->x_grid.start;  /* coordinates relative to the grid start */
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  float ux = x*spline->x_grid.delta_inv;  /* scaled by delta_inv (presumably 1/grid spacing -- confirm) */
+  float uy = y*spline->y_grid.delta_inv;
+  float uz = z*spline->z_grid.delta_inv;
+  float ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modff (ux, &ipartx);  int ix = (int) ipartx;  /* fractional part t and integer base index, per axis */
+  ty = modff (uy, &iparty);  int iy = (int) iparty; 
+  tz = modff (uz, &ipartz);  int iz = (int) ipartz; 
+  /* NOTE(review): ix,iy,iz are neither clamped nor range-checked here -- confirm callers keep the point in range. */
+  float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4];
+  complex_float cP[16], bcP[4], dbcP[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* {t^3, t^2, t, 1} */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+  /* x weights from the Af/dAf tables (defined outside this function): a[i] = sum_j Af[4*i+j]*tpx[j]. */
+  a[0]   = (  Af[ 0]*tpx[0] +   Af[ 1]*tpx[1] +  Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
+  a[1]   = (  Af[ 4]*tpx[0] +   Af[ 5]*tpx[1] +  Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
+  a[2]   = (  Af[ 8]*tpx[0] +   Af[ 9]*tpx[1] +  Af[10]*tpx[2] + Af[11]*tpx[3]);
+  a[3]   = (  Af[12]*tpx[0] +   Af[13]*tpx[1] +  Af[14]*tpx[2] + Af[15]*tpx[3]);
+  da[0]  = ( dAf[ 1]*tpx[1] +  dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);  /* dAf entry 4*i+0 is not used (presumably zero) */
+  da[1]  = ( dAf[ 5]*tpx[1] +  dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
+  da[2]  = ( dAf[ 9]*tpx[1] +  dAf[10]*tpx[2] + dAf[11]*tpx[3]);
+  da[3]  = ( dAf[13]*tpx[1] +  dAf[14]*tpx[2] + dAf[15]*tpx[3]);
+  /* y weights b, db: same formulas with tpy. */
+  b[0]  = ( Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
+  b[1]  = ( Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
+  b[2]  = ( Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
+  b[3]  = ( Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
+  db[0] = (dAf[ 1]*tpy[1] + dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
+  db[1] = (dAf[ 5]*tpy[1] + dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
+  db[2] = (dAf[ 9]*tpy[1] + dAf[10]*tpy[2] + dAf[11]*tpy[3]);
+  db[3] = (dAf[13]*tpy[1] + dAf[14]*tpy[2] + dAf[15]*tpy[3]);
+  /* z weights c, dc: same formulas with tpz. */
+  c[0]  = ( Af[ 0]*tpz[0] + Af[ 1]*tpz[1] + Af[ 2]*tpz[2] + Af[ 3]*tpz[3]);
+  c[1]  = ( Af[ 4]*tpz[0] + Af[ 5]*tpz[1] + Af[ 6]*tpz[2] + Af[ 7]*tpz[3]);
+  c[2]  = ( Af[ 8]*tpz[0] + Af[ 9]*tpz[1] + Af[10]*tpz[2] + Af[11]*tpz[3]);
+  c[3]  = ( Af[12]*tpz[0] + Af[13]*tpz[1] + Af[14]*tpz[2] + Af[15]*tpz[3]);
+  dc[0] = (dAf[ 1]*tpz[1] + dAf[ 2]*tpz[2] + dAf[ 3]*tpz[3]);
+  dc[1] = (dAf[ 5]*tpz[1] + dAf[ 6]*tpz[2] + dAf[ 7]*tpz[3]);
+  dc[2] = (dAf[ 9]*tpz[1] + dAf[10]*tpz[2] + dAf[11]*tpz[3]);
+  dc[3] = (dAf[13]*tpz[1] + dAf[14]*tpz[2] + dAf[15]*tpz[3]);
+  /* xs, ys: distances in coefs between consecutive x and y indices; z indices are contiguous (see P below). */
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+
+  float dxInv = spline->x_grid.delta_inv;  /* derivative scale factors applied to grad */
+  float dyInv = spline->y_grid.delta_inv;
+  float dzInv = spline->z_grid.delta_inv;
+  /* cP[4*i+j] = sum_k P(i,j,k)*c[k]: the z axis is contracted first and reused for val, grad[0], grad[1]. */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+  /* bcP[i] = sum_j b[j]*cP[4*i+j]: y axis contracted with the value weights. */
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+  /* dbcP[i] = sum_j db[j]*cP[4*i+j]: y axis contracted with the derivative weights. */
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+  /* Final contraction over x. */
+  *val    = ( a[0]*bcP[0] +  a[1]*bcP[1] +  a[2]*bcP[2] +  a[3]*bcP[3]);
+  grad[0] = dxInv * 
+    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = dyInv *
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = dzInv *  /* z derivative: needs dc, so it is summed directly from P rather than from cP */
+    (a[0]*(b[0]*(P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3])+
+	   b[1]*(P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3])+
+	   b[2]*(P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3])+
+	   b[3]*(P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]))+
+     a[1]*(b[0]*(P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3])+
+	   b[1]*(P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3])+
+	   b[2]*(P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3])+
+	   b[3]*(P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]))+
+     a[2]*(b[0]*(P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3])+
+	   b[1]*(P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3])+
+	   b[2]*(P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3])+
+	   b[3]*(P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]))+
+     a[3]*(b[0]*(P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3])+
+	   b[1]*(P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3])+
+	   b[2]*(P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3])+
+	   b[3]*(P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3])));
+#undef P
+
+}
+
+
+
+/* Value, gradient, and laplacian of a 3D complex spline at (x,y,z); results go to *val, grad[0..2], *lapl */
+inline void
+eval_UBspline_3d_c_vgl (UBspline_3d_c * restrict spline, 
+			double x, double y, double z,
+			complex_float* restrict val, complex_float* restrict grad, 
+			complex_float* restrict lapl)
+{
+  x -= spline->x_grid.start;  /* coordinates relative to the grid start */
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  float ux = x*spline->x_grid.delta_inv;  /* scaled by delta_inv (presumably 1/grid spacing -- confirm) */
+  float uy = y*spline->y_grid.delta_inv;
+  float uz = z*spline->z_grid.delta_inv;
+  float ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modff (ux, &ipartx);  int ix = (int) ipartx;  /* fractional part t and integer base index, per axis */
+  ty = modff (uy, &iparty);  int iy = (int) iparty; 
+  tz = modff (uz, &ipartz);  int iz = (int) ipartz; 
+  /* NOTE(review): ix,iy,iz are neither clamped nor range-checked here -- confirm callers keep the point in range. */
+  float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4], 
+    d2a[4], d2b[4], d2c[4];
+  complex_float cP[16], dcP[16], bcP[4], dbcP[4], d2bcP[4], bdcP[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* {t^3, t^2, t, 1} */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+  /* x weights from the Af/dAf/d2Af tables (defined outside this function): a[i] = sum_j Af[4*i+j]*tpx[j]. */
+  a[0]   = (  Af[ 0]*tpx[0] +   Af[ 1]*tpx[1] +  Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
+  a[1]   = (  Af[ 4]*tpx[0] +   Af[ 5]*tpx[1] +  Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
+  a[2]   = (  Af[ 8]*tpx[0] +   Af[ 9]*tpx[1] +  Af[10]*tpx[2] + Af[11]*tpx[3]);
+  a[3]   = (  Af[12]*tpx[0] +   Af[13]*tpx[1] +  Af[14]*tpx[2] + Af[15]*tpx[3]);
+  da[0]  = ( dAf[ 1]*tpx[1] +  dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);  /* dAf entry 4*i+0 is not used (presumably zero) */
+  da[1]  = ( dAf[ 5]*tpx[1] +  dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
+  da[2]  = ( dAf[ 9]*tpx[1] +  dAf[10]*tpx[2] + dAf[11]*tpx[3]);
+  da[3]  = ( dAf[13]*tpx[1] +  dAf[14]*tpx[2] + dAf[15]*tpx[3]);
+  d2a[0] = (d2Af[ 2]*tpx[2] + d2Af[ 3]*tpx[3]);  /* d2Af entries 4*i+0 and 4*i+1 are not used (presumably zero) */
+  d2a[1] = (d2Af[ 6]*tpx[2] + d2Af[ 7]*tpx[3]);
+  d2a[2] = (d2Af[10]*tpx[2] + d2Af[11]*tpx[3]);
+  d2a[3] = (d2Af[14]*tpx[2] + d2Af[15]*tpx[3]);
+  /* y weights b, db, d2b: same formulas with tpy. */
+  b[0]  = ( Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
+  b[1]  = ( Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
+  b[2]  = ( Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
+  b[3]  = ( Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
+  db[0] = (dAf[ 1]*tpy[1] + dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
+  db[1] = (dAf[ 5]*tpy[1] + dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
+  db[2] = (dAf[ 9]*tpy[1] + dAf[10]*tpy[2] + dAf[11]*tpy[3]);
+  db[3] = (dAf[13]*tpy[1] + dAf[14]*tpy[2] + dAf[15]*tpy[3]);
+  d2b[0] = (d2Af[ 2]*tpy[2] + d2Af[ 3]*tpy[3]);
+  d2b[1] = (d2Af[ 6]*tpy[2] + d2Af[ 7]*tpy[3]);
+  d2b[2] = (d2Af[10]*tpy[2] + d2Af[11]*tpy[3]);
+  d2b[3] = (d2Af[14]*tpy[2] + d2Af[15]*tpy[3]);
+  /* z weights c, dc, d2c: same formulas with tpz. */
+  c[0]  = ( Af[ 0]*tpz[0] + Af[ 1]*tpz[1] + Af[ 2]*tpz[2] + Af[ 3]*tpz[3]);
+  c[1]  = ( Af[ 4]*tpz[0] + Af[ 5]*tpz[1] + Af[ 6]*tpz[2] + Af[ 7]*tpz[3]);
+  c[2]  = ( Af[ 8]*tpz[0] + Af[ 9]*tpz[1] + Af[10]*tpz[2] + Af[11]*tpz[3]);
+  c[3]  = ( Af[12]*tpz[0] + Af[13]*tpz[1] + Af[14]*tpz[2] + Af[15]*tpz[3]);
+  dc[0] = (dAf[ 1]*tpz[1] + dAf[ 2]*tpz[2] + dAf[ 3]*tpz[3]);
+  dc[1] = (dAf[ 5]*tpz[1] + dAf[ 6]*tpz[2] + dAf[ 7]*tpz[3]);
+  dc[2] = (dAf[ 9]*tpz[1] + dAf[10]*tpz[2] + dAf[11]*tpz[3]);
+  dc[3] = (dAf[13]*tpz[1] + dAf[14]*tpz[2] + dAf[15]*tpz[3]);
+  d2c[0] = (d2Af[ 2]*tpz[2] + d2Af[ 3]*tpz[3]);
+  d2c[1] = (d2Af[ 6]*tpz[2] + d2Af[ 7]*tpz[3]);
+  d2c[2] = (d2Af[10]*tpz[2] + d2Af[11]*tpz[3]);
+  d2c[3] = (d2Af[14]*tpz[2] + d2Af[15]*tpz[3]);
+  /* xs, ys: distances in coefs between consecutive x and y indices; z indices are contiguous (see P below). */
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+
+  float dxInv = spline->x_grid.delta_inv;  /* derivative scale factors applied to grad and lapl */
+  float dyInv = spline->y_grid.delta_inv;
+  float dzInv = spline->z_grid.delta_inv;
+  /* cP[4*i+j] = sum_k P(i,j,k)*c[k]: z axis contracted with the value weights. */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+  /* dcP[4*i+j] = sum_k P(i,j,k)*dc[k]: z axis contracted with the first-derivative weights. */
+  dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
+  dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
+  dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
+  dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
+  dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
+  dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
+  dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
+  dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
+  dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
+  dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
+  dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
+  dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
+  dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
+  dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
+  dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
+  dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
+  /* bcP[i] = sum_j b[j]*cP[4*i+j] */
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+  /* dbcP[i] = sum_j db[j]*cP[4*i+j] */
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+  /* bdcP[i] = sum_j b[j]*dcP[4*i+j] */
+  bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
+  bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
+  bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
+  bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
+  /* d2bcP[i] = sum_j d2b[j]*cP[4*i+j] */
+  d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
+  d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
+  d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
+  d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
+
+  /* Final contraction over x for the value and the three gradient components. */
+  *val    = 
+    ( a[0]*bcP[0] +  a[1]*bcP[1] +  a[2]*bcP[2] +  a[3]*bcP[3]);
+
+  grad[0] = dxInv *
+    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = dyInv * 
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = dzInv * 
+    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
+  /* Laplacian: second-derivative terms for x, y, z, each scaled by its delta_inv squared. */
+  *lapl = 
+    dxInv * dxInv * 
+    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3])
+    
+    + dyInv * dyInv * 
+    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]) +
+    /* NOTE(review): the line above ends with '+' and the next starts with '+'; the second is a unary plus, so the value is unchanged. */
+    + dzInv * dzInv *  /* z term uses d2c, so it is summed directly from P rather than from cP */
+    (a[0]*(b[0]*(P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3])+    
+	   b[1]*(P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3])+
+	   b[2]*(P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3])+
+	   b[3]*(P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]))+
+     a[1]*(b[0]*(P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3])+
+	   b[1]*(P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3])+
+	   b[2]*(P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3])+
+	   b[3]*(P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]))+
+     a[2]*(b[0]*(P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3])+
+	   b[1]*(P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3])+
+	   b[2]*(P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3])+
+	   b[3]*(P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]))+
+     a[3]*(b[0]*(P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3])+
+	   b[1]*(P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3])+
+	   b[2]*(P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3])+
+	   b[3]*(P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3])));
+#undef P
+
+}
+
+
+
+
+
+/* Value, gradient, and Hessian */
+inline void
+eval_UBspline_3d_c_vgh (UBspline_3d_c * restrict spline, 
+			double x, double y, double z,
+			complex_float* restrict val, complex_float* restrict grad, 
+			complex_float* restrict hess)
+{
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;  
+  z -= spline->z_grid.start;
+  float ux = x*spline->x_grid.delta_inv;
+  float uy = y*spline->y_grid.delta_inv;
+  float uz = z*spline->z_grid.delta_inv;
+  ux = fmin (ux, (double)(spline->x_grid.num)-1.0e-5);
+  uy = fmin (uy, (double)(spline->y_grid.num)-1.0e-5);
+  uz = fmin (uz, (double)(spline->z_grid.num)-1.0e-5);
+  float ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modff (ux, &ipartx);  int ix = (int) ipartx;
+  ty = modff (uy, &iparty);  int iy = (int) iparty;
+  tz = modff (uz, &ipartz);  int iz = (int) ipartz;
+
+  float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4], 
+    d2a[4], d2b[4], d2c[4];
+  complex_float cP[16], dcP[16], d2cP[16], bcP[4], dbcP[4],
+    d2bcP[4], dbdcP[4], bd2cP[4], bdcP[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_float* restrict coefs = spline->coefs;
+
+  a[0]   = (  Af[ 0]*tpx[0] +   Af[ 1]*tpx[1] +  Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
+  a[1]   = (  Af[ 4]*tpx[0] +   Af[ 5]*tpx[1] +  Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
+  a[2]   = (  Af[ 8]*tpx[0] +   Af[ 9]*tpx[1] +  Af[10]*tpx[2] + Af[11]*tpx[3]);
+  a[3]   = (  Af[12]*tpx[0] +   Af[13]*tpx[1] +  Af[14]*tpx[2] + Af[15]*tpx[3]);
+  da[0]  = ( dAf[ 1]*tpx[1] +  dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);
+  da[1]  = ( dAf[ 5]*tpx[1] +  dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
+  da[2]  = ( dAf[ 9]*tpx[1] +  dAf[10]*tpx[2] + dAf[11]*tpx[3]);
+  da[3]  = ( dAf[13]*tpx[1] +  dAf[14]*tpx[2] + dAf[15]*tpx[3]);
+  d2a[0] = (d2Af[ 2]*tpx[2] + d2Af[ 3]*tpx[3]);
+  d2a[1] = (d2Af[ 6]*tpx[2] + d2Af[ 7]*tpx[3]);
+  d2a[2] = (d2Af[10]*tpx[2] + d2Af[11]*tpx[3]);
+  d2a[3] = (d2Af[14]*tpx[2] + d2Af[15]*tpx[3]);
+
+  b[0]  = ( Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
+  b[1]  = ( Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
+  b[2]  = ( Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
+  b[3]  = ( Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
+  db[0] = (dAf[ 1]*tpy[1] + dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
+  db[1] = (dAf[ 5]*tpy[1] + dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
+  db[2] = (dAf[ 9]*tpy[1] + dAf[10]*tpy[2] + dAf[11]*tpy[3]);
+  db[3] = (dAf[13]*tpy[1] + dAf[14]*tpy[2] + dAf[15]*tpy[3]);
+  d2b[0] = (d2Af[ 2]*tpy[2] + d2Af[ 3]*tpy[3]);
+  d2b[1] = (d2Af[ 6]*tpy[2] + d2Af[ 7]*tpy[3]);
+  d2b[2] = (d2Af[10]*tpy[2] + d2Af[11]*tpy[3]);
+  d2b[3] = (d2Af[14]*tpy[2] + d2Af[15]*tpy[3]);
+
+  c[0]  = ( Af[ 0]*tpz[0] + Af[ 1]*tpz[1] + Af[ 2]*tpz[2] + Af[ 3]*tpz[3]);
+  c[1]  = ( Af[ 4]*tpz[0] + Af[ 5]*tpz[1] + Af[ 6]*tpz[2] + Af[ 7]*tpz[3]);
+  c[2]  = ( Af[ 8]*tpz[0] + Af[ 9]*tpz[1] + Af[10]*tpz[2] + Af[11]*tpz[3]);
+  c[3]  = ( Af[12]*tpz[0] + Af[13]*tpz[1] + Af[14]*tpz[2] + Af[15]*tpz[3]);
+  dc[0] = (dAf[ 1]*tpz[1] + dAf[ 2]*tpz[2] + dAf[ 3]*tpz[3]);
+  dc[1] = (dAf[ 5]*tpz[1] + dAf[ 6]*tpz[2] + dAf[ 7]*tpz[3]);
+  dc[2] = (dAf[ 9]*tpz[1] + dAf[10]*tpz[2] + dAf[11]*tpz[3]);
+  dc[3] = (dAf[13]*tpz[1] + dAf[14]*tpz[2] + dAf[15]*tpz[3]);
+  d2c[0] = (d2Af[ 2]*tpz[2] + d2Af[ 3]*tpz[3]);
+  d2c[1] = (d2Af[ 6]*tpz[2] + d2Af[ 7]*tpz[3]);
+  d2c[2] = (d2Af[10]*tpz[2] + d2Af[11]*tpz[3]);
+  d2c[3] = (d2Af[14]*tpz[2] + d2Af[15]*tpz[3]);
+  
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+  int offmax = (ix+3)*xs + (iy+3)*ys + iz+3;
+
+  float dxInv = spline->x_grid.delta_inv;
+  float dyInv = spline->y_grid.delta_inv;
+  float dzInv = spline->z_grid.delta_inv;
+
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+
+  dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
+  dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
+  dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
+  dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
+  dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
+  dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
+  dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
+  dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
+  dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
+  dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
+  dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
+  dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
+  dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
+  dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
+  dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
+  dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
+
+  d2cP[ 0] = (P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3]);
+  d2cP[ 1] = (P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3]);
+  d2cP[ 2] = (P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3]);
+  d2cP[ 3] = (P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]);
+  d2cP[ 4] = (P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3]);
+  d2cP[ 5] = (P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3]);
+  d2cP[ 6] = (P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3]);
+  d2cP[ 7] = (P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]);
+  d2cP[ 8] = (P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3]);
+  d2cP[ 9] = (P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3]);
+  d2cP[10] = (P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3]);
+  d2cP[11] = (P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]);
+  d2cP[12] = (P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3]);
+  d2cP[13] = (P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3]);
+  d2cP[14] = (P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3]);
+  d2cP[15] = (P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3]);
+
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+
+  bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
+  bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
+  bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
+  bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
+
+  bd2cP[0] = ( b[0]*d2cP[ 0] + b[1]*d2cP[ 1] + b[2]*d2cP[ 2] + b[3]*d2cP[ 3]);
+  bd2cP[1] = ( b[0]*d2cP[ 4] + b[1]*d2cP[ 5] + b[2]*d2cP[ 6] + b[3]*d2cP[ 7]);
+  bd2cP[2] = ( b[0]*d2cP[ 8] + b[1]*d2cP[ 9] + b[2]*d2cP[10] + b[3]*d2cP[11]);
+  bd2cP[3] = ( b[0]*d2cP[12] + b[1]*d2cP[13] + b[2]*d2cP[14] + b[3]*d2cP[15]);
+
+  d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
+  d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
+  d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
+  d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
+  
+  dbdcP[0] = ( db[0]*dcP[ 0] + db[1]*dcP[ 1] + db[2]*dcP[ 2] + db[3]*dcP[ 3]);
+  dbdcP[1] = ( db[0]*dcP[ 4] + db[1]*dcP[ 5] + db[2]*dcP[ 6] + db[3]*dcP[ 7]);
+  dbdcP[2] = ( db[0]*dcP[ 8] + db[1]*dcP[ 9] + db[2]*dcP[10] + db[3]*dcP[11]);
+  dbdcP[3] = ( db[0]*dcP[12] + db[1]*dcP[13] + db[2]*dcP[14] + db[3]*dcP[15]);
+
+  *val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
+  grad[0] = dxInv *
+    (da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = dyInv *
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = dzInv *
+    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
+  // d2x
+  hess[0] = dxInv * dxInv *
+    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
+  // dx dy
+  hess[1] = dxInv * dyInv *
+    (da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
+  hess[3] = hess[1];
+  // dx dz;
+  hess[2] = dxInv * dzInv *
+    (da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
+  hess[6] = hess[2];
+  // d2y
+  hess[4] = dyInv * dyInv *
+    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
+  // dy dz
+  hess[5] = dyInv * dzInv *
+    (a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
+  hess[7] = hess[5];
+  // d2z
+  hess[8] = dzInv * dzInv *
+    (a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
+#undef P
+
+}
+
+#endif
--- a/src/einspline/bspline_eval_std_c_ansi.h
+++ b/src/einspline/bspline_eval_std_c_ansi.h
--- a/src/einspline/bspline_eval_std_d.h
+++ b/src/einspline/bspline_eval_std_d.h
@ -0,0 +1,932 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_EVAL_STD_D_H
+#define BSPLINE_EVAL_STD_D_H
+
+#include <math.h>
+#include <stdio.h>
+
+extern const double* restrict   Ad;  /* defined elsewhere; read below as 4 rows of 4 weights applied to {t^3, t^2, t, 1} */
+extern const double* restrict  dAd;  /* defined elsewhere; used for gradients: only entries 1..3 of each row are read, against {t^2, t, 1} */
+extern const double* restrict d2Ad;  /* defined elsewhere; used for second derivatives: only entries 2..3 of each row are read, against {t, 1} */
+
+/************************************************************/
+/* 1D double-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only.
+ *
+ * Evaluates the 1D double-precision spline at x and stores the result in *val.
+ *   spline : supplies x_grid.start, x_grid.delta_inv and the coefs array
+ *   x      : evaluation point, measured in the same coordinate as x_grid.start
+ *   val    : output, receives the spline value
+ * No range check is made on the computed index i: coefs[i] .. coefs[i+3] are
+ * read as-is.  NOTE(review): the caller presumably has to keep x inside the
+ * grid -- confirm against the code that builds the spline.
+ */
+inline void
+eval_UBspline_1d_d (UBspline_1d_d * restrict spline, 
+		    double x, double* restrict val)
+{
+  x -= spline->x_grid.start;  /* measure x from the start of the grid */
+  double u = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double ipart, t;
+  t = modf (u, &ipart);  /* split u into integer part and fraction; modf gives both the sign of u */
+  int i = (int) ipart;  /* index of the first of the four coefficients used */
+  
+  double tp[4];
+  tp[0] = t*t*t;  tp[1] = t*t;  tp[2] = t;  tp[3] = 1.0;  /* powers of t, highest first */
+  double* restrict coefs = spline->coefs;
+
+  *val =  /* sum over k of coefs[i+k] * (row k of Ad dotted with tp) */
+    (coefs[i+0]*(Ad[ 0]*tp[0] + Ad[ 1]*tp[1] + Ad[ 2]*tp[2] + Ad[ 3]*tp[3])+
+     coefs[i+1]*(Ad[ 4]*tp[0] + Ad[ 5]*tp[1] + Ad[ 6]*tp[2] + Ad[ 7]*tp[3])+
+     coefs[i+2]*(Ad[ 8]*tp[0] + Ad[ 9]*tp[1] + Ad[10]*tp[2] + Ad[11]*tp[3])+
+     coefs[i+3]*(Ad[12]*tp[0] + Ad[13]*tp[1] + Ad[14]*tp[2] + Ad[15]*tp[3]));
+}
+
+/* Value and first derivative.
+ *
+ * Evaluates the 1D double-precision spline and its derivative at x.
+ *   spline : supplies x_grid.start, x_grid.delta_inv and the coefs array
+ *   x      : evaluation point, measured in the same coordinate as x_grid.start
+ *   val    : output, receives the spline value
+ *   grad   : output, receives the derivative with respect to x
+ * No range check is made on the computed index i: coefs[i] .. coefs[i+3] are
+ * read as-is.  NOTE(review): the caller presumably has to keep x inside the
+ * grid -- confirm against the code that builds the spline.
+ */
+inline void
+eval_UBspline_1d_d_vg (UBspline_1d_d * restrict spline, double x, 
+		     double* restrict val, double* restrict grad)
+{
+  x -= spline->x_grid.start;  /* measure x from the start of the grid */
+  double u = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double ipart, t;
+  t = modf (u, &ipart);  /* split u into integer part and fraction; modf gives both the sign of u */
+  int i = (int) ipart;  /* index of the first of the four coefficients used */
+  
+  double tp[4];
+  tp[0] = t*t*t;  tp[1] = t*t;  tp[2] = t;  tp[3] = 1.0;  /* powers of t, highest first */
+  double* restrict coefs = spline->coefs;
+
+  *val =  /* sum over k of coefs[i+k] * (row k of Ad dotted with tp) */
+    (coefs[i+0]*(Ad[ 0]*tp[0] + Ad[ 1]*tp[1] + Ad[ 2]*tp[2] + Ad[ 3]*tp[3])+
+     coefs[i+1]*(Ad[ 4]*tp[0] + Ad[ 5]*tp[1] + Ad[ 6]*tp[2] + Ad[ 7]*tp[3])+
+     coefs[i+2]*(Ad[ 8]*tp[0] + Ad[ 9]*tp[1] + Ad[10]*tp[2] + Ad[11]*tp[3])+
+     coefs[i+3]*(Ad[12]*tp[0] + Ad[13]*tp[1] + Ad[14]*tp[2] + Ad[15]*tp[3]));
+  *grad = spline->x_grid.delta_inv *  /* u = x*delta_inv, so d/dx is delta_inv times the derivative in t */
+    (coefs[i+0]*(dAd[ 1]*tp[1] + dAd[ 2]*tp[2] + dAd[ 3]*tp[3])+
+     coefs[i+1]*(dAd[ 5]*tp[1] + dAd[ 6]*tp[2] + dAd[ 7]*tp[3])+
+     coefs[i+2]*(dAd[ 9]*tp[1] + dAd[10]*tp[2] + dAd[11]*tp[3])+
+     coefs[i+3]*(dAd[13]*tp[1] + dAd[14]*tp[2] + dAd[15]*tp[3]));
+}
+/* Value, first derivative, and second derivative.
+ *
+ * Evaluates the 1D double-precision spline together with its first and
+ * second derivatives at x.
+ *   spline : supplies x_grid.start, x_grid.delta_inv and the coefs array
+ *   x      : evaluation point, measured in the same coordinate as x_grid.start
+ *   val    : output, receives the spline value
+ *   grad   : output, receives the first derivative with respect to x
+ *   lapl   : output, receives the second derivative with respect to x
+ * No range check is made on the computed index i: coefs[i] .. coefs[i+3] are
+ * read as-is.  NOTE(review): the caller presumably has to keep x inside the
+ * grid -- confirm against the code that builds the spline.
+ */
+inline void
+eval_UBspline_1d_d_vgl (UBspline_1d_d * restrict spline, double x, 
+			double* restrict val, double* restrict grad,
+			double* restrict lapl)
+{
+  x -= spline->x_grid.start;  /* measure x from the start of the grid */
+  double u = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double ipart, t;
+  t = modf (u, &ipart);  /* split u into integer part and fraction; modf gives both the sign of u */
+  int i = (int) ipart;  /* index of the first of the four coefficients used */
+  
+  double tp[4];
+  tp[0] = t*t*t;  tp[1] = t*t;  tp[2] = t;  tp[3] = 1.0;  /* powers of t, highest first */
+  double* restrict coefs = spline->coefs;
+
+  *val =  /* sum over k of coefs[i+k] * (row k of Ad dotted with tp) */
+    (coefs[i+0]*(Ad[ 0]*tp[0] + Ad[ 1]*tp[1] + Ad[ 2]*tp[2] + Ad[ 3]*tp[3])+
+     coefs[i+1]*(Ad[ 4]*tp[0] + Ad[ 5]*tp[1] + Ad[ 6]*tp[2] + Ad[ 7]*tp[3])+
+     coefs[i+2]*(Ad[ 8]*tp[0] + Ad[ 9]*tp[1] + Ad[10]*tp[2] + Ad[11]*tp[3])+
+     coefs[i+3]*(Ad[12]*tp[0] + Ad[13]*tp[1] + Ad[14]*tp[2] + Ad[15]*tp[3]));
+  *grad = spline->x_grid.delta_inv *  /* u = x*delta_inv, so d/dx is delta_inv times the derivative in t */
+    (coefs[i+0]*(dAd[ 1]*tp[1] + dAd[ 2]*tp[2] + dAd[ 3]*tp[3])+
+     coefs[i+1]*(dAd[ 5]*tp[1] + dAd[ 6]*tp[2] + dAd[ 7]*tp[3])+
+     coefs[i+2]*(dAd[ 9]*tp[1] + dAd[10]*tp[2] + dAd[11]*tp[3])+
+     coefs[i+3]*(dAd[13]*tp[1] + dAd[14]*tp[2] + dAd[15]*tp[3]));
+  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *  /* second derivative picks up delta_inv twice */
+    (coefs[i+0]*(d2Ad[ 2]*tp[2] + d2Ad[ 3]*tp[3])+
+     coefs[i+1]*(d2Ad[ 6]*tp[2] + d2Ad[ 7]*tp[3])+
+     coefs[i+2]*(d2Ad[10]*tp[2] + d2Ad[11]*tp[3])+
+     coefs[i+3]*(d2Ad[14]*tp[2] + d2Ad[15]*tp[3]));
+}
+
+inline void  /* Value, first derivative, and "Hessian" for the 1D spline */
+eval_UBspline_1d_d_vgh (UBspline_1d_d * restrict spline, double x, 
+			double* restrict val, double* restrict grad,
+			double* restrict hess)
+{
+  eval_UBspline_1d_d_vgl (spline, x, val, grad, hess);  /* pure forwarder: *hess receives what vgl writes through its lapl argument */
+}
+
+
+/************************************************************/
+/* 2D double-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only.
+ *
+ * Evaluates the 2D double-precision spline at (x, y) and stores it in *val.
+ *   spline : supplies x_grid / y_grid (start, delta_inv), x_stride and coefs
+ *   x, y   : evaluation point, measured in the same coordinates as the grid starts
+ *   val    : output, receives the spline value
+ * Each coordinate is shifted by its grid start, scaled by delta_inv and split
+ * with modf into an index (ix, iy) and a fractional offset (tx, ty).  a[] and
+ * b[] hold the four Ad-weighted polynomials in tx and ty, and the result is
+ * sum_i sum_j a[i] * C(i,j) * b[j] over a 4x4 block of coefficients.
+ * No range check is made on ix or iy.  NOTE(review): the caller presumably
+ * has to keep (x, y) inside the grid -- confirm.
+ */
+inline void
+eval_UBspline_2d_d (UBspline_2d_d * restrict spline, 
+		    double x, double y, double* restrict val)
+{
+  x -= spline->x_grid.start;  /* measure each coordinate from the start of its grid */
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double uy = y*spline->y_grid.delta_inv;
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);  /* integer part and fraction; modf gives both the sign of its argument */
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;  /* first row of the 4x4 coefficient block */
+  int iy = (int) iparty;  /* first column of the 4x4 coefficient block */
+  
+  double tpx[4], tpy[4], a[4], b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* powers of tx, highest first */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;  /* powers of ty, highest first */
+  double* restrict coefs = spline->coefs;
+
+  a[0] = (Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);  /* a[k]: row k of Ad dotted with tpx */
+  a[1] = (Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2] = (Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3] = (Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+
+  b[0] = (Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);  /* b[k]: row k of Ad dotted with tpy */
+  b[1] = (Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2] = (Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3] = (Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  
+  int xs = spline->x_stride;  /* offset in coefs between consecutive x indices */
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]  /* coefficient at offset (i,j) from (ix,iy); y index is contiguous */
+  *val = (a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+	  a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+	  a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+	  a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+#undef C
+
+}
+
+
+/* Value and gradient.
+ *
+ * Evaluates the 2D double-precision spline and its gradient at (x, y).
+ *   spline : supplies x_grid / y_grid (start, delta_inv), x_stride and coefs
+ *   x, y   : evaluation point, measured in the same coordinates as the grid starts
+ *   val    : output, receives the spline value
+ *   grad   : output, two entries are written: grad[0] = d/dx, grad[1] = d/dy
+ * a[]/b[] are the Ad-weighted polynomials in tx/ty and da[]/db[] their
+ * dAd-weighted counterparts; each output is a double sum over the 4x4 block
+ * of coefficients starting at (ix, iy).
+ * No range check is made on ix or iy.  NOTE(review): the caller presumably
+ * has to keep (x, y) inside the grid -- confirm.
+ */
+inline void
+eval_UBspline_2d_d_vg (UBspline_2d_d * restrict spline, 
+		       double x, double y, 
+		       double* restrict val, double* restrict grad)
+{
+  x -= spline->x_grid.start;  /* measure each coordinate from the start of its grid */
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double uy = y*spline->y_grid.delta_inv;
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);  /* integer part and fraction; modf gives both the sign of its argument */
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;  /* first row of the 4x4 coefficient block */
+  int iy = (int) iparty;  /* first column of the 4x4 coefficient block */
+  
+  double tpx[4], tpy[4], a[4], b[4], da[4], db[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* powers of tx, highest first */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;  /* powers of ty, highest first */
+  double* restrict coefs = spline->coefs;
+
+  a[0]  = (Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);  /* a[k]: row k of Ad dotted with tpx */
+  a[1]  = (Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]  = (Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]  = (Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0] = (dAd[ 1]*tpx[1] + dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);  /* da[k]: entries 1..3 of row k of dAd against {tx^2, tx, 1} */
+  da[1] = (dAd[ 5]*tpx[1] + dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2] = (dAd[ 9]*tpx[1] + dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3] = (dAd[13]*tpx[1] + dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+
+  b[0]  = (Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);  /* b[k]: row k of Ad dotted with tpy */
+  b[1]  = (Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = (Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = (Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);  /* db[k]: entries 1..3 of row k of dAd against {ty^2, ty, 1} */
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  
+  int xs = spline->x_stride;  /* offset in coefs between consecutive x indices */
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]  /* coefficient at offset (i,j) from (ix,iy); y index is contiguous */
+  *val =    
+    (a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  grad[0] = spline->x_grid.delta_inv *  /* ux = x*delta_inv, so d/dx is delta_inv times the derivative in tx */
+    (da[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     da[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     da[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     da[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  grad[1] = spline->y_grid.delta_inv *  /* same chain-rule factor for y */
+    (a[0]*(C(0,0)*db[0]+C(0,1)*db[1]+C(0,2)*db[2]+C(0,3)*db[3])+
+     a[1]*(C(1,0)*db[0]+C(1,1)*db[1]+C(1,2)*db[2]+C(1,3)*db[3])+
+     a[2]*(C(2,0)*db[0]+C(2,1)*db[1]+C(2,2)*db[2]+C(2,3)*db[3])+
+     a[3]*(C(3,0)*db[0]+C(3,1)*db[1]+C(3,2)*db[2]+C(3,3)*db[3]));
+#undef C
+
+}
+
+/* Value, gradient, and laplacian.
+ *
+ * Evaluates the 2D double-precision spline, its gradient and its Laplacian
+ * at (x, y).
+ *   spline : supplies x_grid / y_grid (start, delta_inv), x_stride and coefs
+ *   x, y   : evaluation point, measured in the same coordinates as the grid starts
+ *   val    : output, receives the spline value
+ *   grad   : output, two entries are written: grad[0] = d/dx, grad[1] = d/dy
+ *   lapl   : output, receives d2/dx2 + d2/dy2
+ * a[]/b[] are the Ad-weighted polynomials in tx/ty, da[]/db[] the
+ * dAd-weighted ones and d2a[]/d2b[] the d2Ad-weighted ones; each output is a
+ * double sum over the 4x4 block of coefficients starting at (ix, iy).
+ * No range check is made on ix or iy.  NOTE(review): the caller presumably
+ * has to keep (x, y) inside the grid -- confirm.
+ */
+inline void
+eval_UBspline_2d_d_vgl (UBspline_2d_d * restrict spline, 
+			double x, double y, double* restrict val, 
+			double* restrict grad, double* restrict lapl)
+{
+  x -= spline->x_grid.start;  /* measure each coordinate from the start of its grid */
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double uy = y*spline->y_grid.delta_inv;
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);  /* integer part and fraction; modf gives both the sign of its argument */
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;  /* first row of the 4x4 coefficient block */
+  int iy = (int) iparty;  /* first column of the 4x4 coefficient block */
+  
+  double tpx[4], tpy[4], a[4], b[4], da[4], db[4], d2a[4], d2b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* powers of tx, highest first */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;  /* powers of ty, highest first */
+  double* restrict coefs = spline->coefs;
+
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);  /* a[k]: row k of Ad dotted with tpx */
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);  /* da[k]: entries 1..3 of row k of dAd against {tx^2, tx, 1} */
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+  d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);  /* d2a[k]: entries 2..3 of row k of d2Ad against {tx, 1} */
+  d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
+  d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
+  d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
+
+  b[0]  = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);  /* b[], db[], d2b[]: same construction in ty */
+  b[1]  = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
+  d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
+  d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
+  d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
+  
+  int xs = spline->x_stride;  /* offset in coefs between consecutive x indices */
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]  /* coefficient at offset (i,j) from (ix,iy); y index is contiguous */
+  *val =    
+    (a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  grad[0] = spline->x_grid.delta_inv *  /* ux = x*delta_inv, so d/dx is delta_inv times the derivative in tx */
+    (da[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     da[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     da[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     da[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  grad[1] = spline->y_grid.delta_inv *  /* same chain-rule factor for y */
+    (a[0]*(C(0,0)*db[0]+C(0,1)*db[1]+C(0,2)*db[2]+C(0,3)*db[3])+
+     a[1]*(C(1,0)*db[0]+C(1,1)*db[1]+C(1,2)*db[2]+C(1,3)*db[3])+
+     a[2]*(C(2,0)*db[0]+C(2,1)*db[1]+C(2,2)*db[2]+C(2,3)*db[3])+
+     a[3]*(C(3,0)*db[0]+C(3,1)*db[1]+C(3,2)*db[2]+C(3,3)*db[3]));
+  *lapl   =  /* d2/dy2 term first, then the d2/dx2 term is added */
+    spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (a[0]*(C(0,0)*d2b[0]+C(0,1)*d2b[1]+C(0,2)*d2b[2]+C(0,3)*d2b[3])+
+      a[1]*(C(1,0)*d2b[0]+C(1,1)*d2b[1]+C(1,2)*d2b[2]+C(1,3)*d2b[3])+
+      a[2]*(C(2,0)*d2b[0]+C(2,1)*d2b[1]+C(2,2)*d2b[2]+C(2,3)*d2b[3])+
+     a[3]*(C(3,0)*d2b[0]+C(3,1)*d2b[1]+C(3,2)*d2b[2]+C(3,3)*d2b[3])) + 
+    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+     (d2a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+      d2a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+      d2a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+      d2a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  
+#undef C
+
+}
+
+/* Value, gradient, and Hessian.
+ *
+ * Evaluates the 2D double-precision spline, its gradient and its Hessian
+ * at (x, y).
+ *   spline : supplies x_grid / y_grid (start, delta_inv), x_stride and coefs
+ *   x, y   : evaluation point, measured in the same coordinates as the grid starts
+ *   val    : output, receives the spline value
+ *   grad   : output, two entries are written: grad[0] = d/dx, grad[1] = d/dy
+ *   hess   : output, four entries are written:
+ *            hess[0] = d2/dx2, hess[1] = hess[2] = d2/dxdy, hess[3] = d2/dy2
+ * a[]/b[] are the Ad-weighted polynomials in tx/ty, da[]/db[] the
+ * dAd-weighted ones and d2a[]/d2b[] the d2Ad-weighted ones; each output is a
+ * double sum over the 4x4 block of coefficients starting at (ix, iy).
+ * No range check is made on ix or iy.  NOTE(review): the caller presumably
+ * has to keep (x, y) inside the grid -- confirm.
+ */
+inline void
+eval_UBspline_2d_d_vgh (UBspline_2d_d * restrict spline, 
+			double x, double y, double* restrict val, 
+			double* restrict grad, double* restrict hess)
+{
+  x -= spline->x_grid.start;  /* measure each coordinate from the start of its grid */
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double uy = y*spline->y_grid.delta_inv;
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);  /* integer part and fraction; modf gives both the sign of its argument */
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;  /* first row of the 4x4 coefficient block */
+  int iy = (int) iparty;  /* first column of the 4x4 coefficient block */
+  
+  double tpx[4], tpy[4], a[4], b[4], da[4], db[4], d2a[4], d2b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* powers of tx, highest first */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;  /* powers of ty, highest first */
+  double* restrict coefs = spline->coefs;
+
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);  /* a[k]: row k of Ad dotted with tpx */
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);  /* da[k]: entries 1..3 of row k of dAd against {tx^2, tx, 1} */
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+  d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);  /* d2a[k]: entries 2..3 of row k of d2Ad against {tx, 1} */
+  d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
+  d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
+  d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
+
+  b[0]   = (  Ad[ 0]*tpy[0] +   Ad[ 1]*tpy[1] +  Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);  /* b[], db[], d2b[]: same construction in ty */
+  b[1]   = (  Ad[ 4]*tpy[0] +   Ad[ 5]*tpy[1] +  Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]   = (  Ad[ 8]*tpy[0] +   Ad[ 9]*tpy[1] +  Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]   = (  Ad[12]*tpy[0] +   Ad[13]*tpy[1] +  Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0]  = ( dAd[ 1]*tpy[1] +  dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1]  = ( dAd[ 5]*tpy[1] +  dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2]  = ( dAd[ 9]*tpy[1] +  dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3]  = ( dAd[13]*tpy[1] +  dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
+  d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
+  d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
+  d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
+  
+  int xs = spline->x_stride;  /* offset in coefs between consecutive x indices */
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]  /* coefficient at offset (i,j) from (ix,iy); y index is contiguous */
+  *val =    
+    (  a[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+       a[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+       a[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+       a[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  grad[0] = spline->x_grid.delta_inv *  /* ux = x*delta_inv, so d/dx is delta_inv times the derivative in tx */
+    ( da[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+      da[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+      da[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+      da[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  grad[1] = spline->y_grid.delta_inv *  /* same chain-rule factor for y */
+    (  a[0]*(C(0,0)* db[0]+C(0,1)* db[1]+C(0,2)* db[2]+C(0,3)* db[3])+
+       a[1]*(C(1,0)* db[0]+C(1,1)* db[1]+C(1,2)* db[2]+C(1,3)* db[3])+
+       a[2]*(C(2,0)* db[0]+C(2,1)* db[1]+C(2,2)* db[2]+C(2,3)* db[3])+
+       a[3]*(C(3,0)* db[0]+C(3,1)* db[1]+C(3,2)* db[2]+C(3,3)* db[3]));
+  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *  /* d2/dx2 */
+    (d2a[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+     d2a[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+     d2a[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+     d2a[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *  /* d2/dxdy */
+    ( da[0]*(C(0,0)* db[0]+C(0,1)* db[1]+C(0,2)* db[2]+C(0,3)* db[3])+
+      da[1]*(C(1,0)* db[0]+C(1,1)* db[1]+C(1,2)* db[2]+C(1,3)* db[3])+
+      da[2]*(C(2,0)* db[0]+C(2,1)* db[1]+C(2,2)* db[2]+C(2,3)* db[3])+
+      da[3]*(C(3,0)* db[0]+C(3,1)* db[1]+C(3,2)* db[2]+C(3,3)* db[3]));
+  hess[3] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *  /* d2/dy2 */
+    (  a[0]*(C(0,0)*d2b[0]+C(0,1)*d2b[1]+C(0,2)*d2b[2]+C(0,3)*d2b[3])+
+       a[1]*(C(1,0)*d2b[0]+C(1,1)*d2b[1]+C(1,2)*d2b[2]+C(1,3)*d2b[3])+
+       a[2]*(C(2,0)*d2b[0]+C(2,1)*d2b[1]+C(2,2)*d2b[2]+C(2,3)*d2b[3])+
+       a[3]*(C(3,0)*d2b[0]+C(3,1)*d2b[1]+C(3,2)*d2b[2]+C(3,3)*d2b[3]));
+  hess[2] = hess[1];  /* mixed derivative is stored in both off-diagonal slots */
+  
+#undef C
+
+}
+
+
+/************************************************************/
+/* 3D double-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only.
+ *
+ * Evaluates the 3D double-precision spline at (x, y, z) and stores it in *val.
+ *   spline  : supplies x_grid / y_grid / z_grid (start, delta_inv),
+ *             x_stride, y_stride and coefs
+ *   x, y, z : evaluation point, measured in the same coordinates as the grid starts
+ *   val     : output, receives the spline value
+ * Each coordinate is shifted by its grid start, scaled by delta_inv and split
+ * with modf into an index and a fractional offset.  a[], b[] and c[] hold the
+ * four Ad-weighted polynomials in tx, ty and tz, and the result is
+ * sum_i sum_j sum_k a[i] * b[j] * P(i,j,k) * c[k] over a 4x4x4 block of
+ * coefficients.
+ * No range check is made on ix, iy or iz.  NOTE(review): the caller
+ * presumably has to keep the point inside the grid -- confirm.
+ */
+inline void
+eval_UBspline_3d_d (UBspline_3d_d * restrict spline, 
+		    double x, double y, double z,
+		    double* restrict val)
+{
+  x -= spline->x_grid.start;  /* measure each coordinate from the start of its grid */
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  double ux = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double uy = y*spline->y_grid.delta_inv;
+  double uz = z*spline->z_grid.delta_inv;
+  double ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modf (ux, &ipartx);  int ix = (int) ipartx;  /* fraction and index; modf gives both the sign of its argument */
+  ty = modf (uy, &iparty);  int iy = (int) iparty;
+  tz = modf (uz, &ipartz);  int iz = (int) ipartz;
+
+
+
+  
+  double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* powers of tx, highest first */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;  /* powers of ty, highest first */
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;  /* powers of tz, highest first */
+  double* restrict coefs = spline->coefs;
+
+  a[0] = (Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);  /* a[k]: row k of Ad dotted with tpx */
+  a[1] = (Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2] = (Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3] = (Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+
+  b[0] = (Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);  /* b[k]: row k of Ad dotted with tpy */
+  b[1] = (Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2] = (Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3] = (Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+
+  c[0] = (Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);  /* c[k]: row k of Ad dotted with tpz */
+  c[1] = (Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
+  c[2] = (Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
+  c[3] = (Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
+  
+  int xs = spline->x_stride;  /* offset in coefs between consecutive x indices */
+  int ys = spline->y_stride;  /* offset in coefs between consecutive y indices */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]  /* coefficient at offset (i,j,k) from (ix,iy,iz); z index is contiguous */
+  *val = (a[0]*(b[0]*(P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3])+
+		b[1]*(P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3])+
+		b[2]*(P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3])+
+		b[3]*(P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]))+
+	  a[1]*(b[0]*(P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3])+
+		b[1]*(P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3])+
+		b[2]*(P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3])+
+		b[3]*(P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]))+
+	  a[2]*(b[0]*(P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3])+
+		b[1]*(P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3])+
+		b[2]*(P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3])+
+		b[3]*(P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]))+
+	  a[3]*(b[0]*(P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3])+
+		b[1]*(P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3])+
+		b[2]*(P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3])+
+		b[3]*(P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3])));
+#undef P
+
+}
+
+/* Value and gradient.
+ *
+ * Evaluates the 3D double-precision spline and its gradient at (x, y, z).
+ *   spline  : supplies x_grid / y_grid / z_grid (start, delta_inv),
+ *             x_stride, y_stride and coefs
+ *   x, y, z : evaluation point, measured in the same coordinates as the grid starts
+ *   val     : output, receives the spline value
+ *   grad    : output, three entries are written:
+ *             grad[0] = d/dx, grad[1] = d/dy, grad[2] = d/dz
+ * The value and the x/y derivatives share partial sums: cP[] contracts the
+ * 4x4x4 coefficient block with c[] along z, bcP[] and dbcP[] then contract
+ * cP[] with b[] and db[] along y.  grad[2] does not reuse those and is
+ * written out as a full triple sum with dc[].
+ * No range check is made on ix, iy or iz.  NOTE(review): the caller
+ * presumably has to keep the point inside the grid -- confirm.
+ */
+inline void
+eval_UBspline_3d_d_vg (UBspline_3d_d * restrict spline, 
+			double x, double y, double z,
+			double* restrict val, double* restrict grad)
+{
+  x -= spline->x_grid.start;  /* measure each coordinate from the start of its grid */
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  double ux = x*spline->x_grid.delta_inv;  /* scale by delta_inv (by its name, the inverse grid spacing) */
+  double uy = y*spline->y_grid.delta_inv;
+  double uz = z*spline->z_grid.delta_inv;
+  double ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modf (ux, &ipartx);  int ix = (int) ipartx;  /* fraction and index; modf gives both the sign of its argument */
+  ty = modf (uy, &iparty);  int iy = (int) iparty; 
+  tz = modf (uz, &ipartz);  int iz = (int) ipartz; 
+  
+  double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4], 
+    cP[16], bcP[4], dbcP[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;  /* powers of tx, highest first */
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;  /* powers of ty, highest first */
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;  /* powers of tz, highest first */
+  double* restrict coefs = spline->coefs;
+
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);  /* a[k]: row k of Ad dotted with tpx */
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);  /* da[k]: entries 1..3 of row k of dAd against {tx^2, tx, 1} */
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+
+  b[0]  = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);  /* b[], db[]: same construction in ty */
+  b[1]  = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+
+  c[0]  = ( Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);  /* c[], dc[]: same construction in tz */
+  c[1]  = ( Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
+  c[2]  = ( Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
+  c[3]  = ( Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
+  dc[0] = (dAd[ 1]*tpz[1] + dAd[ 2]*tpz[2] + dAd[ 3]*tpz[3]);
+  dc[1] = (dAd[ 5]*tpz[1] + dAd[ 6]*tpz[2] + dAd[ 7]*tpz[3]);
+  dc[2] = (dAd[ 9]*tpz[1] + dAd[10]*tpz[2] + dAd[11]*tpz[3]);
+  dc[3] = (dAd[13]*tpz[1] + dAd[14]*tpz[2] + dAd[15]*tpz[3]);
+  
+  int xs = spline->x_stride;  /* offset in coefs between consecutive x indices */
+  int ys = spline->y_stride;  /* offset in coefs between consecutive y indices */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]  /* coefficient at offset (i,j,k) from (ix,iy,iz); z index is contiguous */
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);  /* cP[4*i+j] = sum_k P(i,j,k)*c[k] */
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);  /* bcP[i] = sum_j b[j]*cP[4*i+j] */
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);  /* dbcP[i] = sum_j db[j]*cP[4*i+j] */
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+
+  *val    = ( a[0]*bcP[0] +  a[1]*bcP[1] +  a[2]*bcP[2] +  a[3]*bcP[3]);
+  grad[0] = spline->x_grid.delta_inv *  /* ux = x*delta_inv, so d/dx is delta_inv times the derivative in tx */
+    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = spline->y_grid.delta_inv *  /* same chain-rule factor for y */
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = spline->z_grid.delta_inv *  /* z derivative: explicit triple sum over P with dc[] instead of cached partial sums */
+    (a[0]*(b[0]*(P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3])+
+	   b[1]*(P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3])+
+	   b[2]*(P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3])+
+	   b[3]*(P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]))+
+     a[1]*(b[0]*(P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3])+
+	   b[1]*(P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3])+
+	   b[2]*(P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3])+
+	   b[3]*(P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]))+
+     a[2]*(b[0]*(P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3])+
+	   b[1]*(P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3])+
+	   b[2]*(P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3])+
+	   b[3]*(P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]))+
+     a[3]*(b[0]*(P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3])+
+	   b[1]*(P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3])+
+	   b[2]*(P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3])+
+	   b[3]*(P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3])));
+#undef P
+
+}
+
+
+
+/* Value, gradient, and laplacian */
+/* Evaluate a 3D double-precision uniform B-spline at (x,y,z).
+ *
+ *   spline  : spline object; reads {x,y,z}_grid.start, {x,y,z}_grid.delta_inv,
+ *             x_stride, y_stride and coefs.
+ *   x, y, z : evaluation point.
+ *   val     : receives the spline value.
+ *   grad    : receives 3 entries, one per axis (x, y, z).
+ *   lapl    : receives the sum of the three unmixed second-derivative terms.
+ *
+ * The cell indices derived from x, y, z are used as-is; this routine does
+ * not range-check or clamp them.
+ */
+inline void
+eval_UBspline_3d_d_vgl (UBspline_3d_d * restrict spline, 
+			double x, double y, double z,
+			double* restrict val, double* restrict grad, double* restrict lapl)
+{
+  /* Offsets from the grid start, then scaled by delta_inv into cell units. */
+  const double relx = x - spline->x_grid.start;
+  const double rely = y - spline->y_grid.start;
+  const double relz = z - spline->z_grid.start;
+  const double ux = relx*spline->x_grid.delta_inv;
+  const double uy = rely*spline->y_grid.delta_inv;
+  const double uz = relz*spline->z_grid.delta_inv;
+
+  /* Integer cell index and fractional position within the cell, per axis. */
+  double cellx, celly, cellz;
+  const double tx = modf (ux, &cellx);
+  const double ty = modf (uy, &celly);
+  const double tz = modf (uz, &cellz);
+  const int ix = (int) cellx;
+  const int iy = (int) celly;
+  const int iz = (int) cellz;
+
+  /* Powers of the fractional positions, highest power first. */
+  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
+  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
+  const double tpz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };
+
+  double a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4],
+    cP[16], dcP[16], bcP[4], dbcP[4], d2bcP[4], bdcP[4];
+  double* restrict coefs = spline->coefs;
+
+  /* Per-axis weights (a,b,c) plus the weights built from dAd and d2Ad.
+   * Row n of each table occupies entries 4*n .. 4*n+3. */
+  for (int n = 0; n < 4; n++) {
+    const int r = 4*n;
+    a[n]   = (  Ad[r+0]*tpx[0] +   Ad[r+1]*tpx[1] +  Ad[r+2]*tpx[2] + Ad[r+3]*tpx[3]);
+    da[n]  = ( dAd[r+1]*tpx[1] +  dAd[r+2]*tpx[2] + dAd[r+3]*tpx[3]);
+    d2a[n] = (d2Ad[r+2]*tpx[2] + d2Ad[r+3]*tpx[3]);
+
+    b[n]   = (  Ad[r+0]*tpy[0] +   Ad[r+1]*tpy[1] +  Ad[r+2]*tpy[2] + Ad[r+3]*tpy[3]);
+    db[n]  = ( dAd[r+1]*tpy[1] +  dAd[r+2]*tpy[2] + dAd[r+3]*tpy[3]);
+    d2b[n] = (d2Ad[r+2]*tpy[2] + d2Ad[r+3]*tpy[3]);
+
+    c[n]   = (  Ad[r+0]*tpz[0] +   Ad[r+1]*tpz[1] +  Ad[r+2]*tpz[2] + Ad[r+3]*tpz[3]);
+    dc[n]  = ( dAd[r+1]*tpz[1] +  dAd[r+2]*tpz[2] + dAd[r+3]*tpz[3]);
+    d2c[n] = (d2Ad[r+2]*tpz[2] + d2Ad[r+3]*tpz[3]);
+  }
+
+  const int xs = spline->x_stride;
+  const int ys = spline->y_stride;
+/* P(i,j,k): coefficient at offset (i,j,k) from the cell corner.
+ * P_DOT(i,j,w): the four k-entries of P(i,j,.) dotted with w[0..3].
+ * BP_DOT(i,w): the four P_DOT(i,j,w) values dotted with b[0..3]. */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+#define P_DOT(i,j,w) (P(i,j,0)*(w)[0]+P(i,j,1)*(w)[1]+P(i,j,2)*(w)[2]+P(i,j,3)*(w)[3])
+#define BP_DOT(i,w) (b[0]*P_DOT(i,0,w)+b[1]*P_DOT(i,1,w)+b[2]*P_DOT(i,2,w)+b[3]*P_DOT(i,3,w))
+
+  /* Contract the z index: entry 4*i+j holds the (i,j) column dotted with
+   * c (cP) or with dc (dcP). */
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      cP [4*i+j] = P_DOT(i,j,c);
+      dcP[4*i+j] = P_DOT(i,j,dc);
+    }
+  }
+
+  /* Contract the y index against b, db or d2b. */
+  for (int n = 0; n < 4; n++) {
+    const int r = 4*n;
+    bcP[n]   = (   b[0]*cP[r+0] +   b[1]*cP[r+1] +   b[2]*cP[r+2] +   b[3]*cP[r+3]);
+    dbcP[n]  = (  db[0]*cP[r+0] +  db[1]*cP[r+1] +  db[2]*cP[r+2] +  db[3]*cP[r+3]);
+    bdcP[n]  = (   b[0]*dcP[r+0] +  b[1]*dcP[r+1] +  b[2]*dcP[r+2] +  b[3]*dcP[r+3]);
+    d2bcP[n] = ( d2b[0]*cP[r+0] + d2b[1]*cP[r+1] + d2b[2]*cP[r+2] + d2b[3]*cP[r+3]);
+  }
+
+  /* Final contraction over the x index. */
+  *val    =
+    ( a[0]*bcP[0] +  a[1]*bcP[1] +  a[2]*bcP[2] +  a[3]*bcP[3]);
+
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = spline->y_grid.delta_inv *
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = spline->z_grid.delta_inv *
+    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
+
+  /* The z term is evaluated directly from the coefficients via BP_DOT
+   * rather than through a stored intermediate array. */
+  *lapl =
+    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3])
+    + spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3])
+    + spline->z_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*BP_DOT(0,d2c) + a[1]*BP_DOT(1,d2c) + a[2]*BP_DOT(2,d2c) + a[3]*BP_DOT(3,d2c));
+#undef BP_DOT
+#undef P_DOT
+#undef P
+
+}
+
+
+
+
+
+/* Value, gradient, and Hessian */
+/* Evaluate a 3D double-precision uniform B-spline at (x,y,z).
+ *
+ *   spline  : spline object; reads {x,y,z}_grid.start, .delta_inv and .num,
+ *             x_stride, y_stride and coefs.
+ *   x, y, z : evaluation point.
+ *   val     : receives the spline value.
+ *   grad    : receives 3 entries, one per axis (x, y, z).
+ *   hess    : receives 9 entries, a symmetric 3x3 matrix stored row by row:
+ *               [0]=xx [1]=xy [2]=xz
+ *               [3]=xy [4]=yy [5]=yz
+ *               [6]=xz [7]=yz [8]=zz
+ *
+ * Each scaled coordinate is clamped from above to (grid.num - 1e-5) before
+ * it is split into a cell index and a fraction.  There is no matching clamp
+ * from below in this routine.
+ * NOTE(review): callers presumably guarantee x,y,z >= grid start -- confirm.
+ */
+inline void
+eval_UBspline_3d_d_vgh (UBspline_3d_d * restrict spline, 
+			double x, double y, double z,
+			double* restrict val, double* restrict grad, double* restrict hess)
+{
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+
+  /* Position in cell units, clamped from above only. */
+  const double ux = fmin (x*spline->x_grid.delta_inv, (double)(spline->x_grid.num)-1.0e-5);
+  const double uy = fmin (y*spline->y_grid.delta_inv, (double)(spline->y_grid.num)-1.0e-5);
+  const double uz = fmin (z*spline->z_grid.delta_inv, (double)(spline->z_grid.num)-1.0e-5);
+
+  /* Integer cell index and fractional position within the cell, per axis. */
+  double ipartx, iparty, ipartz;
+  const double tx = modf (ux, &ipartx);
+  const double ty = modf (uy, &iparty);
+  const double tz = modf (uz, &ipartz);
+  const int ix = (int) ipartx;
+  const int iy = (int) iparty;
+  const int iz = (int) ipartz;
+
+  /* Powers of the fractional positions, highest power first. */
+  const double tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0 };
+  const double tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0 };
+  const double tpz[4] = { tz*tz*tz, tz*tz, tz, 1.0 };
+
+  double a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4],
+    cP[16], dcP[16], d2cP[16], bcP[4], dbcP[4], d2bcP[4], dbdcP[4],
+    bd2cP[4], bdcP[4];
+  double* restrict coefs = spline->coefs;
+
+  /* Per-axis weights (a,b,c) plus the weights built from dAd and d2Ad.
+   * Row n of each table occupies entries 4*n .. 4*n+3.  All loops below
+   * have a constant trip count of 4. */
+  for (int n = 0; n < 4; n++) {
+    const int r = 4*n;
+    a[n]   = (  Ad[r+0]*tpx[0] +   Ad[r+1]*tpx[1] +  Ad[r+2]*tpx[2] + Ad[r+3]*tpx[3]);
+    da[n]  = ( dAd[r+1]*tpx[1] +  dAd[r+2]*tpx[2] + dAd[r+3]*tpx[3]);
+    d2a[n] = (d2Ad[r+2]*tpx[2] + d2Ad[r+3]*tpx[3]);
+
+    b[n]   = (  Ad[r+0]*tpy[0] +   Ad[r+1]*tpy[1] +  Ad[r+2]*tpy[2] + Ad[r+3]*tpy[3]);
+    db[n]  = ( dAd[r+1]*tpy[1] +  dAd[r+2]*tpy[2] + dAd[r+3]*tpy[3]);
+    d2b[n] = (d2Ad[r+2]*tpy[2] + d2Ad[r+3]*tpy[3]);
+
+    c[n]   = (  Ad[r+0]*tpz[0] +   Ad[r+1]*tpz[1] +  Ad[r+2]*tpz[2] + Ad[r+3]*tpz[3]);
+    dc[n]  = ( dAd[r+1]*tpz[1] +  dAd[r+2]*tpz[2] + dAd[r+3]*tpz[3]);
+    d2c[n] = (d2Ad[r+2]*tpz[2] + d2Ad[r+3]*tpz[3]);
+  }
+
+  const int xs = spline->x_stride;
+  const int ys = spline->y_stride;
+/* P(i,j,k): coefficient at offset (i,j,k) from the cell corner; the largest
+ * offset touched is (ix+3)*xs + (iy+3)*ys + iz+3.
+ * P_DOT(i,j,w): the four k-entries of P(i,j,.) dotted with w[0..3]. */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+#define P_DOT(i,j,w) (P(i,j,0)*(w)[0]+P(i,j,1)*(w)[1]+P(i,j,2)*(w)[2]+P(i,j,3)*(w)[3])
+
+  /* Contract the z index against c, dc and d2c; entry 4*i+j holds (i,j). */
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      cP  [4*i+j] = P_DOT(i,j,c);
+      dcP [4*i+j] = P_DOT(i,j,dc);
+      d2cP[4*i+j] = P_DOT(i,j,d2c);
+    }
+  }
+
+  /* Contract the y index against b, db or d2b. */
+  for (int n = 0; n < 4; n++) {
+    const int r = 4*n;
+    bcP[n]   = (   b[0]*cP[r+0]   +   b[1]*cP[r+1]   +   b[2]*cP[r+2]   +   b[3]*cP[r+3]);
+    dbcP[n]  = (  db[0]*cP[r+0]   +  db[1]*cP[r+1]   +  db[2]*cP[r+2]   +  db[3]*cP[r+3]);
+    bdcP[n]  = (   b[0]*dcP[r+0]  +   b[1]*dcP[r+1]  +   b[2]*dcP[r+2]  +   b[3]*dcP[r+3]);
+    bd2cP[n] = (   b[0]*d2cP[r+0] +   b[1]*d2cP[r+1] +   b[2]*d2cP[r+2] +   b[3]*d2cP[r+3]);
+    d2bcP[n] = ( d2b[0]*cP[r+0]   + d2b[1]*cP[r+1]   + d2b[2]*cP[r+2]   + d2b[3]*cP[r+3]);
+    dbdcP[n] = (  db[0]*dcP[r+0]  +  db[1]*dcP[r+1]  +  db[2]*dcP[r+2]  +  db[3]*dcP[r+3]);
+  }
+
+  /* Final contraction over the x index. */
+  *val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = spline->y_grid.delta_inv *
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = spline->z_grid.delta_inv *
+    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
+  /* d2x */
+  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
+  /* dx dy (mirrored into hess[3]) */
+  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
+    (da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
+  hess[3] = hess[1];
+  /* dx dz (mirrored into hess[6]) */
+  hess[2] = spline->x_grid.delta_inv * spline->z_grid.delta_inv *
+    (da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
+  hess[6] = hess[2];
+  /* d2y */
+  hess[4] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
+  /* dy dz (mirrored into hess[7]) */
+  hess[5] = spline->y_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
+  hess[7] = hess[5];
+  /* d2z */
+  hess[8] = spline->z_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
+#undef P_DOT
+#undef P
+
+}
+
+#endif
--- a/src/einspline/bspline_eval_std_s.h
+++ b/src/einspline/bspline_eval_std_s.h
@ -0,0 +1,931 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_EVAL_STD_S_H
+#define BSPLINE_EVAL_STD_S_H
+
+#include <math.h>
+#include <stdio.h>
+
+/* Flat coefficient tables used by the single-precision evaluators in this
+ * header; they are only declared here.  The routines below read each table
+ * as four rows of four entries, pairing entry [4*k+p] with the p-th element
+ * of {t^3, t^2, t, 1}.  Af feeds the value weights, dAf the first-derivative
+ * weights and d2Af the second-derivative weights. */
+extern const float* restrict   Af;
+extern const float* restrict  dAf;
+extern const float* restrict d2Af;
+
+/************************************************************/
+/* 1D single-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only */
+/* Evaluate a 1D single-precision uniform B-spline at x.
+ *
+ *   spline : spline object; reads x_grid.start, x_grid.delta_inv and coefs.
+ *   x      : evaluation point.
+ *   val    : receives the spline value.
+ *
+ * The cell index derived from x is used as-is to address four consecutive
+ * coefficients; this routine does not range-check it.
+ */
+inline void
+eval_UBspline_1d_s (UBspline_1d_s * restrict spline, 
+		    double x, float* restrict val)
+{
+  /* Offset from the grid start, then scaled by delta_inv into cell units. */
+  const double offset = x - spline->x_grid.start;
+  const float  u      = offset*spline->x_grid.delta_inv;
+
+  /* Integer cell index and fractional position within the cell. */
+  float cell;
+  const float frac = modff (u, &cell);
+  const int   i0   = (int) cell;
+
+  /* Powers of the fractional position, highest first (t0 is frac^0). */
+  const float t3 = frac*frac*frac;
+  const float t2 = frac*frac;
+  const float t1 = frac;
+  const float t0 = 1.0;
+  float* restrict coefs = spline->coefs;
+
+  /* Each coefficient is weighted by a cubic in frac taken from one row of Af. */
+  *val =
+    (coefs[i0+0]*(Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0)+
+     coefs[i0+1]*(Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0)+
+     coefs[i0+2]*(Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0)+
+     coefs[i0+3]*(Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0));
+}
+
+/* Value and first derivative */
+/* Evaluate a 1D single-precision uniform B-spline and its first derivative.
+ *
+ *   spline : spline object; reads x_grid.start, x_grid.delta_inv and coefs.
+ *   x      : evaluation point.
+ *   val    : receives the spline value.
+ *   grad   : receives the derivative term, scaled by x_grid.delta_inv.
+ *
+ * The cell index derived from x is not range-checked here.
+ */
+inline void
+eval_UBspline_1d_s_vg (UBspline_1d_s * restrict spline, double x, 
+		     float* restrict val, float* restrict grad)
+{
+  /* Offset from the grid start, then scaled by delta_inv into cell units. */
+  const double offset = x - spline->x_grid.start;
+  const float  u      = offset*spline->x_grid.delta_inv;
+
+  /* Integer cell index and fractional position within the cell. */
+  float cell;
+  const float frac = modff (u, &cell);
+  const int   i0   = (int) cell;
+
+  /* Powers of the fractional position, highest first (t0 is frac^0). */
+  const float t3 = frac*frac*frac;
+  const float t2 = frac*frac;
+  const float t1 = frac;
+  const float t0 = 1.0;
+  float* restrict coefs = spline->coefs;
+
+  *val =
+    (coefs[i0+0]*(Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0)+
+     coefs[i0+1]*(Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0)+
+     coefs[i0+2]*(Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0)+
+     coefs[i0+3]*(Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0));
+
+  /* The derivative weights use only the last three entries of each dAf row. */
+  *grad = spline->x_grid.delta_inv *
+    (coefs[i0+0]*(dAf[ 1]*t2 + dAf[ 2]*t1 + dAf[ 3]*t0)+
+     coefs[i0+1]*(dAf[ 5]*t2 + dAf[ 6]*t1 + dAf[ 7]*t0)+
+     coefs[i0+2]*(dAf[ 9]*t2 + dAf[10]*t1 + dAf[11]*t0)+
+     coefs[i0+3]*(dAf[13]*t2 + dAf[14]*t1 + dAf[15]*t0));
+}
+/* Value, first derivative, and second derivative */
+/* Evaluate a 1D single-precision uniform B-spline with two derivatives.
+ *
+ *   spline : spline object; reads x_grid.start, x_grid.delta_inv and coefs.
+ *   x      : evaluation point.
+ *   val    : receives the spline value.
+ *   grad   : receives the first-derivative term, scaled by delta_inv.
+ *   lapl   : receives the second-derivative term, scaled by delta_inv twice.
+ *
+ * The cell index derived from x is not range-checked here.
+ */
+inline void
+eval_UBspline_1d_s_vgl (UBspline_1d_s * restrict spline, double x, 
+			float* restrict val, float* restrict grad,
+			float* restrict lapl)
+{
+  /* Offset from the grid start, then scaled by delta_inv into cell units. */
+  const double offset = x - spline->x_grid.start;
+  const float  u      = offset*spline->x_grid.delta_inv;
+
+  /* Integer cell index and fractional position within the cell. */
+  float cell;
+  const float frac = modff (u, &cell);
+  const int   i0   = (int) cell;
+
+  /* Powers of the fractional position, highest first (t0 is frac^0). */
+  const float t3 = frac*frac*frac;
+  const float t2 = frac*frac;
+  const float t1 = frac;
+  const float t0 = 1.0;
+  float* restrict coefs = spline->coefs;
+
+  *val =
+    (coefs[i0+0]*(Af[ 0]*t3 + Af[ 1]*t2 + Af[ 2]*t1 + Af[ 3]*t0)+
+     coefs[i0+1]*(Af[ 4]*t3 + Af[ 5]*t2 + Af[ 6]*t1 + Af[ 7]*t0)+
+     coefs[i0+2]*(Af[ 8]*t3 + Af[ 9]*t2 + Af[10]*t1 + Af[11]*t0)+
+     coefs[i0+3]*(Af[12]*t3 + Af[13]*t2 + Af[14]*t1 + Af[15]*t0));
+
+  /* First-derivative weights: last three entries of each dAf row. */
+  *grad = spline->x_grid.delta_inv *
+    (coefs[i0+0]*(dAf[ 1]*t2 + dAf[ 2]*t1 + dAf[ 3]*t0)+
+     coefs[i0+1]*(dAf[ 5]*t2 + dAf[ 6]*t1 + dAf[ 7]*t0)+
+     coefs[i0+2]*(dAf[ 9]*t2 + dAf[10]*t1 + dAf[11]*t0)+
+     coefs[i0+3]*(dAf[13]*t2 + dAf[14]*t1 + dAf[15]*t0));
+
+  /* Second-derivative weights: last two entries of each d2Af row. */
+  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (coefs[i0+0]*(d2Af[ 2]*t1 + d2Af[ 3]*t0)+
+     coefs[i0+1]*(d2Af[ 6]*t1 + d2Af[ 7]*t0)+
+     coefs[i0+2]*(d2Af[10]*t1 + d2Af[11]*t0)+
+     coefs[i0+3]*(d2Af[14]*t1 + d2Af[15]*t0));
+}
+
+/* Value, first derivative, and "Hessian" for the 1D single-precision spline.
+ * This is a pure forwarder to eval_UBspline_1d_s_vgl: hess is passed in the
+ * position of that routine's lapl argument, so *hess receives exactly what
+ * the vgl routine would store in *lapl. */
+inline void
+eval_UBspline_1d_s_vgh (UBspline_1d_s * restrict spline, double x, 
+			float* restrict val, float* restrict grad,
+			float* restrict hess)
+{
+  eval_UBspline_1d_s_vgl (spline, x, val, grad, hess);
+}
+
+/************************************************************/
+/* 2D single-precision, real evaluation functions           */
+/************************************************************/
+
+/* Value only */
+/* Evaluate a 2D single-precision uniform B-spline at (x,y).
+ *
+ *   spline : spline object; reads {x,y}_grid.start, {x,y}_grid.delta_inv,
+ *            x_stride and coefs.
+ *   x, y   : evaluation point.
+ *   val    : receives the spline value.
+ *
+ * Coefficients are addressed as coefs[(ix+i)*x_stride + iy + j] for
+ * i,j in 0..3; the cell indices are not range-checked here.
+ */
+inline void
+eval_UBspline_2d_s (UBspline_2d_s * restrict spline, 
+		    double x, double y, float* restrict val)
+{
+  /* Offsets from the grid start, then scaled by delta_inv into cell units. */
+  const double relx = x - spline->x_grid.start;
+  const double rely = y - spline->y_grid.start;
+  const float ux = relx*spline->x_grid.delta_inv;
+  const float uy = rely*spline->y_grid.delta_inv;
+
+  /* Integer cell index and fractional position within the cell, per axis. */
+  float cellx, celly;
+  const float tx = modff (ux, &cellx);
+  const float ty = modff (uy, &celly);
+  const int ix = (int) cellx;
+  const int iy = (int) celly;
+
+  /* Powers of the fractional positions, highest power first. */
+  const float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
+  const float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0f };
+  float a[4], b[4];
+  float* restrict coefs = spline->coefs;
+
+  /* Per-axis weights; row k of Af occupies entries 4*k .. 4*k+3. */
+  for (int k = 0; k < 4; k++) {
+    const int r = 4*k;
+    a[k] = (Af[r+0]*tpx[0] + Af[r+1]*tpx[1] + Af[r+2]*tpx[2] + Af[r+3]*tpx[3]);
+    b[k] = (Af[r+0]*tpy[0] + Af[r+1]*tpy[1] + Af[r+2]*tpy[2] + Af[r+3]*tpy[3]);
+  }
+
+  const int xs = spline->x_stride;
+/* C(i,j): coefficient at offset (i,j) from the cell corner.
+ * C_ROW_DOT(i,w): row i of that 4x4 patch dotted with w[0..3]. */
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+#define C_ROW_DOT(i,w) (C(i,0)*(w)[0]+C(i,1)*(w)[1]+C(i,2)*(w)[2]+C(i,3)*(w)[3])
+  *val = (a[0]*C_ROW_DOT(0,b)+
+	  a[1]*C_ROW_DOT(1,b)+
+	  a[2]*C_ROW_DOT(2,b)+
+	  a[3]*C_ROW_DOT(3,b));
+#undef C_ROW_DOT
+#undef C
+
+}
+
+
+/* Value and gradient */
+/* Evaluate a 2D single-precision uniform B-spline and its gradient at (x,y).
+ *
+ *   spline : spline object; reads {x,y}_grid.start, {x,y}_grid.delta_inv,
+ *            x_stride and coefs.
+ *   x, y   : evaluation point.
+ *   val    : receives the spline value.
+ *   grad   : receives 2 entries: the x term scaled by x_grid.delta_inv and
+ *            the y term scaled by y_grid.delta_inv.
+ *
+ * Coefficients are addressed as coefs[(ix+i)*x_stride + iy + j] for
+ * i,j in 0..3; the cell indices are not range-checked here.
+ */
+inline void
+eval_UBspline_2d_s_vg (UBspline_2d_s * restrict spline, 
+		       double x, double y, 
+		       float* restrict val, float* restrict grad)
+{
+  /* Offsets from the grid start, then scaled by delta_inv into cell units. */
+  const double relx = x - spline->x_grid.start;
+  const double rely = y - spline->y_grid.start;
+  const float ux = relx*spline->x_grid.delta_inv;
+  const float uy = rely*spline->y_grid.delta_inv;
+
+  /* Integer cell index and fractional position within the cell, per axis. */
+  float cellx, celly;
+  const float tx = modff (ux, &cellx);
+  const float ty = modff (uy, &celly);
+  const int ix = (int) cellx;
+  const int iy = (int) celly;
+
+  /* Powers of the fractional positions, highest power first. */
+  const float tpx[4] = { tx*tx*tx, tx*tx, tx, 1.0f };
+  const float tpy[4] = { ty*ty*ty, ty*ty, ty, 1.0f };
+  float a[4], b[4], da[4], db[4];
+  float* restrict coefs = spline->coefs;
+
+  /* Value weights from Af, derivative weights from the last three entries
+   * of each dAf row. */
+  for (int k = 0; k < 4; k++) {
+    const int r = 4*k;
+    a[k]  = (Af[r+0]*tpx[0] + Af[r+1]*tpx[1] + Af[r+2]*tpx[2] + Af[r+3]*tpx[3]);
+    da[k] = (dAf[r+1]*tpx[1] + dAf[r+2]*tpx[2] + dAf[r+3]*tpx[3]);
+    b[k]  = (Af[r+0]*tpy[0] + Af[r+1]*tpy[1] + Af[r+2]*tpy[2] + Af[r+3]*tpy[3]);
+    db[k] = (dAf[r+1]*tpy[1] + dAf[r+2]*tpy[2] + dAf[r+3]*tpy[3]);
+  }
+
+  const int xs = spline->x_stride;
+/* C(i,j): coefficient at offset (i,j) from the cell corner.
+ * C_ROW_DOT(i,w): row i of that 4x4 patch dotted with w[0..3]. */
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+#define C_ROW_DOT(i,w) (C(i,0)*(w)[0]+C(i,1)*(w)[1]+C(i,2)*(w)[2]+C(i,3)*(w)[3])
+  *val =
+    (a[0]*C_ROW_DOT(0,b)+
+     a[1]*C_ROW_DOT(1,b)+
+     a[2]*C_ROW_DOT(2,b)+
+     a[3]*C_ROW_DOT(3,b));
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0]*C_ROW_DOT(0,b)+
+     da[1]*C_ROW_DOT(1,b)+
+     da[2]*C_ROW_DOT(2,b)+
+     da[3]*C_ROW_DOT(3,b));
+  grad[1] = spline->y_grid.delta_inv *
+    (a[0]*C_ROW_DOT(0,db)+
+     a[1]*C_ROW_DOT(1,db)+
+     a[2]*C_ROW_DOT(2,db)+
+     a[3]*C_ROW_DOT(3,db));
+#undef C_ROW_DOT
+#undef C
+
+}
+
+/*
+ * Value, gradient, and Laplacian of a 2D single-precision spline.
+ *
+ * The point (x, y) is shifted by the grid start and scaled by delta_inv on
+ * each axis, then split with modff into an integral part (index of the
+ * first coefficient used) and a fractional part.  Basis weights built from
+ * the Af, dAf and d2Af tables (defined outside this function; presumably
+ * the cubic B-spline basis matrix and its first and second derivatives)
+ * are contracted with a 4x4 patch of spline->coefs.
+ *
+ *   spline  spline to evaluate (x_grid, y_grid, x_stride, coefs are read)
+ *   x, y    evaluation point
+ *   val     receives the spline value
+ *   grad    receives d/dx in grad[0] and d/dy in grad[1]
+ *   lapl    receives d2/dx2 + d2/dy2
+ *
+ * No range check is done here: coefficient rows i0..i0+3 and columns
+ * j0..j0+3 are read for whatever indices the point produces.
+ */
+inline void
+eval_UBspline_2d_s_vgl (UBspline_2d_s * restrict spline,
+                        double x, double y, float* restrict val,
+                        float* restrict grad, float* restrict lapl)
+{
+  /* Offset from the grid start, scaled by delta_inv, along each axis. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  float gx = x*spline->x_grid.delta_inv;
+  float gy = y*spline->y_grid.delta_inv;
+
+  /* Integral part -> first coefficient index, fractional part -> basis
+     argument. */
+  float wholex, wholey;
+  float fx = modff (gx, &wholex);
+  float fy = modff (gy, &wholey);
+  int i0 = (int) wholex;
+  int j0 = (int) wholey;
+
+  /* Powers of the fractional parts, highest first: {t^3, t^2, t, 1}. */
+  float px[4] = { fx*fx*fx, fx*fx, fx, 1.0f };
+  float py[4] = { fy*fy*fy, fy*fy, fy, 1.0f };
+
+  /* Row r of a 16-entry table M applied to a power vector p.  UBS_B3 leaves
+     out the t^3 column and UBS_B2 the t^3 and t^2 columns, so those table
+     entries are never read. */
+#define UBS_B4(M,r,p) ((M)[4*(r)]*(p)[0]+(M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+#define UBS_B3(M,r,p) ((M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+#define UBS_B2(M,r,p) ((M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+
+  /* Basis weights and their first/second derivative weights per axis. */
+  float wx[4], dwx[4], d2wx[4];
+  float wy[4], dwy[4], d2wy[4];
+  wx[0]   = UBS_B4(Af,0,px);     wy[0]   = UBS_B4(Af,0,py);
+  wx[1]   = UBS_B4(Af,1,px);     wy[1]   = UBS_B4(Af,1,py);
+  wx[2]   = UBS_B4(Af,2,px);     wy[2]   = UBS_B4(Af,2,py);
+  wx[3]   = UBS_B4(Af,3,px);     wy[3]   = UBS_B4(Af,3,py);
+  dwx[0]  = UBS_B3(dAf,0,px);    dwy[0]  = UBS_B3(dAf,0,py);
+  dwx[1]  = UBS_B3(dAf,1,px);    dwy[1]  = UBS_B3(dAf,1,py);
+  dwx[2]  = UBS_B3(dAf,2,px);    dwy[2]  = UBS_B3(dAf,2,py);
+  dwx[3]  = UBS_B3(dAf,3,px);    dwy[3]  = UBS_B3(dAf,3,py);
+  d2wx[0] = UBS_B2(d2Af,0,px);   d2wy[0] = UBS_B2(d2Af,0,py);
+  d2wx[1] = UBS_B2(d2Af,1,px);   d2wy[1] = UBS_B2(d2Af,1,py);
+  d2wx[2] = UBS_B2(d2Af,2,px);   d2wy[2] = UBS_B2(d2Af,2,py);
+  d2wx[3] = UBS_B2(d2Af,3,px);   d2wy[3] = UBS_B2(d2Af,3,py);
+
+  float* restrict cf = spline->coefs;
+  int row = spline->x_stride;
+
+  /* UBS_C(i,j): entry (i,j) of the 4x4 coefficient patch.
+     UBS_ROW(i,w): patch row i dotted with the y-direction vector w.
+     UBS_SUM(u,w): x-direction vector u applied to the four row results. */
+#define UBS_C(i,j) cf[(i0+(i))*row+j0+(j)]
+#define UBS_ROW(i,w) (UBS_C(i,0)*(w)[0]+UBS_C(i,1)*(w)[1]+UBS_C(i,2)*(w)[2]+UBS_C(i,3)*(w)[3])
+#define UBS_SUM(u,w) ((u)[0]*UBS_ROW(0,w)+(u)[1]*UBS_ROW(1,w)+(u)[2]*UBS_ROW(2,w)+(u)[3]*UBS_ROW(3,w))
+
+  *val    = UBS_SUM(wx, wy);
+
+  /* One factor of delta_inv per derivative taken along that axis. */
+  grad[0] = spline->x_grid.delta_inv * UBS_SUM(dwx, wy);
+  grad[1] = spline->y_grid.delta_inv * UBS_SUM(wx, dwy);
+
+  /* y second-derivative term first, then the x term. */
+  *lapl   =
+    spline->y_grid.delta_inv * spline->y_grid.delta_inv * UBS_SUM(wx, d2wy) +
+    spline->x_grid.delta_inv * spline->x_grid.delta_inv * UBS_SUM(d2wx, wy);
+
+#undef UBS_SUM
+#undef UBS_ROW
+#undef UBS_C
+#undef UBS_B2
+#undef UBS_B3
+#undef UBS_B4
+
+}
+
+/*
+ * Value, gradient, and Hessian of a 2D single-precision spline.
+ *
+ * The point (x, y) is shifted by the grid start and scaled by delta_inv on
+ * each axis, then split with modff into an integral part (index of the
+ * first coefficient used) and a fractional part.  Basis weights built from
+ * the Af, dAf and d2Af tables (defined outside this function; presumably
+ * the cubic B-spline basis matrix and its first and second derivatives)
+ * are contracted with a 4x4 patch of spline->coefs.
+ *
+ *   spline  spline to evaluate (x_grid, y_grid, x_stride, coefs are read)
+ *   x, y    evaluation point
+ *   val     receives the spline value
+ *   grad    receives d/dx in grad[0] and d/dy in grad[1]
+ *   hess    receives four entries: hess[0] = d2/dx2,
+ *           hess[1] = hess[2] = d2/dxdy, hess[3] = d2/dy2
+ *
+ * No range check is done here: coefficient rows i0..i0+3 and columns
+ * j0..j0+3 are read for whatever indices the point produces.
+ */
+inline void
+eval_UBspline_2d_s_vgh (UBspline_2d_s * restrict spline,
+                        double x, double y, float* restrict val,
+                        float* restrict grad, float* restrict hess)
+{
+  /* Offset from the grid start, scaled by delta_inv, along each axis. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  float gx = x*spline->x_grid.delta_inv;
+  float gy = y*spline->y_grid.delta_inv;
+
+  /* Integral part -> first coefficient index, fractional part -> basis
+     argument. */
+  float wholex, wholey;
+  float fx = modff (gx, &wholex);
+  float fy = modff (gy, &wholey);
+  int i0 = (int) wholex;
+  int j0 = (int) wholey;
+
+  /* Powers of the fractional parts, highest first: {t^3, t^2, t, 1}. */
+  float px[4] = { fx*fx*fx, fx*fx, fx, 1.0f };
+  float py[4] = { fy*fy*fy, fy*fy, fy, 1.0f };
+
+  /* Row r of a 16-entry table M applied to a power vector p.  UBS_B3 leaves
+     out the t^3 column and UBS_B2 the t^3 and t^2 columns, so those table
+     entries are never read. */
+#define UBS_B4(M,r,p) ((M)[4*(r)]*(p)[0]+(M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+#define UBS_B3(M,r,p) ((M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+#define UBS_B2(M,r,p) ((M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+
+  /* Basis weights and their first/second derivative weights per axis. */
+  float wx[4], dwx[4], d2wx[4];
+  float wy[4], dwy[4], d2wy[4];
+  wx[0]   = UBS_B4(Af,0,px);     wy[0]   = UBS_B4(Af,0,py);
+  wx[1]   = UBS_B4(Af,1,px);     wy[1]   = UBS_B4(Af,1,py);
+  wx[2]   = UBS_B4(Af,2,px);     wy[2]   = UBS_B4(Af,2,py);
+  wx[3]   = UBS_B4(Af,3,px);     wy[3]   = UBS_B4(Af,3,py);
+  dwx[0]  = UBS_B3(dAf,0,px);    dwy[0]  = UBS_B3(dAf,0,py);
+  dwx[1]  = UBS_B3(dAf,1,px);    dwy[1]  = UBS_B3(dAf,1,py);
+  dwx[2]  = UBS_B3(dAf,2,px);    dwy[2]  = UBS_B3(dAf,2,py);
+  dwx[3]  = UBS_B3(dAf,3,px);    dwy[3]  = UBS_B3(dAf,3,py);
+  d2wx[0] = UBS_B2(d2Af,0,px);   d2wy[0] = UBS_B2(d2Af,0,py);
+  d2wx[1] = UBS_B2(d2Af,1,px);   d2wy[1] = UBS_B2(d2Af,1,py);
+  d2wx[2] = UBS_B2(d2Af,2,px);   d2wy[2] = UBS_B2(d2Af,2,py);
+  d2wx[3] = UBS_B2(d2Af,3,px);   d2wy[3] = UBS_B2(d2Af,3,py);
+
+  float* restrict cf = spline->coefs;
+  int row = spline->x_stride;
+
+  /* UBS_C(i,j): entry (i,j) of the 4x4 coefficient patch.
+     UBS_ROW(i,w): patch row i dotted with the y-direction vector w.
+     UBS_SUM(u,w): x-direction vector u applied to the four row results. */
+#define UBS_C(i,j) cf[(i0+(i))*row+j0+(j)]
+#define UBS_ROW(i,w) (UBS_C(i,0)*(w)[0]+UBS_C(i,1)*(w)[1]+UBS_C(i,2)*(w)[2]+UBS_C(i,3)*(w)[3])
+#define UBS_SUM(u,w) ((u)[0]*UBS_ROW(0,w)+(u)[1]*UBS_ROW(1,w)+(u)[2]*UBS_ROW(2,w)+(u)[3]*UBS_ROW(3,w))
+
+  *val    = UBS_SUM(wx, wy);
+
+  /* One factor of delta_inv per derivative taken along that axis. */
+  grad[0] = spline->x_grid.delta_inv * UBS_SUM(dwx, wy);
+  grad[1] = spline->y_grid.delta_inv * UBS_SUM(wx, dwy);
+
+  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv * UBS_SUM(d2wx, wy);
+  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv * UBS_SUM(dwx, dwy);
+  hess[3] = spline->y_grid.delta_inv * spline->y_grid.delta_inv * UBS_SUM(wx, d2wy);
+  /* The mixed derivative is stored twice; copy rather than recompute. */
+  hess[2] = hess[1];
+
+#undef UBS_SUM
+#undef UBS_ROW
+#undef UBS_C
+#undef UBS_B2
+#undef UBS_B3
+#undef UBS_B4
+
+}
+
+
+/************************************************************/
+/* 3D single-precision, real evaluation functions           */
+/************************************************************/
+
+/*
+ * Value of a 3D single-precision spline.
+ *
+ * The point (x, y, z) is shifted by the grid start and scaled by delta_inv
+ * on each axis, then split with modff into an integral part (index of the
+ * first coefficient used) and a fractional part.  Basis weights built from
+ * the Af table (defined outside this function; presumably the cubic
+ * B-spline basis matrix) are contracted with a 4x4x4 block of
+ * spline->coefs.
+ *
+ *   spline   spline to evaluate (grids, x_stride, y_stride, coefs are read)
+ *   x, y, z  evaluation point
+ *   val      receives the spline value
+ *
+ * No range check is done here: the block starting at (i0, j0, k0) is read
+ * for whatever indices the point produces.
+ */
+inline void
+eval_UBspline_3d_s (UBspline_3d_s * restrict spline,
+                    double x, double y, double z,
+                    float* restrict val)
+{
+  /* Offset from the grid start, scaled by delta_inv, along each axis. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  float gx = x*spline->x_grid.delta_inv;
+  float gy = y*spline->y_grid.delta_inv;
+  float gz = z*spline->z_grid.delta_inv;
+
+  /* Integral part -> first coefficient index, fractional part -> basis
+     argument. */
+  float wholex, wholey, wholez;
+  float fx = modff (gx, &wholex);
+  float fy = modff (gy, &wholey);
+  float fz = modff (gz, &wholez);
+  int i0 = (int) wholex;
+  int j0 = (int) wholey;
+  int k0 = (int) wholez;
+
+  /* Powers of the fractional parts, highest first: {t^3, t^2, t, 1}. */
+  float px[4] = { fx*fx*fx, fx*fx, fx, 1.0f };
+  float py[4] = { fy*fy*fy, fy*fy, fy, 1.0f };
+  float pz[4] = { fz*fz*fz, fz*fz, fz, 1.0f };
+
+  /* Row r of the 16-entry table M applied to a power vector p. */
+#define UBS_B4(M,r,p) ((M)[4*(r)]*(p)[0]+(M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+
+  float wx[4], wy[4], wz[4];
+  wx[0] = UBS_B4(Af,0,px);   wy[0] = UBS_B4(Af,0,py);   wz[0] = UBS_B4(Af,0,pz);
+  wx[1] = UBS_B4(Af,1,px);   wy[1] = UBS_B4(Af,1,py);   wz[1] = UBS_B4(Af,1,pz);
+  wx[2] = UBS_B4(Af,2,px);   wy[2] = UBS_B4(Af,2,py);   wz[2] = UBS_B4(Af,2,pz);
+  wx[3] = UBS_B4(Af,3,px);   wy[3] = UBS_B4(Af,3,py);   wz[3] = UBS_B4(Af,3,pz);
+
+  float* restrict cf = spline->coefs;
+  int xstr = spline->x_stride;
+  int ystr = spline->y_stride;
+
+  /* UBS_P(i,j,k): entry (i,j,k) of the 4x4x4 block; z is the unit-stride
+     index.
+     UBS_LINE(i,j,w): the four z entries at (i,j) dotted with w.
+     UBS_SLAB(i,v,w): y-direction vector v applied to the four lines of
+     x-slab i. */
+#define UBS_P(i,j,k) cf[(i0+(i))*xstr+(j0+(j))*ystr+(k0+(k))]
+#define UBS_LINE(i,j,w) (UBS_P(i,j,0)*(w)[0]+UBS_P(i,j,1)*(w)[1]+UBS_P(i,j,2)*(w)[2]+UBS_P(i,j,3)*(w)[3])
+#define UBS_SLAB(i,v,w) ((v)[0]*UBS_LINE(i,0,w)+(v)[1]*UBS_LINE(i,1,w)+(v)[2]*UBS_LINE(i,2,w)+(v)[3]*UBS_LINE(i,3,w))
+
+  *val = (wx[0]*UBS_SLAB(0,wy,wz) + wx[1]*UBS_SLAB(1,wy,wz) +
+          wx[2]*UBS_SLAB(2,wy,wz) + wx[3]*UBS_SLAB(3,wy,wz));
+
+#undef UBS_SLAB
+#undef UBS_LINE
+#undef UBS_P
+#undef UBS_B4
+
+}
+
+/*
+ * Value and gradient of a 3D single-precision spline.
+ *
+ * The point (x, y, z) is shifted by the grid start and scaled by delta_inv
+ * on each axis, then split with modff into an integral part (index of the
+ * first coefficient used) and a fractional part.  Basis weights built from
+ * the Af and dAf tables (defined outside this function; presumably the
+ * cubic B-spline basis matrix and its first derivative) are contracted
+ * with a 4x4x4 block of spline->coefs.
+ *
+ *   spline   spline to evaluate (grids, x_stride, y_stride, coefs are read)
+ *   x, y, z  evaluation point
+ *   val      receives the spline value
+ *   grad     receives d/dx, d/dy, d/dz in grad[0], grad[1], grad[2]
+ *
+ * No range check is done here: the block starting at (i0, j0, k0) is read
+ * for whatever indices the point produces.
+ */
+inline void
+eval_UBspline_3d_s_vg (UBspline_3d_s * restrict spline,
+                       double x, double y, double z,
+                       float* restrict val, float* restrict grad)
+{
+  /* Offset from the grid start, scaled by delta_inv, along each axis. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  float gx = x*spline->x_grid.delta_inv;
+  float gy = y*spline->y_grid.delta_inv;
+  float gz = z*spline->z_grid.delta_inv;
+
+  /* Integral part -> first coefficient index, fractional part -> basis
+     argument. */
+  float wholex, wholey, wholez;
+  float fx = modff (gx, &wholex);
+  float fy = modff (gy, &wholey);
+  float fz = modff (gz, &wholez);
+  int i0 = (int) wholex;
+  int j0 = (int) wholey;
+  int k0 = (int) wholez;
+
+  /* Powers of the fractional parts, highest first: {t^3, t^2, t, 1}. */
+  float px[4] = { fx*fx*fx, fx*fx, fx, 1.0f };
+  float py[4] = { fy*fy*fy, fy*fy, fy, 1.0f };
+  float pz[4] = { fz*fz*fz, fz*fz, fz, 1.0f };
+
+  /* Row r of a 16-entry table M applied to a power vector p.  UBS_B3
+     leaves out the t^3 column, so that table entry is never read. */
+#define UBS_B4(M,r,p) ((M)[4*(r)]*(p)[0]+(M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+#define UBS_B3(M,r,p) ((M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+
+  /* Basis weights and first-derivative weights per axis. */
+  float wx[4], wy[4], wz[4], dwx[4], dwy[4], dwz[4];
+  wx[0]  = UBS_B4(Af,0,px);    wy[0]  = UBS_B4(Af,0,py);    wz[0]  = UBS_B4(Af,0,pz);
+  wx[1]  = UBS_B4(Af,1,px);    wy[1]  = UBS_B4(Af,1,py);    wz[1]  = UBS_B4(Af,1,pz);
+  wx[2]  = UBS_B4(Af,2,px);    wy[2]  = UBS_B4(Af,2,py);    wz[2]  = UBS_B4(Af,2,pz);
+  wx[3]  = UBS_B4(Af,3,px);    wy[3]  = UBS_B4(Af,3,py);    wz[3]  = UBS_B4(Af,3,pz);
+  dwx[0] = UBS_B3(dAf,0,px);   dwy[0] = UBS_B3(dAf,0,py);   dwz[0] = UBS_B3(dAf,0,pz);
+  dwx[1] = UBS_B3(dAf,1,px);   dwy[1] = UBS_B3(dAf,1,py);   dwz[1] = UBS_B3(dAf,1,pz);
+  dwx[2] = UBS_B3(dAf,2,px);   dwy[2] = UBS_B3(dAf,2,py);   dwz[2] = UBS_B3(dAf,2,pz);
+  dwx[3] = UBS_B3(dAf,3,px);   dwy[3] = UBS_B3(dAf,3,py);   dwz[3] = UBS_B3(dAf,3,pz);
+
+  float* restrict cf = spline->coefs;
+  int xstr = spline->x_stride;
+  int ystr = spline->y_stride;
+
+  /* UBS_P(i,j,k): entry (i,j,k) of the 4x4x4 block; z is the unit-stride
+     index.
+     UBS_LINE(i,j,w): the four z entries at (i,j) dotted with w.
+     UBS_SLAB(i,v,w): y-direction vector v applied to the four lines of
+     x-slab i.
+     UBS_DOT(u,v,o): u[0..3] dotted with v[o..o+3]. */
+#define UBS_P(i,j,k) cf[(i0+(i))*xstr+(j0+(j))*ystr+(k0+(k))]
+#define UBS_LINE(i,j,w) (UBS_P(i,j,0)*(w)[0]+UBS_P(i,j,1)*(w)[1]+UBS_P(i,j,2)*(w)[2]+UBS_P(i,j,3)*(w)[3])
+#define UBS_SLAB(i,v,w) ((v)[0]*UBS_LINE(i,0,w)+(v)[1]*UBS_LINE(i,1,w)+(v)[2]*UBS_LINE(i,2,w)+(v)[3]*UBS_LINE(i,3,w))
+#define UBS_DOT(u,v,o) ((u)[0]*(v)[(o)+0]+(u)[1]*(v)[(o)+1]+(u)[2]*(v)[(o)+2]+(u)[3]*(v)[(o)+3])
+
+  /* zl[4*i+j]: z-line (i,j) contracted with the z weights.  Shared by the
+     value and the x and y gradient components. */
+  float zl[16];
+  zl[ 0] = UBS_LINE(0,0,wz);   zl[ 1] = UBS_LINE(0,1,wz);
+  zl[ 2] = UBS_LINE(0,2,wz);   zl[ 3] = UBS_LINE(0,3,wz);
+  zl[ 4] = UBS_LINE(1,0,wz);   zl[ 5] = UBS_LINE(1,1,wz);
+  zl[ 6] = UBS_LINE(1,2,wz);   zl[ 7] = UBS_LINE(1,3,wz);
+  zl[ 8] = UBS_LINE(2,0,wz);   zl[ 9] = UBS_LINE(2,1,wz);
+  zl[10] = UBS_LINE(2,2,wz);   zl[11] = UBS_LINE(2,3,wz);
+  zl[12] = UBS_LINE(3,0,wz);   zl[13] = UBS_LINE(3,1,wz);
+  zl[14] = UBS_LINE(3,2,wz);   zl[15] = UBS_LINE(3,3,wz);
+
+  /* yz[i] / dyz[i]: x-slab i further contracted with the y weights and
+     with the y derivative weights. */
+  float yz[4], dyz[4];
+  yz[0]  = UBS_DOT(wy, zl, 0);    dyz[0] = UBS_DOT(dwy, zl, 0);
+  yz[1]  = UBS_DOT(wy, zl, 4);    dyz[1] = UBS_DOT(dwy, zl, 4);
+  yz[2]  = UBS_DOT(wy, zl, 8);    dyz[2] = UBS_DOT(dwy, zl, 8);
+  yz[3]  = UBS_DOT(wy, zl, 12);   dyz[3] = UBS_DOT(dwy, zl, 12);
+
+  *val    = UBS_DOT(wx, yz, 0);
+  grad[0] = spline->x_grid.delta_inv * UBS_DOT(dwx, yz, 0);
+  grad[1] = spline->y_grid.delta_inv * UBS_DOT(wx, dyz, 0);
+  /* The z component is contracted directly from the coefficients with the
+     z derivative weights; no intermediate array is kept for it. */
+  grad[2] = spline->z_grid.delta_inv *
+    (wx[0]*UBS_SLAB(0,wy,dwz) + wx[1]*UBS_SLAB(1,wy,dwz) +
+     wx[2]*UBS_SLAB(2,wy,dwz) + wx[3]*UBS_SLAB(3,wy,dwz));
+
+#undef UBS_DOT
+#undef UBS_SLAB
+#undef UBS_LINE
+#undef UBS_P
+#undef UBS_B3
+#undef UBS_B4
+
+}
+
+
+
+/*
+ * Value, gradient, and Laplacian of a 3D single-precision spline.
+ *
+ * The point (x, y, z) is shifted by the grid start and scaled by delta_inv
+ * on each axis, then split with modff into an integral part (index of the
+ * first coefficient used) and a fractional part.  Basis weights built from
+ * the Af, dAf and d2Af tables (defined outside this function; presumably
+ * the cubic B-spline basis matrix and its first and second derivatives)
+ * are contracted with a 4x4x4 block of spline->coefs.
+ *
+ *   spline   spline to evaluate (grids, x_stride, y_stride, coefs are read)
+ *   x, y, z  evaluation point
+ *   val      receives the spline value
+ *   grad     receives d/dx, d/dy, d/dz in grad[0], grad[1], grad[2]
+ *   lapl     receives d2/dx2 + d2/dy2 + d2/dz2
+ *
+ * No range check is done here: the block starting at (i0, j0, k0) is read
+ * for whatever indices the point produces.
+ */
+inline void
+eval_UBspline_3d_s_vgl (UBspline_3d_s * restrict spline,
+                        double x, double y, double z,
+                        float* restrict val, float* restrict grad, float* restrict lapl)
+{
+  /* Offset from the grid start, scaled by delta_inv, along each axis. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  float gx = x*spline->x_grid.delta_inv;
+  float gy = y*spline->y_grid.delta_inv;
+  float gz = z*spline->z_grid.delta_inv;
+
+  /* Integral part -> first coefficient index, fractional part -> basis
+     argument. */
+  float wholex, wholey, wholez;
+  float fx = modff (gx, &wholex);
+  float fy = modff (gy, &wholey);
+  float fz = modff (gz, &wholez);
+  int i0 = (int) wholex;
+  int j0 = (int) wholey;
+  int k0 = (int) wholez;
+
+  /* Powers of the fractional parts, highest first: {t^3, t^2, t, 1}. */
+  float px[4] = { fx*fx*fx, fx*fx, fx, 1.0f };
+  float py[4] = { fy*fy*fy, fy*fy, fy, 1.0f };
+  float pz[4] = { fz*fz*fz, fz*fz, fz, 1.0f };
+
+  /* Row r of a 16-entry table M applied to a power vector p.  UBS_B3 leaves
+     out the t^3 column and UBS_B2 the t^3 and t^2 columns, so those table
+     entries are never read. */
+#define UBS_B4(M,r,p) ((M)[4*(r)]*(p)[0]+(M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+#define UBS_B3(M,r,p) ((M)[4*(r)+1]*(p)[1]+(M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+#define UBS_B2(M,r,p) ((M)[4*(r)+2]*(p)[2]+(M)[4*(r)+3]*(p)[3])
+
+  /* Basis weights and their first/second derivative weights per axis. */
+  float wx[4], wy[4], wz[4], dwx[4], dwy[4], dwz[4], d2wx[4], d2wy[4], d2wz[4];
+  wx[0]   = UBS_B4(Af,0,px);     wy[0]   = UBS_B4(Af,0,py);     wz[0]   = UBS_B4(Af,0,pz);
+  wx[1]   = UBS_B4(Af,1,px);     wy[1]   = UBS_B4(Af,1,py);     wz[1]   = UBS_B4(Af,1,pz);
+  wx[2]   = UBS_B4(Af,2,px);     wy[2]   = UBS_B4(Af,2,py);     wz[2]   = UBS_B4(Af,2,pz);
+  wx[3]   = UBS_B4(Af,3,px);     wy[3]   = UBS_B4(Af,3,py);     wz[3]   = UBS_B4(Af,3,pz);
+  dwx[0]  = UBS_B3(dAf,0,px);    dwy[0]  = UBS_B3(dAf,0,py);    dwz[0]  = UBS_B3(dAf,0,pz);
+  dwx[1]  = UBS_B3(dAf,1,px);    dwy[1]  = UBS_B3(dAf,1,py);    dwz[1]  = UBS_B3(dAf,1,pz);
+  dwx[2]  = UBS_B3(dAf,2,px);    dwy[2]  = UBS_B3(dAf,2,py);    dwz[2]  = UBS_B3(dAf,2,pz);
+  dwx[3]  = UBS_B3(dAf,3,px);    dwy[3]  = UBS_B3(dAf,3,py);    dwz[3]  = UBS_B3(dAf,3,pz);
+  d2wx[0] = UBS_B2(d2Af,0,px);   d2wy[0] = UBS_B2(d2Af,0,py);   d2wz[0] = UBS_B2(d2Af,0,pz);
+  d2wx[1] = UBS_B2(d2Af,1,px);   d2wy[1] = UBS_B2(d2Af,1,py);   d2wz[1] = UBS_B2(d2Af,1,pz);
+  d2wx[2] = UBS_B2(d2Af,2,px);   d2wy[2] = UBS_B2(d2Af,2,py);   d2wz[2] = UBS_B2(d2Af,2,pz);
+  d2wx[3] = UBS_B2(d2Af,3,px);   d2wy[3] = UBS_B2(d2Af,3,py);   d2wz[3] = UBS_B2(d2Af,3,pz);
+
+  float* restrict cf = spline->coefs;
+  int xstr = spline->x_stride;
+  int ystr = spline->y_stride;
+
+  /* UBS_P(i,j,k): entry (i,j,k) of the 4x4x4 block; z is the unit-stride
+     index.
+     UBS_LINE(i,j,w): the four z entries at (i,j) dotted with w.
+     UBS_SLAB(i,v,w): y-direction vector v applied to the four lines of
+     x-slab i.
+     UBS_DOT(u,v,o): u[0..3] dotted with v[o..o+3]. */
+#define UBS_P(i,j,k) cf[(i0+(i))*xstr+(j0+(j))*ystr+(k0+(k))]
+#define UBS_LINE(i,j,w) (UBS_P(i,j,0)*(w)[0]+UBS_P(i,j,1)*(w)[1]+UBS_P(i,j,2)*(w)[2]+UBS_P(i,j,3)*(w)[3])
+#define UBS_SLAB(i,v,w) ((v)[0]*UBS_LINE(i,0,w)+(v)[1]*UBS_LINE(i,1,w)+(v)[2]*UBS_LINE(i,2,w)+(v)[3]*UBS_LINE(i,3,w))
+#define UBS_DOT(u,v,o) ((u)[0]*(v)[(o)+0]+(u)[1]*(v)[(o)+1]+(u)[2]*(v)[(o)+2]+(u)[3]*(v)[(o)+3])
+
+  /* zl[4*i+j] / dzl[4*i+j]: z-line (i,j) contracted with the z weights and
+     with the z derivative weights. */
+  float zl[16], dzl[16];
+  zl[ 0] = UBS_LINE(0,0,wz);   dzl[ 0] = UBS_LINE(0,0,dwz);
+  zl[ 1] = UBS_LINE(0,1,wz);   dzl[ 1] = UBS_LINE(0,1,dwz);
+  zl[ 2] = UBS_LINE(0,2,wz);   dzl[ 2] = UBS_LINE(0,2,dwz);
+  zl[ 3] = UBS_LINE(0,3,wz);   dzl[ 3] = UBS_LINE(0,3,dwz);
+  zl[ 4] = UBS_LINE(1,0,wz);   dzl[ 4] = UBS_LINE(1,0,dwz);
+  zl[ 5] = UBS_LINE(1,1,wz);   dzl[ 5] = UBS_LINE(1,1,dwz);
+  zl[ 6] = UBS_LINE(1,2,wz);   dzl[ 6] = UBS_LINE(1,2,dwz);
+  zl[ 7] = UBS_LINE(1,3,wz);   dzl[ 7] = UBS_LINE(1,3,dwz);
+  zl[ 8] = UBS_LINE(2,0,wz);   dzl[ 8] = UBS_LINE(2,0,dwz);
+  zl[ 9] = UBS_LINE(2,1,wz);   dzl[ 9] = UBS_LINE(2,1,dwz);
+  zl[10] = UBS_LINE(2,2,wz);   dzl[10] = UBS_LINE(2,2,dwz);
+  zl[11] = UBS_LINE(2,3,wz);   dzl[11] = UBS_LINE(2,3,dwz);
+  zl[12] = UBS_LINE(3,0,wz);   dzl[12] = UBS_LINE(3,0,dwz);
+  zl[13] = UBS_LINE(3,1,wz);   dzl[13] = UBS_LINE(3,1,dwz);
+  zl[14] = UBS_LINE(3,2,wz);   dzl[14] = UBS_LINE(3,2,dwz);
+  zl[15] = UBS_LINE(3,3,wz);   dzl[15] = UBS_LINE(3,3,dwz);
+
+  /* Per x-slab contractions over y:
+       yz   = wy   . zl      dyz = dwy . zl
+       d2yz = d2wy . zl      ydz = wy  . dzl */
+  float yz[4], dyz[4], d2yz[4], ydz[4];
+  yz[0]   = UBS_DOT(wy, zl, 0);      dyz[0] = UBS_DOT(dwy, zl, 0);
+  yz[1]   = UBS_DOT(wy, zl, 4);      dyz[1] = UBS_DOT(dwy, zl, 4);
+  yz[2]   = UBS_DOT(wy, zl, 8);      dyz[2] = UBS_DOT(dwy, zl, 8);
+  yz[3]   = UBS_DOT(wy, zl, 12);     dyz[3] = UBS_DOT(dwy, zl, 12);
+  d2yz[0] = UBS_DOT(d2wy, zl, 0);    ydz[0] = UBS_DOT(wy, dzl, 0);
+  d2yz[1] = UBS_DOT(d2wy, zl, 4);    ydz[1] = UBS_DOT(wy, dzl, 4);
+  d2yz[2] = UBS_DOT(d2wy, zl, 8);    ydz[2] = UBS_DOT(wy, dzl, 8);
+  d2yz[3] = UBS_DOT(d2wy, zl, 12);   ydz[3] = UBS_DOT(wy, dzl, 12);
+
+  *val    = UBS_DOT(wx, yz, 0);
+
+  grad[0] = spline->x_grid.delta_inv * UBS_DOT(dwx, yz, 0);
+  grad[1] = spline->y_grid.delta_inv * UBS_DOT(wx, dyz, 0);
+  grad[2] = spline->z_grid.delta_inv * UBS_DOT(wx, ydz, 0);
+
+  /* x and y terms reuse the contracted arrays; the z term is contracted
+     directly from the coefficients with the z second-derivative weights. */
+  *lapl =
+    spline->x_grid.delta_inv * spline->x_grid.delta_inv * UBS_DOT(d2wx, yz, 0)
+    + spline->y_grid.delta_inv * spline->y_grid.delta_inv * UBS_DOT(wx, d2yz, 0)
+    + spline->z_grid.delta_inv * spline->z_grid.delta_inv *
+      (wx[0]*UBS_SLAB(0,wy,d2wz) + wx[1]*UBS_SLAB(1,wy,d2wz) +
+       wx[2]*UBS_SLAB(2,wy,d2wz) + wx[3]*UBS_SLAB(3,wy,d2wz));
+
+#undef UBS_DOT
+#undef UBS_SLAB
+#undef UBS_LINE
+#undef UBS_P
+#undef UBS_B2
+#undef UBS_B3
+#undef UBS_B4
+
+}
+
+
+
+
+
+/* Value, gradient, and Hessian */
+inline void
+eval_UBspline_3d_s_vgh (UBspline_3d_s * restrict spline, 
+			double x, double y, double z,
+			float* restrict val, float* restrict grad, float* restrict hess)
+{
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;  
+  z -= spline->z_grid.start;
+  float ux = x*spline->x_grid.delta_inv;
+  float uy = y*spline->y_grid.delta_inv;
+  float uz = z*spline->z_grid.delta_inv;
+  ux = fmin (ux, (double)(spline->x_grid.num)-1.0e-5);
+  uy = fmin (uy, (double)(spline->y_grid.num)-1.0e-5);
+  uz = fmin (uz, (double)(spline->z_grid.num)-1.0e-5);
+  float ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modff (ux, &ipartx);  int ix = (int) ipartx;
+  ty = modff (uy, &iparty);  int iy = (int) iparty;
+  tz = modff (uz, &ipartz);  int iz = (int) ipartz;
+
+//   if ((ix >= spline->x_grid.num))    x = spline->x_grid.num;
+//   if ((ix < 0))                      x = 0;                 
+//   if ((iy >= spline->y_grid.num))    y = spline->y_grid.num;
+//   if ((iy < 0))                      y = 0;                 
+//   if ((iz >= spline->z_grid.num))    z = spline->z_grid.num;
+//   if ((iz < 0))                      z = 0;                 
+
+  float tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4], 
+    d2a[4], d2b[4], d2c[4], cP[16], dcP[16], d2cP[16], bcP[4], dbcP[4],
+    d2bcP[4], dbdcP[4], bd2cP[4], bdcP[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  float* restrict coefs = spline->coefs;
+
+  a[0]   = (  Af[ 0]*tpx[0] +   Af[ 1]*tpx[1] +  Af[ 2]*tpx[2] + Af[ 3]*tpx[3]);
+  a[1]   = (  Af[ 4]*tpx[0] +   Af[ 5]*tpx[1] +  Af[ 6]*tpx[2] + Af[ 7]*tpx[3]);
+  a[2]   = (  Af[ 8]*tpx[0] +   Af[ 9]*tpx[1] +  Af[10]*tpx[2] + Af[11]*tpx[3]);
+  a[3]   = (  Af[12]*tpx[0] +   Af[13]*tpx[1] +  Af[14]*tpx[2] + Af[15]*tpx[3]);
+  da[0]  = ( dAf[ 1]*tpx[1] +  dAf[ 2]*tpx[2] + dAf[ 3]*tpx[3]);
+  da[1]  = ( dAf[ 5]*tpx[1] +  dAf[ 6]*tpx[2] + dAf[ 7]*tpx[3]);
+  da[2]  = ( dAf[ 9]*tpx[1] +  dAf[10]*tpx[2] + dAf[11]*tpx[3]);
+  da[3]  = ( dAf[13]*tpx[1] +  dAf[14]*tpx[2] + dAf[15]*tpx[3]);
+  d2a[0] = (d2Af[ 2]*tpx[2] + d2Af[ 3]*tpx[3]);
+  d2a[1] = (d2Af[ 6]*tpx[2] + d2Af[ 7]*tpx[3]);
+  d2a[2] = (d2Af[10]*tpx[2] + d2Af[11]*tpx[3]);
+  d2a[3] = (d2Af[14]*tpx[2] + d2Af[15]*tpx[3]);
+
+  b[0]  = ( Af[ 0]*tpy[0] + Af[ 1]*tpy[1] + Af[ 2]*tpy[2] + Af[ 3]*tpy[3]);
+  b[1]  = ( Af[ 4]*tpy[0] + Af[ 5]*tpy[1] + Af[ 6]*tpy[2] + Af[ 7]*tpy[3]);
+  b[2]  = ( Af[ 8]*tpy[0] + Af[ 9]*tpy[1] + Af[10]*tpy[2] + Af[11]*tpy[3]);
+  b[3]  = ( Af[12]*tpy[0] + Af[13]*tpy[1] + Af[14]*tpy[2] + Af[15]*tpy[3]);
+  db[0] = (dAf[ 1]*tpy[1] + dAf[ 2]*tpy[2] + dAf[ 3]*tpy[3]);
+  db[1] = (dAf[ 5]*tpy[1] + dAf[ 6]*tpy[2] + dAf[ 7]*tpy[3]);
+  db[2] = (dAf[ 9]*tpy[1] + dAf[10]*tpy[2] + dAf[11]*tpy[3]);
+  db[3] = (dAf[13]*tpy[1] + dAf[14]*tpy[2] + dAf[15]*tpy[3]);
+  d2b[0] = (d2Af[ 2]*tpy[2] + d2Af[ 3]*tpy[3]);
+  d2b[1] = (d2Af[ 6]*tpy[2] + d2Af[ 7]*tpy[3]);
+  d2b[2] = (d2Af[10]*tpy[2] + d2Af[11]*tpy[3]);
+  d2b[3] = (d2Af[14]*tpy[2] + d2Af[15]*tpy[3]);
+
+  c[0]  = ( Af[ 0]*tpz[0] + Af[ 1]*tpz[1] + Af[ 2]*tpz[2] + Af[ 3]*tpz[3]);
+  c[1]  = ( Af[ 4]*tpz[0] + Af[ 5]*tpz[1] + Af[ 6]*tpz[2] + Af[ 7]*tpz[3]);
+  c[2]  = ( Af[ 8]*tpz[0] + Af[ 9]*tpz[1] + Af[10]*tpz[2] + Af[11]*tpz[3]);
+  c[3]  = ( Af[12]*tpz[0] + Af[13]*tpz[1] + Af[14]*tpz[2] + Af[15]*tpz[3]);
+  dc[0] = (dAf[ 1]*tpz[1] + dAf[ 2]*tpz[2] + dAf[ 3]*tpz[3]);
+  dc[1] = (dAf[ 5]*tpz[1] + dAf[ 6]*tpz[2] + dAf[ 7]*tpz[3]);
+  dc[2] = (dAf[ 9]*tpz[1] + dAf[10]*tpz[2] + dAf[11]*tpz[3]);
+  dc[3] = (dAf[13]*tpz[1] + dAf[14]*tpz[2] + dAf[15]*tpz[3]);
+  d2c[0] = (d2Af[ 2]*tpz[2] + d2Af[ 3]*tpz[3]);
+  d2c[1] = (d2Af[ 6]*tpz[2] + d2Af[ 7]*tpz[3]);
+  d2c[2] = (d2Af[10]*tpz[2] + d2Af[11]*tpz[3]);
+  d2c[3] = (d2Af[14]*tpz[2] + d2Af[15]*tpz[3]);
+  
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+  int offmax = (ix+3)*xs + (iy+3)*ys + iz+3;
+//   if (offmax > spline->coef_size) {
+//      fprintf (stderr, "Outside bounds in spline evalutation.\n"
+// 	      "offmax = %d  csize = %d\n", offmax, spline->csize);
+//      fprintf (stderr, "ix=%d   iy=%d   iz=%d\n", ix,iy,iz);
+//   }
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+
+  dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
+  dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
+  dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
+  dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
+  dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
+  dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
+  dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
+  dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
+  dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
+  dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
+  dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
+  dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
+  dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
+  dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
+  dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
+  dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
+
+  d2cP[ 0] = (P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3]);
+  d2cP[ 1] = (P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3]);
+  d2cP[ 2] = (P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3]);
+  d2cP[ 3] = (P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]);
+  d2cP[ 4] = (P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3]);
+  d2cP[ 5] = (P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3]);
+  d2cP[ 6] = (P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3]);
+  d2cP[ 7] = (P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]);
+  d2cP[ 8] = (P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3]);
+  d2cP[ 9] = (P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3]);
+  d2cP[10] = (P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3]);
+  d2cP[11] = (P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]);
+  d2cP[12] = (P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3]);
+  d2cP[13] = (P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3]);
+  d2cP[14] = (P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3]);
+  d2cP[15] = (P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3]);
+
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+
+  bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
+  bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
+  bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
+  bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
+
+  bd2cP[0] = ( b[0]*d2cP[ 0] + b[1]*d2cP[ 1] + b[2]*d2cP[ 2] + b[3]*d2cP[ 3]);
+  bd2cP[1] = ( b[0]*d2cP[ 4] + b[1]*d2cP[ 5] + b[2]*d2cP[ 6] + b[3]*d2cP[ 7]);
+  bd2cP[2] = ( b[0]*d2cP[ 8] + b[1]*d2cP[ 9] + b[2]*d2cP[10] + b[3]*d2cP[11]);
+  bd2cP[3] = ( b[0]*d2cP[12] + b[1]*d2cP[13] + b[2]*d2cP[14] + b[3]*d2cP[15]);
+
+  d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
+  d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
+  d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
+  d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
+  
+  dbdcP[0] = ( db[0]*dcP[ 0] + db[1]*dcP[ 1] + db[2]*dcP[ 2] + db[3]*dcP[ 3]);
+  dbdcP[1] = ( db[0]*dcP[ 4] + db[1]*dcP[ 5] + db[2]*dcP[ 6] + db[3]*dcP[ 7]);
+  dbdcP[2] = ( db[0]*dcP[ 8] + db[1]*dcP[ 9] + db[2]*dcP[10] + db[3]*dcP[11]);
+  dbdcP[3] = ( db[0]*dcP[12] + db[1]*dcP[13] + db[2]*dcP[14] + db[3]*dcP[15]);
+
+  *val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = spline->y_grid.delta_inv *
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = spline->z_grid.delta_inv *
+    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
+  // d2x
+  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
+  // dx dy
+  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
+    (da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
+  hess[3] = hess[1];
+  // dx dz;
+  hess[2] = spline->x_grid.delta_inv * spline->z_grid.delta_inv *
+    (da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
+  hess[6] = hess[2];
+  // d2y
+  hess[4] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
+  // dy dz
+  hess[5] = spline->y_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
+  hess[7] = hess[5];
+  // d2z
+  hess[8] = spline->z_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
+#undef P
+
+}
+
+#endif
--- a/src/einspline/bspline_eval_std_z.h
+++ b/src/einspline/bspline_eval_std_z.h
@ -0,0 +1,939 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_EVAL_STD_Z_H
+#define BSPLINE_EVAL_STD_Z_H
+
+#include <math.h>
+#include <stdio.h>
+
+extern const double* restrict   Ad;
+extern const double* restrict  dAd;
+extern const double* restrict d2Ad;
+
+/************************************************************/
+/* 1D double-precision, complex evaluation functions        */
+/************************************************************/
+
+/* Value only.
+ * Evaluates the 1D complex double-precision uniform B-spline at x and
+ * stores the result through val. */
+inline void
+eval_UBspline_1d_z (UBspline_1d_z * restrict spline,
+		    double x, complex_double* restrict val)
+{
+  /* Position relative to the grid start, in units of grid cells. */
+  double cell_pos = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
+
+  /* modf splits it into the cell index and the offset inside the cell. */
+  double cell_index, frac;
+  frac = modf (cell_pos, &cell_index);
+  int first = (int) cell_index;
+
+  /* Powers of the in-cell offset, highest power first. */
+  double t3 = frac*frac*frac, t2 = frac*frac, t1 = frac, t0 = 1.0;
+
+  /* One weight per coefficient: four consecutive Ad entries dotted with
+     the powers above. */
+  double w0 = Ad[ 0]*t3 + Ad[ 1]*t2 + Ad[ 2]*t1 + Ad[ 3]*t0;
+  double w1 = Ad[ 4]*t3 + Ad[ 5]*t2 + Ad[ 6]*t1 + Ad[ 7]*t0;
+  double w2 = Ad[ 8]*t3 + Ad[ 9]*t2 + Ad[10]*t1 + Ad[11]*t0;
+  double w3 = Ad[12]*t3 + Ad[13]*t2 + Ad[14]*t1 + Ad[15]*t0;
+
+  /* Weighted sum of the four coefficients starting at index first. */
+  complex_double* restrict coefs = spline->coefs;
+  *val = (coefs[first+0]*w0 +
+	  coefs[first+1]*w1 +
+	  coefs[first+2]*w2 +
+	  coefs[first+3]*w3);
+}
+
+/* Value and first derivative.
+ * Evaluates the 1D complex double-precision uniform B-spline at x, storing
+ * the value through val and the derivative with respect to x through grad. */
+inline void
+eval_UBspline_1d_z_vg (UBspline_1d_z * restrict spline, double x,
+		       complex_double* restrict val,
+		       complex_double* restrict grad)
+{
+  /* Position relative to the grid start, in units of grid cells. */
+  double cell_pos = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
+
+  /* modf splits it into the cell index and the offset inside the cell. */
+  double cell_index, frac;
+  frac = modf (cell_pos, &cell_index);
+  int first = (int) cell_index;
+
+  /* Powers of the in-cell offset, highest power first. */
+  double t3 = frac*frac*frac, t2 = frac*frac, t1 = frac, t0 = 1.0;
+
+  /* Value weights: four consecutive Ad entries dotted with the powers. */
+  double w0 = Ad[ 0]*t3 + Ad[ 1]*t2 + Ad[ 2]*t1 + Ad[ 3]*t0;
+  double w1 = Ad[ 4]*t3 + Ad[ 5]*t2 + Ad[ 6]*t1 + Ad[ 7]*t0;
+  double w2 = Ad[ 8]*t3 + Ad[ 9]*t2 + Ad[10]*t1 + Ad[11]*t0;
+  double w3 = Ad[12]*t3 + Ad[13]*t2 + Ad[14]*t1 + Ad[15]*t0;
+
+  /* Derivative weights: the first dAd entry of each group is not used. */
+  double dw0 = dAd[ 1]*t2 + dAd[ 2]*t1 + dAd[ 3]*t0;
+  double dw1 = dAd[ 5]*t2 + dAd[ 6]*t1 + dAd[ 7]*t0;
+  double dw2 = dAd[ 9]*t2 + dAd[10]*t1 + dAd[11]*t0;
+  double dw3 = dAd[13]*t2 + dAd[14]*t1 + dAd[15]*t0;
+
+  complex_double* restrict coefs = spline->coefs;
+  *val = (coefs[first+0]*w0 +
+	  coefs[first+1]*w1 +
+	  coefs[first+2]*w2 +
+	  coefs[first+3]*w3);
+  /* Scale by delta_inv to convert from per-cell to per-x units. */
+  *grad = spline->x_grid.delta_inv *
+    (coefs[first+0]*dw0 +
+     coefs[first+1]*dw1 +
+     coefs[first+2]*dw2 +
+     coefs[first+3]*dw3);
+}
+/* Value, first derivative, and second derivative.
+ * Evaluates the 1D complex double-precision uniform B-spline at x, storing
+ * the value through val, the first derivative through grad and the second
+ * derivative through lapl. */
+inline void
+eval_UBspline_1d_z_vgl (UBspline_1d_z * restrict spline, double x,
+			complex_double* restrict val, complex_double* restrict grad,
+			complex_double* restrict lapl)
+{
+  /* Position relative to the grid start, in units of grid cells. */
+  double cell_pos = (x - spline->x_grid.start)*spline->x_grid.delta_inv;
+
+  /* modf splits it into the cell index and the offset inside the cell. */
+  double cell_index, frac;
+  frac = modf (cell_pos, &cell_index);
+  int first = (int) cell_index;
+
+  /* Powers of the in-cell offset, highest power first. */
+  double t3 = frac*frac*frac, t2 = frac*frac, t1 = frac, t0 = 1.0;
+
+  /* Value weights: four consecutive Ad entries dotted with the powers. */
+  double w0 = Ad[ 0]*t3 + Ad[ 1]*t2 + Ad[ 2]*t1 + Ad[ 3]*t0;
+  double w1 = Ad[ 4]*t3 + Ad[ 5]*t2 + Ad[ 6]*t1 + Ad[ 7]*t0;
+  double w2 = Ad[ 8]*t3 + Ad[ 9]*t2 + Ad[10]*t1 + Ad[11]*t0;
+  double w3 = Ad[12]*t3 + Ad[13]*t2 + Ad[14]*t1 + Ad[15]*t0;
+
+  /* First-derivative weights: the first dAd entry of each group is unused. */
+  double dw0 = dAd[ 1]*t2 + dAd[ 2]*t1 + dAd[ 3]*t0;
+  double dw1 = dAd[ 5]*t2 + dAd[ 6]*t1 + dAd[ 7]*t0;
+  double dw2 = dAd[ 9]*t2 + dAd[10]*t1 + dAd[11]*t0;
+  double dw3 = dAd[13]*t2 + dAd[14]*t1 + dAd[15]*t0;
+
+  /* Second-derivative weights: only the last two d2Ad entries of each
+     group are used. */
+  double d2w0 = d2Ad[ 2]*t1 + d2Ad[ 3]*t0;
+  double d2w1 = d2Ad[ 6]*t1 + d2Ad[ 7]*t0;
+  double d2w2 = d2Ad[10]*t1 + d2Ad[11]*t0;
+  double d2w3 = d2Ad[14]*t1 + d2Ad[15]*t0;
+
+  complex_double* restrict coefs = spline->coefs;
+  *val = (coefs[first+0]*w0 +
+	  coefs[first+1]*w1 +
+	  coefs[first+2]*w2 +
+	  coefs[first+3]*w3);
+  /* One factor of delta_inv per derivative order. */
+  *grad = spline->x_grid.delta_inv *
+    (coefs[first+0]*dw0 +
+     coefs[first+1]*dw1 +
+     coefs[first+2]*dw2 +
+     coefs[first+3]*dw3);
+  *lapl = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (coefs[first+0]*d2w0 +
+     coefs[first+1]*d2w1 +
+     coefs[first+2]*d2w2 +
+     coefs[first+3]*d2w3);
+}
+
+/* Value, first derivative, and Hessian.
+ * In one dimension the Hessian has a single entry, the second derivative,
+ * so this forwards to eval_UBspline_1d_z_vgl and stores that second
+ * derivative through hess.  (Calling eval_UBspline_1d_z_vgh here instead
+ * would recurse without end.) */
+inline void
+eval_UBspline_1d_z_vgh (UBspline_1d_z * restrict spline, double x, 
+			complex_double* restrict val, 
+			complex_double* restrict grad,
+			complex_double* restrict hess)
+{
+  eval_UBspline_1d_z_vgl (spline, x, val, grad, hess);
+}
+/************************************************************/
+/* 2D double-precision, complex evaluation functions        */
+/************************************************************/
+
+/* Value only.
+ * Evaluates the 2D complex double-precision uniform B-spline at (x, y) and
+ * stores the result through val.
+ * NOTE(review): no range check on ix/iy is visible here -- presumably the
+ * caller must pass a point inside the grid; confirm. */
+inline void
+eval_UBspline_2d_z (UBspline_2d_z * restrict spline, 
+		    double x, double y, complex_double* restrict val)
+{
+  /* Shift to grid-relative coordinates, then scale to units of grid cells. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  /* modf splits each coordinate into the cell index (ipart*) and the
+     offset inside the cell (tx, ty). */
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;
+  int iy = (int) iparty;
+  
+  /* tpx/tpy hold the powers of the offsets, highest power first. */
+  double tpx[4], tpy[4], a[4], b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* a[i]: weight in x of the i-th coefficient row (Ad entries dotted with
+     the powers of tx). */
+  a[0] = (Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1] = (Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2] = (Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3] = (Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+
+  /* b[j]: the same weights built from the powers of ty. */
+  b[0] = (Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1] = (Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2] = (Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3] = (Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  
+  /* C(i,j) addresses the coefficient at offset (i,j) from (ix,iy): the
+     first index advances by x_stride, the second by one element. */
+  int xs = spline->x_stride;
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+  /* val = sum over i,j of a[i] * C(i,j) * b[j]. */
+  *val = (a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+	  a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+	  a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+	  a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+#undef C
+
+}
+
+
+/* Value and gradient.
+ * Evaluates the 2D complex double-precision uniform B-spline at (x, y).
+ * The value is stored through val; grad[0] receives the derivative with
+ * respect to x and grad[1] the derivative with respect to y.
+ * NOTE(review): no range check on ix/iy is visible here -- presumably the
+ * caller must pass a point inside the grid; confirm. */
+inline void
+eval_UBspline_2d_z_vg (UBspline_2d_z * restrict spline, 
+		       double x, double y, 
+		       complex_double* restrict val, 
+		       complex_double* restrict grad)
+{
+  /* Shift to grid-relative coordinates, then scale to units of grid cells. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  /* modf splits each coordinate into the cell index (ipart*) and the
+     offset inside the cell (tx, ty). */
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;
+  int iy = (int) iparty;
+  
+  /* tpx/tpy hold the powers of the offsets, highest power first. */
+  double tpx[4], tpy[4], a[4], b[4], da[4], db[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* a[i] / da[i]: value and first-derivative weights in x.  The first dAd
+     entry of each group of four is not used. */
+  a[0]  = (Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1]  = (Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]  = (Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]  = (Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0] = (dAd[ 1]*tpx[1] + dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
+  da[1] = (dAd[ 5]*tpx[1] + dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2] = (dAd[ 9]*tpx[1] + dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3] = (dAd[13]*tpx[1] + dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+
+  /* b[j] / db[j]: the same weights built from the powers of ty. */
+  b[0]  = (Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1]  = (Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = (Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = (Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  
+  /* C(i,j) addresses the coefficient at offset (i,j) from (ix,iy): the
+     first index advances by x_stride, the second by one element. */
+  int xs = spline->x_stride;
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+  /* val = sum over i,j of a[i] * C(i,j) * b[j]. */
+  *val =    
+    (a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  /* d/dx: derivative weights in x, value weights in y. */
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     da[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     da[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     da[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  /* d/dy: value weights in x, derivative weights in y. */
+  grad[1] = spline->y_grid.delta_inv * 
+    (a[0]*(C(0,0)*db[0]+C(0,1)*db[1]+C(0,2)*db[2]+C(0,3)*db[3])+
+     a[1]*(C(1,0)*db[0]+C(1,1)*db[1]+C(1,2)*db[2]+C(1,3)*db[3])+
+     a[2]*(C(2,0)*db[0]+C(2,1)*db[1]+C(2,2)*db[2]+C(2,3)*db[3])+
+     a[3]*(C(3,0)*db[0]+C(3,1)*db[1]+C(3,2)*db[2]+C(3,3)*db[3]));
+#undef C
+
+}
+
+/* Value, gradient, and laplacian.
+ * Evaluates the 2D complex double-precision uniform B-spline at (x, y).
+ * The value is stored through val; grad[0] and grad[1] receive the
+ * derivatives with respect to x and y; lapl receives the sum of the second
+ * derivatives in x and in y.
+ * NOTE(review): no range check on ix/iy is visible here -- presumably the
+ * caller must pass a point inside the grid; confirm. */
+inline void
+eval_UBspline_2d_z_vgl (UBspline_2d_z * restrict spline, 
+			double x, double y, complex_double* restrict val, 
+			complex_double* restrict grad, 
+			complex_double* restrict lapl)
+{
+  /* Shift to grid-relative coordinates, then scale to units of grid cells. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  /* modf splits each coordinate into the cell index (ipart*) and the
+     offset inside the cell (tx, ty). */
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;
+  int iy = (int) iparty;
+  
+  /* tpx/tpy hold the powers of the offsets, highest power first. */
+  double tpx[4], tpy[4], a[4], b[4], da[4], db[4], d2a[4], d2b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* a / da / d2a: value, first- and second-derivative weights in x.  The
+     derivative tables skip their leading entries (one for dAd, two for
+     d2Ad) in each group of four. */
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+  d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);
+  d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
+  d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
+  d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
+
+  /* b / db / d2b: the same weights built from the powers of ty. */
+  b[0]  = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1]  = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
+  d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
+  d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
+  d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
+  
+  /* C(i,j) addresses the coefficient at offset (i,j) from (ix,iy): the
+     first index advances by x_stride, the second by one element. */
+  int xs = spline->x_stride;
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+  /* val = sum over i,j of a[i] * C(i,j) * b[j]. */
+  *val =    
+    (a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  /* d/dx: derivative weights in x, value weights in y. */
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+     da[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+     da[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+     da[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  /* d/dy: value weights in x, derivative weights in y. */
+  grad[1] = spline->y_grid.delta_inv *
+    (a[0]*(C(0,0)*db[0]+C(0,1)*db[1]+C(0,2)*db[2]+C(0,3)*db[3])+
+     a[1]*(C(1,0)*db[0]+C(1,1)*db[1]+C(1,2)*db[2]+C(1,3)*db[3])+
+     a[2]*(C(2,0)*db[0]+C(2,1)*db[1]+C(2,2)*db[2]+C(2,3)*db[3])+
+     a[3]*(C(3,0)*db[0]+C(3,1)*db[1]+C(3,2)*db[2]+C(3,3)*db[3]));
+  /* Laplacian: the d2/dy2 term (scaled by y delta_inv squared) plus the
+     d2/dx2 term (scaled by x delta_inv squared). */
+  *lapl   = 
+    spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (a[0]*(C(0,0)*d2b[0]+C(0,1)*d2b[1]+C(0,2)*d2b[2]+C(0,3)*d2b[3])+
+     a[1]*(C(1,0)*d2b[0]+C(1,1)*d2b[1]+C(1,2)*d2b[2]+C(1,3)*d2b[3])+
+     a[2]*(C(2,0)*d2b[0]+C(2,1)*d2b[1]+C(2,2)*d2b[2]+C(2,3)*d2b[3])+
+     a[3]*(C(3,0)*d2b[0]+C(3,1)*d2b[1]+C(3,2)*d2b[2]+C(3,3)*d2b[3])) + 
+    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+     (d2a[0]*(C(0,0)*b[0]+C(0,1)*b[1]+C(0,2)*b[2]+C(0,3)*b[3])+
+      d2a[1]*(C(1,0)*b[0]+C(1,1)*b[1]+C(1,2)*b[2]+C(1,3)*b[3])+
+      d2a[2]*(C(2,0)*b[0]+C(2,1)*b[1]+C(2,2)*b[2]+C(2,3)*b[3])+
+      d2a[3]*(C(3,0)*b[0]+C(3,1)*b[1]+C(3,2)*b[2]+C(3,3)*b[3]));
+  
+#undef C
+
+}
+
+/* Value, gradient, and Hessian.
+ * Evaluates the 2D complex double-precision uniform B-spline at (x, y).
+ * The value is stored through val; grad[0] and grad[1] receive the
+ * derivatives with respect to x and y.  hess receives four entries:
+ * hess[0] = d2/dx2, hess[1] = hess[2] = d2/dxdy, hess[3] = d2/dy2.
+ * NOTE(review): no range check on ix/iy is visible here -- presumably the
+ * caller must pass a point inside the grid; confirm. */
+inline void
+eval_UBspline_2d_z_vgh (UBspline_2d_z * restrict spline, 
+			double x, double y, complex_double* restrict val, 
+			complex_double* restrict grad, 
+			complex_double* restrict hess)
+{
+  /* Shift to grid-relative coordinates, then scale to units of grid cells. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  /* modf splits each coordinate into the cell index (ipart*) and the
+     offset inside the cell (tx, ty). */
+  double ipartx, iparty, tx, ty;
+  tx = modf (ux, &ipartx);
+  ty = modf (uy, &iparty);
+  int ix = (int) ipartx;
+  int iy = (int) iparty;
+  
+  /* tpx/tpy hold the powers of the offsets, highest power first. */
+  double tpx[4], tpy[4], a[4], b[4], da[4], db[4], d2a[4], d2b[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* a / da / d2a: value, first- and second-derivative weights in x.  The
+     derivative tables skip their leading entries (one for dAd, two for
+     d2Ad) in each group of four. */
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+  d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);
+  d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
+  d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
+  d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
+
+  /* b / db / d2b: the same weights built from the powers of ty. */
+  b[0]   = (  Ad[ 0]*tpy[0] +   Ad[ 1]*tpy[1] +  Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1]   = (  Ad[ 4]*tpy[0] +   Ad[ 5]*tpy[1] +  Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]   = (  Ad[ 8]*tpy[0] +   Ad[ 9]*tpy[1] +  Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]   = (  Ad[12]*tpy[0] +   Ad[13]*tpy[1] +  Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0]  = ( dAd[ 1]*tpy[1] +  dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1]  = ( dAd[ 5]*tpy[1] +  dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2]  = ( dAd[ 9]*tpy[1] +  dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3]  = ( dAd[13]*tpy[1] +  dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
+  d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
+  d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
+  d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
+  
+  /* C(i,j) addresses the coefficient at offset (i,j) from (ix,iy): the
+     first index advances by x_stride, the second by one element. */
+  int xs = spline->x_stride;
+#define C(i,j) coefs[(ix+(i))*xs+iy+(j)]
+  /* val = sum over i,j of a[i] * C(i,j) * b[j]. */
+  *val =    
+    (  a[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+       a[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+       a[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+       a[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  /* d/dx */
+  grad[0] = spline->x_grid.delta_inv *
+    ( da[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+      da[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+      da[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+      da[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  /* d/dy */
+  grad[1] = spline->y_grid.delta_inv *
+    (  a[0]*(C(0,0)* db[0]+C(0,1)* db[1]+C(0,2)* db[2]+C(0,3)* db[3])+
+       a[1]*(C(1,0)* db[0]+C(1,1)* db[1]+C(1,2)* db[2]+C(1,3)* db[3])+
+       a[2]*(C(2,0)* db[0]+C(2,1)* db[1]+C(2,2)* db[2]+C(2,3)* db[3])+
+       a[3]*(C(3,0)* db[0]+C(3,1)* db[1]+C(3,2)* db[2]+C(3,3)* db[3]));
+  /* d2/dx2 */
+  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (d2a[0]*(C(0,0)*  b[0]+C(0,1)*  b[1]+C(0,2)*  b[2]+C(0,3)*  b[3])+
+     d2a[1]*(C(1,0)*  b[0]+C(1,1)*  b[1]+C(1,2)*  b[2]+C(1,3)*  b[3])+
+     d2a[2]*(C(2,0)*  b[0]+C(2,1)*  b[1]+C(2,2)*  b[2]+C(2,3)*  b[3])+
+     d2a[3]*(C(3,0)*  b[0]+C(3,1)*  b[1]+C(3,2)*  b[2]+C(3,3)*  b[3]));
+  /* d2/dxdy */
+  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
+    ( da[0]*(C(0,0)* db[0]+C(0,1)* db[1]+C(0,2)* db[2]+C(0,3)* db[3])+
+      da[1]*(C(1,0)* db[0]+C(1,1)* db[1]+C(1,2)* db[2]+C(1,3)* db[3])+
+      da[2]*(C(2,0)* db[0]+C(2,1)* db[1]+C(2,2)* db[2]+C(2,3)* db[3])+
+      da[3]*(C(3,0)* db[0]+C(3,1)* db[1]+C(3,2)* db[2]+C(3,3)* db[3]));
+  /* d2/dy2 */
+  hess[3] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (  a[0]*(C(0,0)*d2b[0]+C(0,1)*d2b[1]+C(0,2)*d2b[2]+C(0,3)*d2b[3])+
+       a[1]*(C(1,0)*d2b[0]+C(1,1)*d2b[1]+C(1,2)*d2b[2]+C(1,3)*d2b[3])+
+       a[2]*(C(2,0)*d2b[0]+C(2,1)*d2b[1]+C(2,2)*d2b[2]+C(2,3)*d2b[3])+
+       a[3]*(C(3,0)*d2b[0]+C(3,1)*d2b[1]+C(3,2)*d2b[2]+C(3,3)*d2b[3]));
+  /* The mixed derivative is stored in both off-diagonal slots. */
+  hess[2] = hess[1];
+  
+#undef C
+
+}
+
+
+/************************************************************/
+/* 3D double-precision, complex evaluation functions        */
+/************************************************************/
+
+/* Value only.
+ * Evaluates the 3D complex double-precision uniform B-spline at (x, y, z)
+ * and stores the result through val.
+ * NOTE(review): no range check on ix/iy/iz is visible here -- presumably
+ * the caller must pass a point inside the grid; confirm. */
+inline void
+eval_UBspline_3d_z (UBspline_3d_z * restrict spline, 
+		    double x, double y, double z,
+		    complex_double* restrict val)
+{
+  /* Shift to grid-relative coordinates, then scale to units of grid cells. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  double uz = z*spline->z_grid.delta_inv;
+  /* modf splits each coordinate into the cell index (ipart*) and the
+     offset inside the cell (tx, ty, tz). */
+  double ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modf (ux, &ipartx);  int ix = (int) ipartx;
+  ty = modf (uy, &iparty);  int iy = (int) iparty;
+  tz = modf (uz, &ipartz);  int iz = (int) ipartz;
+  
+  /* tpx/tpy/tpz hold the powers of the offsets, highest power first. */
+  double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* a[i]: weights in x (Ad entries dotted with the powers of tx). */
+  a[0] = (Ad[ 0]*tpx[0] + Ad[ 1]*tpx[1] + Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1] = (Ad[ 4]*tpx[0] + Ad[ 5]*tpx[1] + Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2] = (Ad[ 8]*tpx[0] + Ad[ 9]*tpx[1] + Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3] = (Ad[12]*tpx[0] + Ad[13]*tpx[1] + Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+
+  /* b[j]: weights in y. */
+  b[0] = (Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1] = (Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2] = (Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3] = (Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+
+  /* c[k]: weights in z. */
+  c[0] = (Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);
+  c[1] = (Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
+  c[2] = (Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
+  c[3] = (Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
+  
+  /* P(i,j,k) addresses the coefficient at offset (i,j,k) from (ix,iy,iz):
+     the first index advances by x_stride, the second by y_stride, the
+     third by one element. */
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  /* val = sum over i,j,k of a[i] * b[j] * P(i,j,k) * c[k]. */
+  *val = (a[0]*(b[0]*(P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3])+
+		b[1]*(P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3])+
+		b[2]*(P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3])+
+		b[3]*(P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]))+
+	  a[1]*(b[0]*(P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3])+
+		b[1]*(P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3])+
+		b[2]*(P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3])+
+		b[3]*(P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]))+
+	  a[2]*(b[0]*(P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3])+
+		b[1]*(P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3])+
+		b[2]*(P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3])+
+		b[3]*(P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]))+
+	  a[3]*(b[0]*(P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3])+
+		b[1]*(P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3])+
+		b[2]*(P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3])+
+		b[3]*(P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3])));
+#undef P
+
+}
+
+/* Value and gradient.
+ * Evaluates the 3D complex double-precision uniform B-spline at (x, y, z).
+ * The value is stored through val; grad[0], grad[1] and grad[2] receive the
+ * derivatives with respect to x, y and z.
+ * NOTE(review): no range check on ix/iy/iz is visible here -- presumably
+ * the caller must pass a point inside the grid; confirm. */
+inline void
+eval_UBspline_3d_z_vg (UBspline_3d_z * restrict spline, 
+		       double x, double y, double z,
+		       complex_double* restrict val, 
+		       complex_double* restrict grad)
+{
+  /* Shift to grid-relative coordinates, then scale to units of grid cells. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  double uz = z*spline->z_grid.delta_inv;
+  /* modf splits each coordinate into the cell index (ipart*) and the
+     offset inside the cell (tx, ty, tz). */
+  double ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modf (ux, &ipartx);  int ix = (int) ipartx;  
+  ty = modf (uy, &iparty);  int iy = (int) iparty; 
+  tz = modf (uz, &ipartz);  int iz = (int) ipartz; 
+  
+  /* tpx/tpy/tpz hold the powers of the offsets, highest power first.
+     cP, bcP and dbcP are partial contractions filled in below. */
+  double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4];
+  complex_double cP[16], bcP[4], dbcP[4];
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* a / da: value and first-derivative weights in x.  The first dAd entry
+     of each group of four is not used. */
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+
+  /* b / db: the same weights built from the powers of ty. */
+  b[0]  = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1]  = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+
+  /* c / dc: the same weights built from the powers of tz. */
+  c[0]  = ( Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);
+  c[1]  = ( Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
+  c[2]  = ( Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
+  c[3]  = ( Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
+  dc[0] = (dAd[ 1]*tpz[1] + dAd[ 2]*tpz[2] + dAd[ 3]*tpz[3]);
+  dc[1] = (dAd[ 5]*tpz[1] + dAd[ 6]*tpz[2] + dAd[ 7]*tpz[3]);
+  dc[2] = (dAd[ 9]*tpz[1] + dAd[10]*tpz[2] + dAd[11]*tpz[3]);
+  dc[3] = (dAd[13]*tpz[1] + dAd[14]*tpz[2] + dAd[15]*tpz[3]);
+  
+  /* P(i,j,k) addresses the coefficient at offset (i,j,k) from (ix,iy,iz):
+     the first index advances by x_stride, the second by y_stride, the
+     third by one element. */
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  /* Contract over z first: cP[4*i+j] = sum over k of P(i,j,k) * c[k]. */
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+
+  /* Then contract over y: bcP[i] = sum over j of b[j] * cP[4*i+j]. */
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+
+  /* Same contraction with the y-derivative weights. */
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+
+  /* Final contraction over x for the value and the x and y derivatives. */
+  *val    = ( a[0]*bcP[0] +  a[1]*bcP[1] +  a[2]*bcP[2] +  a[3]*bcP[3]);
+  grad[0] = spline->x_grid.delta_inv * 
+    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = spline->y_grid.delta_inv * 
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  /* The z derivative uses dc in place of c, so it cannot reuse cP; it is
+     written out as the full triple sum. */
+  grad[2] = spline->z_grid.delta_inv * 
+    (a[0]*(b[0]*(P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3])+
+	   b[1]*(P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3])+
+	   b[2]*(P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3])+
+	   b[3]*(P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]))+
+     a[1]*(b[0]*(P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3])+
+	   b[1]*(P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3])+
+	   b[2]*(P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3])+
+	   b[3]*(P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]))+
+     a[2]*(b[0]*(P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3])+
+	   b[1]*(P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3])+
+	   b[2]*(P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3])+
+	   b[3]*(P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]))+
+     a[3]*(b[0]*(P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3])+
+	   b[1]*(P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3])+
+	   b[2]*(P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3])+
+	   b[3]*(P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3])));
+#undef P
+
+}
+
+
+
+/* Value, gradient, and laplacian */
+/*
+ * Evaluate a 3D spline with complex double-precision coefficients at the
+ * point (x, y, z).
+ *
+ * Outputs:
+ *   *val        spline value
+ *   grad[0..2]  first derivatives along x, y and z
+ *   *lapl       sum of the unmixed second derivatives along x, y and z
+ *
+ * Each coordinate is shifted by the grid start and scaled by delta_inv;
+ * modf() then splits it into an integer cell index (ix, iy, iz) and a
+ * fractional offset (tx, ty, tz).  The 4x4x4 block of coefficients starting
+ * at that cell is contracted with per-axis weight vectors built from the
+ * tables Ad, dAd and d2Ad (defined outside this function; presumably the
+ * cubic basis matrix and its first/second derivative forms).
+ *
+ * NOTE(review): unlike eval_UBspline_3d_z_vgh, the scaled coordinates are
+ * not clamped here, so the caller presumably has to keep the point inside
+ * the grid -- confirm against the callers.
+ */
+inline void
+eval_UBspline_3d_z_vgl (UBspline_3d_z * restrict spline,
+                        double x, double y, double z,
+                        complex_double* restrict val,
+                        complex_double* restrict grad,
+                        complex_double* restrict lapl)
+{
+  /* Grid-relative coordinates in units of the grid spacing. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  double uz = z*spline->z_grid.delta_inv;
+  double ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modf (ux, &ipartx);  int ix = (int) ipartx;
+  ty = modf (uy, &iparty);  int iy = (int) iparty;
+  tz = modf (uz, &ipartz);  int iz = (int) ipartz;
+
+  double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4],
+    d2a[4], d2b[4], d2c[4];
+  complex_double cP[16], dcP[16], bcP[4], dbcP[4], d2bcP[4], bdcP[4];
+  /* Powers of the fractional offsets: [t^3, t^2, t, 1]. */
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* x-axis weights: value (a), first (da) and second (d2a) derivative. */
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+  d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);
+  d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
+  d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
+  d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
+
+  /* y-axis weights. */
+  b[0]  = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1]  = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
+  d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
+  d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
+  d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
+
+  /* z-axis weights. */
+  c[0]  = ( Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);
+  c[1]  = ( Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
+  c[2]  = ( Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
+  c[3]  = ( Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
+  dc[0] = (dAd[ 1]*tpz[1] + dAd[ 2]*tpz[2] + dAd[ 3]*tpz[3]);
+  dc[1] = (dAd[ 5]*tpz[1] + dAd[ 6]*tpz[2] + dAd[ 7]*tpz[3]);
+  dc[2] = (dAd[ 9]*tpz[1] + dAd[10]*tpz[2] + dAd[11]*tpz[3]);
+  dc[3] = (dAd[13]*tpz[1] + dAd[14]*tpz[2] + dAd[15]*tpz[3]);
+  d2c[0] = (d2Ad[ 2]*tpz[2] + d2Ad[ 3]*tpz[3]);
+  d2c[1] = (d2Ad[ 6]*tpz[2] + d2Ad[ 7]*tpz[3]);
+  d2c[2] = (d2Ad[10]*tpz[2] + d2Ad[11]*tpz[3]);
+  d2c[3] = (d2Ad[14]*tpz[2] + d2Ad[15]*tpz[3]);
+
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+  /* P(i,j,k): coefficient at offset (i,j,k) from the cell origin. */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  /* Contract along z with the value weights; cP is indexed as 4*i+j. */
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+
+  /* Contract along z with the first-derivative weights. */
+  dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
+  dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
+  dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
+  dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
+  dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
+  dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
+  dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
+  dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
+  dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
+  dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
+  dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
+  dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
+  dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
+  dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
+  dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
+  dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
+
+  /* Contract along y; the prefix names which weights were applied. */
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+
+  bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
+  bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
+  bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
+  bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
+
+  d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
+  d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
+  d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
+  d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
+
+
+  *val    =
+    ( a[0]*bcP[0] +  a[1]*bcP[1] +  a[2]*bcP[2] +  a[3]*bcP[3]);
+
+  /* Derivatives are taken in grid units, so each one is rescaled by the
+     inverse spacing of its axis. */
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0]*bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = spline->y_grid.delta_inv *
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = spline->z_grid.delta_inv *
+    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
+
+  /* Laplacian: x term + y term + z term, each scaled by delta_inv^2.  The
+     z term is contracted directly from the coefficients. */
+  *lapl =
+    spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3])
+
+    + spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3])
+
+    + spline->z_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*(b[0]*(P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3])+
+           b[1]*(P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3])+
+           b[2]*(P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3])+
+           b[3]*(P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]))+
+     a[1]*(b[0]*(P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3])+
+           b[1]*(P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3])+
+           b[2]*(P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3])+
+           b[3]*(P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]))+
+     a[2]*(b[0]*(P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3])+
+           b[1]*(P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3])+
+           b[2]*(P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3])+
+           b[3]*(P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]))+
+     a[3]*(b[0]*(P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3])+
+           b[1]*(P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3])+
+           b[2]*(P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3])+
+           b[3]*(P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3])));
+#undef P
+
+}
+
+
+
+
+
+/* Value, gradient, and Hessian */
+/*
+ * Evaluate a 3D spline with complex double-precision coefficients at the
+ * point (x, y, z).
+ *
+ * Outputs:
+ *   *val        spline value
+ *   grad[0..2]  first derivatives along x, y and z
+ *   hess[0..8]  second derivatives as a 3x3 matrix stored row by row
+ *               (axis order x, y, z); the mixed entries are mirrored, so
+ *               hess[3]==hess[1], hess[6]==hess[2] and hess[7]==hess[5].
+ *
+ * Each coordinate is shifted by the grid start, scaled by delta_inv and
+ * capped from above just below the grid's num value before modf() splits it
+ * into an integer cell index and a fractional offset.  The 4x4x4 block of
+ * coefficients starting at that cell is contracted with per-axis weight
+ * vectors built from the tables Ad, dAd and d2Ad (defined outside this
+ * function; presumably the cubic basis matrix and its derivative forms).
+ *
+ * NOTE(review): only the upper bound is enforced; a point below the grid
+ * start yields negative indices -- confirm callers never pass one.
+ */
+inline void
+eval_UBspline_3d_z_vgh (UBspline_3d_z * restrict spline,
+                        double x, double y, double z,
+                        complex_double* restrict val,
+                        complex_double* restrict grad,
+                        complex_double* restrict hess)
+{
+  /* Grid-relative coordinates in units of the grid spacing. */
+  x -= spline->x_grid.start;
+  y -= spline->y_grid.start;
+  z -= spline->z_grid.start;
+  double ux = x*spline->x_grid.delta_inv;
+  double uy = y*spline->y_grid.delta_inv;
+  double uz = z*spline->z_grid.delta_inv;
+  /* Cap each coordinate slightly below num so the cell index stays < num. */
+  ux = fmin (ux, (double)(spline->x_grid.num)-1.0e-5);
+  uy = fmin (uy, (double)(spline->y_grid.num)-1.0e-5);
+  uz = fmin (uz, (double)(spline->z_grid.num)-1.0e-5);
+  double ipartx, iparty, ipartz, tx, ty, tz;
+  tx = modf (ux, &ipartx);  int ix = (int) ipartx;
+  ty = modf (uy, &iparty);  int iy = (int) iparty;
+  tz = modf (uz, &ipartz);  int iz = (int) ipartz;
+
+  double tpx[4], tpy[4], tpz[4], a[4], b[4], c[4], da[4], db[4], dc[4],
+    d2a[4], d2b[4], d2c[4];
+  complex_double cP[16], dcP[16], d2cP[16], bcP[4], dbcP[4],
+    d2bcP[4], dbdcP[4], bd2cP[4], bdcP[4];
+  /* Powers of the fractional offsets: [t^3, t^2, t, 1]. */
+  tpx[0] = tx*tx*tx;  tpx[1] = tx*tx;  tpx[2] = tx;  tpx[3] = 1.0;
+  tpy[0] = ty*ty*ty;  tpy[1] = ty*ty;  tpy[2] = ty;  tpy[3] = 1.0;
+  tpz[0] = tz*tz*tz;  tpz[1] = tz*tz;  tpz[2] = tz;  tpz[3] = 1.0;
+  complex_double* restrict coefs = spline->coefs;
+
+  /* x-axis weights: value (a), first (da) and second (d2a) derivative. */
+  a[0]   = (  Ad[ 0]*tpx[0] +   Ad[ 1]*tpx[1] +  Ad[ 2]*tpx[2] + Ad[ 3]*tpx[3]);
+  a[1]   = (  Ad[ 4]*tpx[0] +   Ad[ 5]*tpx[1] +  Ad[ 6]*tpx[2] + Ad[ 7]*tpx[3]);
+  a[2]   = (  Ad[ 8]*tpx[0] +   Ad[ 9]*tpx[1] +  Ad[10]*tpx[2] + Ad[11]*tpx[3]);
+  a[3]   = (  Ad[12]*tpx[0] +   Ad[13]*tpx[1] +  Ad[14]*tpx[2] + Ad[15]*tpx[3]);
+  da[0]  = ( dAd[ 1]*tpx[1] +  dAd[ 2]*tpx[2] + dAd[ 3]*tpx[3]);
+  da[1]  = ( dAd[ 5]*tpx[1] +  dAd[ 6]*tpx[2] + dAd[ 7]*tpx[3]);
+  da[2]  = ( dAd[ 9]*tpx[1] +  dAd[10]*tpx[2] + dAd[11]*tpx[3]);
+  da[3]  = ( dAd[13]*tpx[1] +  dAd[14]*tpx[2] + dAd[15]*tpx[3]);
+  d2a[0] = (d2Ad[ 2]*tpx[2] + d2Ad[ 3]*tpx[3]);
+  d2a[1] = (d2Ad[ 6]*tpx[2] + d2Ad[ 7]*tpx[3]);
+  d2a[2] = (d2Ad[10]*tpx[2] + d2Ad[11]*tpx[3]);
+  d2a[3] = (d2Ad[14]*tpx[2] + d2Ad[15]*tpx[3]);
+
+  /* y-axis weights. */
+  b[0]  = ( Ad[ 0]*tpy[0] + Ad[ 1]*tpy[1] + Ad[ 2]*tpy[2] + Ad[ 3]*tpy[3]);
+  b[1]  = ( Ad[ 4]*tpy[0] + Ad[ 5]*tpy[1] + Ad[ 6]*tpy[2] + Ad[ 7]*tpy[3]);
+  b[2]  = ( Ad[ 8]*tpy[0] + Ad[ 9]*tpy[1] + Ad[10]*tpy[2] + Ad[11]*tpy[3]);
+  b[3]  = ( Ad[12]*tpy[0] + Ad[13]*tpy[1] + Ad[14]*tpy[2] + Ad[15]*tpy[3]);
+  db[0] = (dAd[ 1]*tpy[1] + dAd[ 2]*tpy[2] + dAd[ 3]*tpy[3]);
+  db[1] = (dAd[ 5]*tpy[1] + dAd[ 6]*tpy[2] + dAd[ 7]*tpy[3]);
+  db[2] = (dAd[ 9]*tpy[1] + dAd[10]*tpy[2] + dAd[11]*tpy[3]);
+  db[3] = (dAd[13]*tpy[1] + dAd[14]*tpy[2] + dAd[15]*tpy[3]);
+  d2b[0] = (d2Ad[ 2]*tpy[2] + d2Ad[ 3]*tpy[3]);
+  d2b[1] = (d2Ad[ 6]*tpy[2] + d2Ad[ 7]*tpy[3]);
+  d2b[2] = (d2Ad[10]*tpy[2] + d2Ad[11]*tpy[3]);
+  d2b[3] = (d2Ad[14]*tpy[2] + d2Ad[15]*tpy[3]);
+
+  /* z-axis weights. */
+  c[0]  = ( Ad[ 0]*tpz[0] + Ad[ 1]*tpz[1] + Ad[ 2]*tpz[2] + Ad[ 3]*tpz[3]);
+  c[1]  = ( Ad[ 4]*tpz[0] + Ad[ 5]*tpz[1] + Ad[ 6]*tpz[2] + Ad[ 7]*tpz[3]);
+  c[2]  = ( Ad[ 8]*tpz[0] + Ad[ 9]*tpz[1] + Ad[10]*tpz[2] + Ad[11]*tpz[3]);
+  c[3]  = ( Ad[12]*tpz[0] + Ad[13]*tpz[1] + Ad[14]*tpz[2] + Ad[15]*tpz[3]);
+  dc[0] = (dAd[ 1]*tpz[1] + dAd[ 2]*tpz[2] + dAd[ 3]*tpz[3]);
+  dc[1] = (dAd[ 5]*tpz[1] + dAd[ 6]*tpz[2] + dAd[ 7]*tpz[3]);
+  dc[2] = (dAd[ 9]*tpz[1] + dAd[10]*tpz[2] + dAd[11]*tpz[3]);
+  dc[3] = (dAd[13]*tpz[1] + dAd[14]*tpz[2] + dAd[15]*tpz[3]);
+  d2c[0] = (d2Ad[ 2]*tpz[2] + d2Ad[ 3]*tpz[3]);
+  d2c[1] = (d2Ad[ 6]*tpz[2] + d2Ad[ 7]*tpz[3]);
+  d2c[2] = (d2Ad[10]*tpz[2] + d2Ad[11]*tpz[3]);
+  d2c[3] = (d2Ad[14]*tpz[2] + d2Ad[15]*tpz[3]);
+
+  int xs = spline->x_stride;
+  int ys = spline->y_stride;
+  /* P(i,j,k): coefficient at offset (i,j,k) from the cell origin; the
+     largest offset read is (ix+3)*xs + (iy+3)*ys + iz+3. */
+#define P(i,j,k) coefs[(ix+(i))*xs+(iy+(j))*ys+(iz+(k))]
+  /* Contract along z with the value weights; cP is indexed as 4*i+j. */
+  cP[ 0] = (P(0,0,0)*c[0]+P(0,0,1)*c[1]+P(0,0,2)*c[2]+P(0,0,3)*c[3]);
+  cP[ 1] = (P(0,1,0)*c[0]+P(0,1,1)*c[1]+P(0,1,2)*c[2]+P(0,1,3)*c[3]);
+  cP[ 2] = (P(0,2,0)*c[0]+P(0,2,1)*c[1]+P(0,2,2)*c[2]+P(0,2,3)*c[3]);
+  cP[ 3] = (P(0,3,0)*c[0]+P(0,3,1)*c[1]+P(0,3,2)*c[2]+P(0,3,3)*c[3]);
+  cP[ 4] = (P(1,0,0)*c[0]+P(1,0,1)*c[1]+P(1,0,2)*c[2]+P(1,0,3)*c[3]);
+  cP[ 5] = (P(1,1,0)*c[0]+P(1,1,1)*c[1]+P(1,1,2)*c[2]+P(1,1,3)*c[3]);
+  cP[ 6] = (P(1,2,0)*c[0]+P(1,2,1)*c[1]+P(1,2,2)*c[2]+P(1,2,3)*c[3]);
+  cP[ 7] = (P(1,3,0)*c[0]+P(1,3,1)*c[1]+P(1,3,2)*c[2]+P(1,3,3)*c[3]);
+  cP[ 8] = (P(2,0,0)*c[0]+P(2,0,1)*c[1]+P(2,0,2)*c[2]+P(2,0,3)*c[3]);
+  cP[ 9] = (P(2,1,0)*c[0]+P(2,1,1)*c[1]+P(2,1,2)*c[2]+P(2,1,3)*c[3]);
+  cP[10] = (P(2,2,0)*c[0]+P(2,2,1)*c[1]+P(2,2,2)*c[2]+P(2,2,3)*c[3]);
+  cP[11] = (P(2,3,0)*c[0]+P(2,3,1)*c[1]+P(2,3,2)*c[2]+P(2,3,3)*c[3]);
+  cP[12] = (P(3,0,0)*c[0]+P(3,0,1)*c[1]+P(3,0,2)*c[2]+P(3,0,3)*c[3]);
+  cP[13] = (P(3,1,0)*c[0]+P(3,1,1)*c[1]+P(3,1,2)*c[2]+P(3,1,3)*c[3]);
+  cP[14] = (P(3,2,0)*c[0]+P(3,2,1)*c[1]+P(3,2,2)*c[2]+P(3,2,3)*c[3]);
+  cP[15] = (P(3,3,0)*c[0]+P(3,3,1)*c[1]+P(3,3,2)*c[2]+P(3,3,3)*c[3]);
+
+  /* Contract along z with the first-derivative weights. */
+  dcP[ 0] = (P(0,0,0)*dc[0]+P(0,0,1)*dc[1]+P(0,0,2)*dc[2]+P(0,0,3)*dc[3]);
+  dcP[ 1] = (P(0,1,0)*dc[0]+P(0,1,1)*dc[1]+P(0,1,2)*dc[2]+P(0,1,3)*dc[3]);
+  dcP[ 2] = (P(0,2,0)*dc[0]+P(0,2,1)*dc[1]+P(0,2,2)*dc[2]+P(0,2,3)*dc[3]);
+  dcP[ 3] = (P(0,3,0)*dc[0]+P(0,3,1)*dc[1]+P(0,3,2)*dc[2]+P(0,3,3)*dc[3]);
+  dcP[ 4] = (P(1,0,0)*dc[0]+P(1,0,1)*dc[1]+P(1,0,2)*dc[2]+P(1,0,3)*dc[3]);
+  dcP[ 5] = (P(1,1,0)*dc[0]+P(1,1,1)*dc[1]+P(1,1,2)*dc[2]+P(1,1,3)*dc[3]);
+  dcP[ 6] = (P(1,2,0)*dc[0]+P(1,2,1)*dc[1]+P(1,2,2)*dc[2]+P(1,2,3)*dc[3]);
+  dcP[ 7] = (P(1,3,0)*dc[0]+P(1,3,1)*dc[1]+P(1,3,2)*dc[2]+P(1,3,3)*dc[3]);
+  dcP[ 8] = (P(2,0,0)*dc[0]+P(2,0,1)*dc[1]+P(2,0,2)*dc[2]+P(2,0,3)*dc[3]);
+  dcP[ 9] = (P(2,1,0)*dc[0]+P(2,1,1)*dc[1]+P(2,1,2)*dc[2]+P(2,1,3)*dc[3]);
+  dcP[10] = (P(2,2,0)*dc[0]+P(2,2,1)*dc[1]+P(2,2,2)*dc[2]+P(2,2,3)*dc[3]);
+  dcP[11] = (P(2,3,0)*dc[0]+P(2,3,1)*dc[1]+P(2,3,2)*dc[2]+P(2,3,3)*dc[3]);
+  dcP[12] = (P(3,0,0)*dc[0]+P(3,0,1)*dc[1]+P(3,0,2)*dc[2]+P(3,0,3)*dc[3]);
+  dcP[13] = (P(3,1,0)*dc[0]+P(3,1,1)*dc[1]+P(3,1,2)*dc[2]+P(3,1,3)*dc[3]);
+  dcP[14] = (P(3,2,0)*dc[0]+P(3,2,1)*dc[1]+P(3,2,2)*dc[2]+P(3,2,3)*dc[3]);
+  dcP[15] = (P(3,3,0)*dc[0]+P(3,3,1)*dc[1]+P(3,3,2)*dc[2]+P(3,3,3)*dc[3]);
+
+  /* Contract along z with the second-derivative weights. */
+  d2cP[ 0] = (P(0,0,0)*d2c[0]+P(0,0,1)*d2c[1]+P(0,0,2)*d2c[2]+P(0,0,3)*d2c[3]);
+  d2cP[ 1] = (P(0,1,0)*d2c[0]+P(0,1,1)*d2c[1]+P(0,1,2)*d2c[2]+P(0,1,3)*d2c[3]);
+  d2cP[ 2] = (P(0,2,0)*d2c[0]+P(0,2,1)*d2c[1]+P(0,2,2)*d2c[2]+P(0,2,3)*d2c[3]);
+  d2cP[ 3] = (P(0,3,0)*d2c[0]+P(0,3,1)*d2c[1]+P(0,3,2)*d2c[2]+P(0,3,3)*d2c[3]);
+  d2cP[ 4] = (P(1,0,0)*d2c[0]+P(1,0,1)*d2c[1]+P(1,0,2)*d2c[2]+P(1,0,3)*d2c[3]);
+  d2cP[ 5] = (P(1,1,0)*d2c[0]+P(1,1,1)*d2c[1]+P(1,1,2)*d2c[2]+P(1,1,3)*d2c[3]);
+  d2cP[ 6] = (P(1,2,0)*d2c[0]+P(1,2,1)*d2c[1]+P(1,2,2)*d2c[2]+P(1,2,3)*d2c[3]);
+  d2cP[ 7] = (P(1,3,0)*d2c[0]+P(1,3,1)*d2c[1]+P(1,3,2)*d2c[2]+P(1,3,3)*d2c[3]);
+  d2cP[ 8] = (P(2,0,0)*d2c[0]+P(2,0,1)*d2c[1]+P(2,0,2)*d2c[2]+P(2,0,3)*d2c[3]);
+  d2cP[ 9] = (P(2,1,0)*d2c[0]+P(2,1,1)*d2c[1]+P(2,1,2)*d2c[2]+P(2,1,3)*d2c[3]);
+  d2cP[10] = (P(2,2,0)*d2c[0]+P(2,2,1)*d2c[1]+P(2,2,2)*d2c[2]+P(2,2,3)*d2c[3]);
+  d2cP[11] = (P(2,3,0)*d2c[0]+P(2,3,1)*d2c[1]+P(2,3,2)*d2c[2]+P(2,3,3)*d2c[3]);
+  d2cP[12] = (P(3,0,0)*d2c[0]+P(3,0,1)*d2c[1]+P(3,0,2)*d2c[2]+P(3,0,3)*d2c[3]);
+  d2cP[13] = (P(3,1,0)*d2c[0]+P(3,1,1)*d2c[1]+P(3,1,2)*d2c[2]+P(3,1,3)*d2c[3]);
+  d2cP[14] = (P(3,2,0)*d2c[0]+P(3,2,1)*d2c[1]+P(3,2,2)*d2c[2]+P(3,2,3)*d2c[3]);
+  d2cP[15] = (P(3,3,0)*d2c[0]+P(3,3,1)*d2c[1]+P(3,3,2)*d2c[2]+P(3,3,3)*d2c[3]);
+
+  /* Contract along y; the prefix names which weights were applied. */
+  bcP[0] = ( b[0]*cP[ 0] + b[1]*cP[ 1] + b[2]*cP[ 2] + b[3]*cP[ 3]);
+  bcP[1] = ( b[0]*cP[ 4] + b[1]*cP[ 5] + b[2]*cP[ 6] + b[3]*cP[ 7]);
+  bcP[2] = ( b[0]*cP[ 8] + b[1]*cP[ 9] + b[2]*cP[10] + b[3]*cP[11]);
+  bcP[3] = ( b[0]*cP[12] + b[1]*cP[13] + b[2]*cP[14] + b[3]*cP[15]);
+
+  dbcP[0] = ( db[0]*cP[ 0] + db[1]*cP[ 1] + db[2]*cP[ 2] + db[3]*cP[ 3]);
+  dbcP[1] = ( db[0]*cP[ 4] + db[1]*cP[ 5] + db[2]*cP[ 6] + db[3]*cP[ 7]);
+  dbcP[2] = ( db[0]*cP[ 8] + db[1]*cP[ 9] + db[2]*cP[10] + db[3]*cP[11]);
+  dbcP[3] = ( db[0]*cP[12] + db[1]*cP[13] + db[2]*cP[14] + db[3]*cP[15]);
+
+  bdcP[0] = ( b[0]*dcP[ 0] + b[1]*dcP[ 1] + b[2]*dcP[ 2] + b[3]*dcP[ 3]);
+  bdcP[1] = ( b[0]*dcP[ 4] + b[1]*dcP[ 5] + b[2]*dcP[ 6] + b[3]*dcP[ 7]);
+  bdcP[2] = ( b[0]*dcP[ 8] + b[1]*dcP[ 9] + b[2]*dcP[10] + b[3]*dcP[11]);
+  bdcP[3] = ( b[0]*dcP[12] + b[1]*dcP[13] + b[2]*dcP[14] + b[3]*dcP[15]);
+
+  bd2cP[0] = ( b[0]*d2cP[ 0] + b[1]*d2cP[ 1] + b[2]*d2cP[ 2] + b[3]*d2cP[ 3]);
+  bd2cP[1] = ( b[0]*d2cP[ 4] + b[1]*d2cP[ 5] + b[2]*d2cP[ 6] + b[3]*d2cP[ 7]);
+  bd2cP[2] = ( b[0]*d2cP[ 8] + b[1]*d2cP[ 9] + b[2]*d2cP[10] + b[3]*d2cP[11]);
+  bd2cP[3] = ( b[0]*d2cP[12] + b[1]*d2cP[13] + b[2]*d2cP[14] + b[3]*d2cP[15]);
+
+  d2bcP[0] = ( d2b[0]*cP[ 0] + d2b[1]*cP[ 1] + d2b[2]*cP[ 2] + d2b[3]*cP[ 3]);
+  d2bcP[1] = ( d2b[0]*cP[ 4] + d2b[1]*cP[ 5] + d2b[2]*cP[ 6] + d2b[3]*cP[ 7]);
+  d2bcP[2] = ( d2b[0]*cP[ 8] + d2b[1]*cP[ 9] + d2b[2]*cP[10] + d2b[3]*cP[11]);
+  d2bcP[3] = ( d2b[0]*cP[12] + d2b[1]*cP[13] + d2b[2]*cP[14] + d2b[3]*cP[15]);
+
+  dbdcP[0] = ( db[0]*dcP[ 0] + db[1]*dcP[ 1] + db[2]*dcP[ 2] + db[3]*dcP[ 3]);
+  dbdcP[1] = ( db[0]*dcP[ 4] + db[1]*dcP[ 5] + db[2]*dcP[ 6] + db[3]*dcP[ 7]);
+  dbdcP[2] = ( db[0]*dcP[ 8] + db[1]*dcP[ 9] + db[2]*dcP[10] + db[3]*dcP[11]);
+  dbdcP[3] = ( db[0]*dcP[12] + db[1]*dcP[13] + db[2]*dcP[14] + db[3]*dcP[15]);
+
+  /* Final contraction along x.  Derivatives are taken in grid units, so
+     each one is rescaled by the inverse spacing of the axes involved. */
+  *val = a[0]*bcP[0] + a[1]*bcP[1] + a[2]*bcP[2] + a[3]*bcP[3];
+  grad[0] = spline->x_grid.delta_inv *
+    (da[0] *bcP[0] + da[1]*bcP[1] + da[2]*bcP[2] + da[3]*bcP[3]);
+  grad[1] = spline->y_grid.delta_inv *
+    (a[0]*dbcP[0] + a[1]*dbcP[1] + a[2]*dbcP[2] + a[3]*dbcP[3]);
+  grad[2] = spline->z_grid.delta_inv *
+    (a[0]*bdcP[0] + a[1]*bdcP[1] + a[2]*bdcP[2] + a[3]*bdcP[3]);
+  // d2x
+  hess[0] = spline->x_grid.delta_inv * spline->x_grid.delta_inv *
+    (d2a[0]*bcP[0] + d2a[1]*bcP[1] + d2a[2]*bcP[2] + d2a[3]*bcP[3]);
+  // dx dy
+  hess[1] = spline->x_grid.delta_inv * spline->y_grid.delta_inv *
+    (da[0]*dbcP[0] + da[1]*dbcP[1] + da[2]*dbcP[2] + da[3]*dbcP[3]);
+  hess[3] = hess[1];
+  // dx dz;
+  hess[2] = spline->x_grid.delta_inv * spline->z_grid.delta_inv *
+    (da[0]*bdcP[0] + da[1]*bdcP[1] + da[2]*bdcP[2] + da[3]*bdcP[3]);
+  hess[6] = hess[2];
+  // d2y
+  hess[4] = spline->y_grid.delta_inv * spline->y_grid.delta_inv *
+    (a[0]*d2bcP[0] + a[1]*d2bcP[1] + a[2]*d2bcP[2] + a[3]*d2bcP[3]);
+  // dy dz
+  hess[5] = spline->y_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*dbdcP[0] + a[1]*dbdcP[1] + a[2]*dbdcP[2] + a[3]*dbdcP[3]);
+  hess[7] = hess[5];
+  // d2z
+  hess[8] = spline->z_grid.delta_inv * spline->z_grid.delta_inv *
+    (a[0]*bd2cP[0] + a[1]*bd2cP[1] + a[2]*bd2cP[2] + a[3]*bd2cP[3]);
+#undef P
+
+}
+
+#endif
--- a/src/einspline/bspline_structs.h
+++ b/src/einspline/bspline_structs.h
@ -0,0 +1,158 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef BSPLINE_STRUCTS_STD_H
+#define BSPLINE_STRUCTS_STD_H
+
+///////////////////////////
+// Single precision real //
+///////////////////////////
+// 1D uniform B-spline with single-precision real (float) coefficients.
+typedef struct
+{
+  // Tag of type spline_code; presumably identifies the spline kind --
+  // confirm against the spline_code declaration.
+  spline_code spcode;
+  // Tag of type type_code; presumably identifies the coefficient data type.
+  type_code    tcode;
+  // Coefficient storage (restrict-qualified pointer).
+  float* restrict coefs;
+  // Grid descriptor for the x axis.
+  Ugrid x_grid;
+  // Presumably the boundary conditions along x -- confirm against BCtype_s.
+  BCtype_s xBC;
+} UBspline_1d_s;
+
+// 2D uniform B-spline with single-precision real (float) coefficients.
+typedef struct
+{
+  spline_code      spcode;    // tag; presumably the spline kind
+  type_code        tcode;     // tag; presumably the coefficient data type
+  float* restrict  coefs;     // coefficient storage
+  int              x_stride;  // presumably elements of coefs per x step -- confirm
+  Ugrid            x_grid;    // grid descriptor, x axis
+  Ugrid            y_grid;    // grid descriptor, y axis
+  BCtype_s         xBC;       // presumably boundary conditions along x
+  BCtype_s         yBC;       // presumably boundary conditions along y
+} UBspline_2d_s;
+
+// 3D uniform B-spline with single-precision real (float) coefficients.
+typedef struct
+{
+  spline_code      spcode;    // tag; presumably the spline kind
+  type_code        tcode;     // tag; presumably the coefficient data type
+  float* restrict  coefs;     // coefficient storage
+  int              x_stride;  // presumably elements of coefs per x step -- confirm
+  int              y_stride;  // presumably elements of coefs per y step -- confirm
+  Ugrid            x_grid;    // grid descriptor, x axis
+  Ugrid            y_grid;    // grid descriptor, y axis
+  Ugrid            z_grid;    // grid descriptor, z axis
+  BCtype_s         xBC;       // presumably boundary conditions along x
+  BCtype_s         yBC;       // presumably boundary conditions along y
+  BCtype_s         zBC;       // presumably boundary conditions along z
+} UBspline_3d_s;
+
+
+///////////////////////////
+// Double precision real //
+///////////////////////////
+// 1D uniform B-spline with double-precision real (double) coefficients.
+typedef struct
+{
+  // Tag of type spline_code; presumably identifies the spline kind --
+  // confirm against the spline_code declaration.
+  spline_code spcode;
+  // Tag of type type_code; presumably identifies the coefficient data type.
+  type_code    tcode;
+  // Coefficient storage (restrict-qualified pointer).
+  double* restrict coefs;
+  // Grid descriptor for the x axis.
+  Ugrid x_grid;
+  // Presumably the boundary conditions along x -- confirm against BCtype_d.
+  BCtype_d xBC;
+} UBspline_1d_d;
+
+// 2D uniform B-spline with double-precision real (double) coefficients.
+typedef struct
+{
+  spline_code       spcode;    // tag; presumably the spline kind
+  type_code         tcode;     // tag; presumably the coefficient data type
+  double* restrict  coefs;     // coefficient storage
+  int               x_stride;  // presumably elements of coefs per x step -- confirm
+  Ugrid             x_grid;    // grid descriptor, x axis
+  Ugrid             y_grid;    // grid descriptor, y axis
+  BCtype_d          xBC;       // presumably boundary conditions along x
+  BCtype_d          yBC;       // presumably boundary conditions along y
+} UBspline_2d_d;
+
+// 3D uniform B-spline with double-precision real (double) coefficients.
+typedef struct
+{
+  spline_code       spcode;    // tag; presumably the spline kind
+  type_code         tcode;     // tag; presumably the coefficient data type
+  double* restrict  coefs;     // coefficient storage
+  int               x_stride;  // presumably elements of coefs per x step -- confirm
+  int               y_stride;  // presumably elements of coefs per y step -- confirm
+  Ugrid             x_grid;    // grid descriptor, x axis
+  Ugrid             y_grid;    // grid descriptor, y axis
+  Ugrid             z_grid;    // grid descriptor, z axis
+  BCtype_d          xBC;       // presumably boundary conditions along x
+  BCtype_d          yBC;       // presumably boundary conditions along y
+  BCtype_d          zBC;       // presumably boundary conditions along z
+} UBspline_3d_d;
+
+
+
+//////////////////////////////
+// Single precision complex //
+//////////////////////////////
+// 1D uniform B-spline with single-precision complex (complex_float)
+// coefficients.
+typedef struct
+{
+  // Tag of type spline_code; presumably identifies the spline kind --
+  // confirm against the spline_code declaration.
+  spline_code spcode;
+  // Tag of type type_code; presumably identifies the coefficient data type.
+  type_code    tcode;
+  // Coefficient storage (restrict-qualified pointer).
+  complex_float* restrict coefs;
+  // Grid descriptor for the x axis.
+  Ugrid x_grid;
+  // Presumably the boundary conditions along x -- confirm against BCtype_c.
+  BCtype_c xBC;
+} UBspline_1d_c;
+
+// 2D uniform B-spline with single-precision complex (complex_float)
+// coefficients.
+typedef struct
+{
+  spline_code              spcode;    // tag; presumably the spline kind
+  type_code                tcode;     // tag; presumably the coefficient data type
+  complex_float* restrict  coefs;     // coefficient storage
+  int                      x_stride;  // presumably elements of coefs per x step -- confirm
+  Ugrid                    x_grid;    // grid descriptor, x axis
+  Ugrid                    y_grid;    // grid descriptor, y axis
+  BCtype_c                 xBC;       // presumably boundary conditions along x
+  BCtype_c                 yBC;       // presumably boundary conditions along y
+} UBspline_2d_c;
+
+// 3D uniform B-spline with single-precision complex (complex_float)
+// coefficients.
+typedef struct
+{
+  spline_code              spcode;    // tag; presumably the spline kind
+  type_code                tcode;     // tag; presumably the coefficient data type
+  complex_float* restrict  coefs;     // coefficient storage
+  int                      x_stride;  // presumably elements of coefs per x step -- confirm
+  int                      y_stride;  // presumably elements of coefs per y step -- confirm
+  Ugrid                    x_grid;    // grid descriptor, x axis
+  Ugrid                    y_grid;    // grid descriptor, y axis
+  Ugrid                    z_grid;    // grid descriptor, z axis
+  BCtype_c                 xBC;       // presumably boundary conditions along x
+  BCtype_c                 yBC;       // presumably boundary conditions along y
+  BCtype_c                 zBC;       // presumably boundary conditions along z
+} UBspline_3d_c;
+
+
+//////////////////////////////
+// Double precision complex //
+//////////////////////////////
+// 1D uniform B-spline with double-precision complex (complex_double)
+// coefficients.
+typedef struct
+{
+  // Tag of type spline_code; presumably identifies the spline kind --
+  // confirm against the spline_code declaration.
+  spline_code spcode;
+  // Tag of type type_code; presumably identifies the coefficient data type.
+  type_code    tcode;
+  // Coefficient storage (restrict-qualified pointer).
+  complex_double* restrict coefs;
+  // Grid descriptor for the x axis.
+  Ugrid x_grid;
+  // Presumably the boundary conditions along x -- confirm against BCtype_z.
+  BCtype_z xBC;
+} UBspline_1d_z;
+
+// 2D uniform B-spline with double-precision complex (complex_double)
+// coefficients.
+typedef struct
+{
+  spline_code               spcode;    // tag; presumably the spline kind
+  type_code                 tcode;     // tag; presumably the coefficient data type
+  complex_double* restrict  coefs;     // coefficient storage
+  int                       x_stride;  // presumably elements of coefs per x step -- confirm
+  Ugrid                     x_grid;    // grid descriptor, x axis
+  Ugrid                     y_grid;    // grid descriptor, y axis
+  BCtype_z                  xBC;       // presumably boundary conditions along x
+  BCtype_z                  yBC;       // presumably boundary conditions along y
+} UBspline_2d_z;
+
+// 3D uniform B-spline with double-precision complex (complex_double)
+// coefficients.  The evaluation routines for this type read the coefficient
+// at cell (ix, iy, iz) as coefs[ix*x_stride + iy*y_stride + iz] and use the
+// start, delta_inv and num members of each grid descriptor.
+typedef struct
+{
+  spline_code               spcode;    // tag; presumably the spline kind
+  type_code                 tcode;     // tag; presumably the coefficient data type
+  complex_double* restrict  coefs;     // coefficient storage
+  int                       x_stride;  // elements of coefs per unit step in x
+  int                       y_stride;  // elements of coefs per unit step in y
+  Ugrid                     x_grid;    // grid descriptor, x axis
+  Ugrid                     y_grid;    // grid descriptor, y axis
+  Ugrid                     z_grid;    // grid descriptor, z axis
+  BCtype_z                  xBC;       // presumably boundary conditions along x
+  BCtype_z                  yBC;       // presumably boundary conditions along y
+  BCtype_z                  zBC;       // presumably boundary conditions along z
+} UBspline_3d_z;
+
+
+#endif
--- a/src/einspline/bspline_structs_cuda.h
+++ b/src/einspline/bspline_structs_cuda.h
@ -0,0 +1,90 @@
+#ifndef BSPLINE_STRUCTS_CUDA_H
+#define BSPLINE_STRUCTS_CUDA_H
+
+#include "bspline_base_cuda.h"
+
+#define SPLINE_BLOCK_SIZE 64
+
+////////
+// 2D //
+////////
+
+// Fallback definitions of double3 and double4, compiled only when
+// CUDA_VERSION is below 3000 (presumably toolkits that do not ship these
+// vector types -- confirm).
+// NOTE(review): an undefined CUDA_VERSION evaluates to 0 in this test, so
+// the fallbacks are also compiled in that case; verify that CUDA_VERSION is
+// always defined before this header is included.
+#if CUDA_VERSION < 3000
+typedef struct
+{
+  double x,y,z;
+} double3;
+
+typedef struct
+{
+  double x,y,z,w;
+} double4;
+#endif
+
+// 2D spline record with float coefficients, named for the CUDA code path.
+typedef struct
+{
+  // Coefficient array; presumably a device pointer -- confirm at the
+  // allocation site.
+  float *coefs;
+  // Two-component stride; presumably element strides per axis.
+  uint2 stride;
+  // Presumably the inverse grid spacing per axis.
+  float2 gridInv;
+} UBspline_2d_s_cuda;
+
+// 2D spline record for the CUDA code path whose coefficients are held in two
+// separate float arrays (presumably the real and imaginary parts of complex
+// single-precision data, going by the member names).
+typedef struct
+{
+  float  *coefs_real;  // first coefficient array
+  float  *coefs_imag;  // second coefficient array
+  uint2   stride;      // presumably element strides per axis
+  float2  gridInv;     // presumably inverse grid spacing per axis
+} UBspline_2d_c_cuda;
+
+// 2D spline record with double coefficients, named for the CUDA code path.
+typedef struct
+{
+  // Coefficient array; presumably a device pointer -- confirm at the
+  // allocation site.
+  double *coefs;
+  // Two-component stride; presumably element strides per axis.
+  uint2 stride;
+  // Two doubles; presumably the inverse grid spacing per axis.
+  double gridInv[2];
+} UBspline_2d_d_cuda;
+
+// 2D spline record with complex_double coefficients, named for the CUDA code
+// path.
+typedef struct
+{
+  // Coefficient array; presumably a device pointer -- confirm at the
+  // allocation site.
+  complex_double *coefs;
+  // Two-component stride; presumably element strides per axis.
+  uint2 stride;
+  // Two doubles; presumably the inverse grid spacing per axis.
+  double gridInv[2];
+} UBspline_2d_z_cuda;
+
+////////
+// 3D //
+////////
+
+// 3D spline record with float coefficients, named for the CUDA code path.
+typedef struct
+{
+  // Coefficient array; presumably a device pointer -- confirm at the
+  // allocation site.
+  float *coefs;
+  // Three-component stride; presumably element strides per axis.
+  uint3 stride;
+  // Presumably the inverse grid spacing per axis.
+  float3 gridInv;
+  // Presumably the number of grid points (or coefficients) per axis --
+  // confirm where the record is filled in.
+  uint3 dim;
+} UBspline_3d_s_cuda;
+
+// 3D spline record with complex_float coefficients, named for the CUDA code
+// path.
+typedef struct
+{
+  // Coefficient array; presumably a device pointer -- confirm at the
+  // allocation site.
+  complex_float *coefs;
+  // Three-component stride; presumably element strides per axis.
+  uint3 stride;
+  // Presumably the inverse grid spacing per axis.
+  float3 gridInv;
+  // Presumably the number of grid points (or coefficients) per axis --
+  // confirm where the record is filled in.
+  uint3 dim;
+} UBspline_3d_c_cuda;
+
+// 3D spline record with double coefficients, named for the CUDA code path.
+typedef struct
+{
+  // Coefficient array; presumably a device pointer -- confirm at the
+  // allocation site.
+  double *coefs;
+  // Three-component stride; presumably element strides per axis.
+  uint3 stride;
+  // Presumably the inverse grid spacing per axis.
+  double3 gridInv;
+  // Presumably the number of grid points (or coefficients) per axis --
+  // confirm where the record is filled in.
+  uint3 dim;
+} UBspline_3d_d_cuda;
+
+// 3D spline record with complex_double coefficients, named for the CUDA code
+// path.
+typedef struct
+{
+  // Coefficient array; presumably a device pointer -- confirm at the
+  // allocation site.
+  complex_double *coefs;
+  // Three-component stride; presumably element strides per axis.
+  uint3 stride;
+  // Presumably the inverse grid spacing per axis.
+  double3 gridInv;
+  // Presumably the number of grid points (or coefficients) per axis --
+  // confirm where the record is filled in.
+  uint3 dim;
+} UBspline_3d_z_cuda;
+
+
+
+#endif
--- a/src/einspline/config.h.cmake.in
+++ b/src/einspline/config.h.cmake.in
@ -0,0 +1,86 @@
+//
+//See the LICENSE file in the top-level directory for copyright notices
+//
+// Template for the einspline configuration header, meant to be expanded by
+// CMake's configure_file().  Every cmakedefine directive below turns into a
+// define of the named macro when the CMake variable of the same name holds a
+// true value at configure time, and into a commented-out undef otherwise.
+// Keep the literal directive and at-sign tokens out of comments in this
+// file: configure_file() would rewrite them as well.
+#ifndef EINSPLINE_CONFIGURATION_H
+#define EINSPLINE_CONFIGURATION_H
+
+/* Define to 1 if you have fftw */
+#cmakedefine HAVE_LIBFFTW @HAVE_LIBFFTW@
+
+/* Define if sincos function exists */
+#cmakedefine HAVE_SINCOS @HAVE_SINCOS@
+
+/* Define if std::round function exists */
+#cmakedefine HAVE_STD_ROUND @HAVE_STD_ROUND@
+
+/* Define if floor function exists */
+#cmakedefine HAVE_FLOOR @HAVE_FLOOR@
+
+/* Define if posix_memalign function exists */
+#cmakedefine HAVE_POSIX_MEMALIGN @HAVE_POSIX_MEMALIGN@
+
+/* Define if pow function exists */
+#cmakedefine HAVE_POW @HAVE_POW@
+
+/* Define if sqrt function exists */
+#cmakedefine HAVE_SQRT @HAVE_SQRT@
+
+/* Define if dlfcn.h exists */
+#cmakedefine HAVE_DLFCN_H @HAVE_DLFCN_H@
+
+/* Define if inttypes.h exists */
+#cmakedefine HAVE_INTTYPES_H @HAVE_INTTYPES_H@
+
+/* Define if memory.h exists */
+#cmakedefine HAVE_MEMORY_H @HAVE_MEMORY_H@
+
+/* Define if pmmintrin.h (SSE3 intrinsics header) exists */
+#cmakedefine HAVE_PMMINTRIN_H @HAVE_PMMINTRIN_H@
+
+/* Define if emmintrin.h (SSE2 intrinsics header) exists */
+#cmakedefine HAVE_EMMINTRIN_H @HAVE_EMMINTRIN_H@
+
+/* Define if sys/stat.h exists */
+#cmakedefine HAVE_SYS_STAT_H @HAVE_SYS_STAT_H@
+
+/* Define if sys/time.h exists */
+#cmakedefine HAVE_SYS_TIME_H @HAVE_SYS_TIME_H@
+
+/* Define if sys/types.h exists */
+#cmakedefine HAVE_SYS_TYPES_H @HAVE_SYS_TYPES_H@
+
+/* Define if unistd.h exists */
+#cmakedefine HAVE_UNISTD_H @HAVE_UNISTD_H@
+
+/* Define if mmx support exists */
+#cmakedefine HAVE_MMX @HAVE_MMX@
+
+/* Define if sse support exists */
+#cmakedefine HAVE_SSE @HAVE_SSE@
+
+/* Define if sse2 support exists */
+#cmakedefine HAVE_SSE2 @HAVE_SSE2@
+
+/* Define if sse3 support exists */
+#cmakedefine HAVE_SSE3 @HAVE_SSE3@
+
+/* Define if ssse3 support exists */
+#cmakedefine HAVE_SSSE3 @HAVE_SSSE3@
+
+/* Define if C variable-length array support exists */
+#cmakedefine HAVE_C_VARARRAYS @HAVE_C_VARARRAYS@
+
+/* Prefetch loop lead distance.
+   NOTE(review): a configured value of 0 counts as false for this kind of
+   directive, so the macro would be left undefined rather than set to 0 --
+   confirm that 0 is never a meaningful setting. */
+#cmakedefine PREFETCH_AHEAD @PREFETCH_AHEAD@
+
+/* Use SSE prefetch  */
+#cmakedefine USE_PREFETCH @USE_PREFETCH@
+
+///* Define to `__inline__' or `__inline' if that's what the C compiler
+//   calls it, or to nothing if 'inline' is not supported under any name.  */
+//#ifndef __cplusplus
+//#undef inline
+//#endif
+
+// End of the EINSPLINE_CONFIGURATION_H include guard.
+#endif 
+
--- a/src/einspline/cuda_walker.cu
+++ b/src/einspline/cuda_walker.cu
@ -0,0 +1,454 @@
+#include "cuda_walker.h"
+#include "determinant_update.h"
+#include <unistd.h>
+
+// Default constructor: builds an empty determinant (N == 0) that owns no
+// device storage.  Every pointer member, including Atran, starts out NULL;
+// call resize() to allocate the buffers.
+cuda_determinant::cuda_determinant() :
+  N(0), A(NULL), Atran(NULL), Ainv(NULL), Ainv_delta(NULL), Ainv_colk(NULL),
+  new_row(NULL), delta(NULL)
+{
+}
+
+// Sizing constructor: allocates device storage for an n x n determinant.
+// The members are first put into the same empty state as the default
+// constructor, then resize() is called with the *argument* n (the previous
+// code passed the still-uninitialised member N).
+cuda_determinant::cuda_determinant(int n) :
+  N(0), A(NULL), Atran(NULL), Ainv(NULL), Ainv_delta(NULL), Ainv_colk(NULL),
+  new_row(NULL), delta(NULL)
+{
+  resize(n);
+}
+
+// Records n as the matrix dimension and allocates the device buffers:
+// two N x N float matrices (A, Ainv) and four length-N float vectors
+// (Ainv_delta, Ainv_colk, new_row, delta).
+// The cudaMalloc return codes are not inspected, any pointers already held
+// are overwritten without cudaFree, and Atran is not allocated here.
+void
+cuda_determinant::resize(int n)
+{
+  N = n;
+
+  const size_t matrix_bytes = N*N*sizeof(float);
+  const size_t vector_bytes = 1*N*sizeof(float);
+
+  // Square matrices
+  cudaMalloc((void**)&A   , matrix_bytes);
+  cudaMalloc((void**)&Ainv, matrix_bytes);
+
+  // Work vectors
+  cudaMalloc((void**)&Ainv_delta, vector_bytes);
+  cudaMalloc((void**)&Ainv_colk , vector_bytes);
+  cudaMalloc((void**)&new_row   , vector_bytes);
+  cudaMalloc((void**)&delta     , vector_bytes);
+}
+
+// Stores the electron counts (index 0 = nup, index 1 = ndown) and resizes the
+// matching determinant for each of them.
+void
+cuda_walker::resize(int nup, int ndown)
+{
+  const int counts[2] = { nup, ndown };
+  for (int spin=0; spin<2; spin++) {
+    N[spin] = counts[spin];
+    dets[spin].resize(N[spin]);
+  }
+}
+
+
+
+// Builds a population able to hold up to MaxPop (1000) walkers: sizes the host
+// staging vectors and allocates the matching device arrays.
+//   - the *_vec / *_list_d pairs hold one device pointer per walker;
+//   - ratio_vec / ratios_d hold one float per walker;
+//   - pos_vec / pos_d hold FOUR floats per walker, which is the stride that
+//     calc_new_row() uses when it fills and uploads positions.
+// The cudaMalloc return codes are not inspected.
+cuda_population::cuda_population() : MaxPop(1000)
+{
+  // Give the remaining plain members a defined value.
+  num_elecs[0] = num_elecs[1] = 0;
+  multi_spline = NULL;
+
+  A_vec.resize(MaxPop);
+  Ainv_vec.resize(MaxPop);
+  delta_vec.resize(MaxPop);
+  Ainv_delta_vec.resize(MaxPop);
+  Ainv_colk_vec.resize(MaxPop);
+  ratio_vec.resize(MaxPop);
+  // Was 3*MaxPop, which is smaller than the 4-per-walker layout written by
+  // calc_new_row() and allocated for pos_d below.
+  pos_vec.resize(4*MaxPop);
+
+
+  cudaMalloc((void**) &A_list_d,          MaxPop*sizeof(float*));
+  cudaMalloc((void**) &Ainv_list_d,       MaxPop*sizeof(float*));
+  cudaMalloc((void**) &Ainv_delta_list_d, MaxPop*sizeof(float*));
+  cudaMalloc((void**) &Ainv_colk_list_d,  MaxPop*sizeof(float*));
+  cudaMalloc((void**) &delta_list_d,      MaxPop*sizeof(float*));
+  cudaMalloc((void**) &ratios_d,          MaxPop*sizeof(float));
+  cudaMalloc((void**) &pos_d,           4*MaxPop*sizeof(float));
+}
+
+
+// Forward declarations of the two determinant-update kernels that
+// cuda_population::update_determinants() launches further down.
+// NOTE(review): determinant_update.h, included at the top of this file,
+// already defines update_inverse_cuda1 with this parameter list, so these
+// declarations look redundant -- confirm before removing them.
+__global__ static void
+update_inverse_cuda1 (float *A_g[], float *Ainv_g[], float *u_g[], float *Ainv_delta_g[],
+		      float *Ainv_colk_g[], int N, int rowstride, int k);
+__global__ static void
+update_inverse_cuda2 (float *Ainv_g[], float *u_g[], float *Ainv_delta_g[],
+		      float *Ainv_colk_g[], int N, int rowstride, int k);
+
+
+// Launches the spline-evaluation kernel for electron `elec` over every walker.
+// For each walker the electron's three coordinates are packed into pos_vec
+// with a stride of four floats, and the walker's `delta` device pointer (for
+// the determinant the electron belongs to) is stored in delta_vec.  Both
+// staging arrays are then uploaded and the kernel is started on a grid of
+// (N/SPLINE_BLOCK_SIZE) x (number of walkers) blocks.
+void
+cuda_population::calc_new_row(int elec)
+{
+  const int detnum = (elec < num_elecs[0]) ? 0 : 1;
+  const int N      = num_elecs[detnum];
+
+  for (int wi=0; wi<walkers.size(); wi++) {
+    cuda_walker &walker = walkers[wi];
+    for (int dim=0; dim<3; dim++)
+      pos_vec[4*wi+dim] = walker.R[3*elec+dim];
+    delta_vec[wi] = walker.dets[detnum].delta;
+  }
+
+  // Upload the packed positions and the per-walker output pointers.
+  cudaMemcpy(pos_d, &(pos_vec[0]), 4*walkers.size()*sizeof(float),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(delta_list_d, &(delta_vec[0]), walkers.size()*sizeof(float*),
+             cudaMemcpyHostToDevice);
+
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid (N/SPLINE_BLOCK_SIZE, walkers.size());
+
+  eval_multi_multi_UBspline_3d_s_cuda<<<dimGrid,dimBlock>>>
+    (pos_d, multi_spline->gridInv, multi_spline->coefs,
+     delta_list_d, multi_spline->stride);
+}
+
+
+// Applies the inverse update for electron `elec` to every walker whose
+// `accept` flag is set.  The device pointers of the accepted walkers are
+// gathered contiguously into the host staging vectors, uploaded, and the two
+// update kernels are launched over (N/DET_BLOCK_SIZE) x (accepted walkers)
+// blocks.  `row` is the electron's index inside its own determinant.
+void
+cuda_population::update_determinants(int elec)
+{
+  const bool first_det = elec < num_elecs[0];
+  const int  detnum    = first_det ? 0 : 1;
+  const int  N         = num_elecs[detnum];
+  const int  row       = first_det ? elec : elec - num_elecs[0];
+
+  // Compact the accepted walkers to the front of the staging vectors.
+  int num_accept = 0;
+  for (int wi=0; wi<walkers.size(); wi++) {
+    cuda_walker &walker = walkers[wi];
+    cuda_determinant &det = walker.dets[detnum];
+    if (!walker.accept)
+      continue;
+    A_vec[num_accept]          = det.A;
+    Ainv_vec[num_accept]       = det.Ainv;
+    Ainv_delta_vec[num_accept] = det.Ainv_delta;
+    Ainv_colk_vec[num_accept]  = det.Ainv_colk;
+    delta_vec[num_accept]      = det.delta;
+    num_accept++;
+  }
+
+  const size_t list_bytes = num_accept*sizeof(float*);
+  cudaMemcpy (A_list_d,          &(A_vec[0]),          list_bytes,
+              cudaMemcpyHostToDevice);
+  cudaMemcpy (Ainv_list_d,       &(Ainv_vec[0]),       list_bytes,
+              cudaMemcpyHostToDevice);
+  cudaMemcpy (Ainv_delta_list_d, &(Ainv_delta_vec[0]), list_bytes,
+              cudaMemcpyHostToDevice);
+  cudaMemcpy (Ainv_colk_list_d,  &(Ainv_colk_vec[0]),  list_bytes,
+              cudaMemcpyHostToDevice);
+  cudaMemcpy (delta_list_d,      &(delta_vec[0]),      list_bytes,
+              cudaMemcpyHostToDevice);
+
+  dim3 dimBlock(DET_BLOCK_SIZE);
+  dim3 dimGrid (N/DET_BLOCK_SIZE, num_accept);
+
+  update_inverse_cuda1<<<dimGrid,dimBlock>>>
+    (A_list_d, Ainv_list_d, delta_list_d, Ainv_delta_list_d,
+     Ainv_colk_list_d, N, N, row);
+  update_inverse_cuda2<<<dimGrid,dimBlock>>>
+    (Ainv_list_d, delta_list_d, Ainv_delta_list_d,
+     Ainv_colk_list_d, N, N, row);
+}
+
+#define RATIO_BLOCK_SIZE 128
+
+// Gathers, for every matrix in the batch, the new row and column k of Ainv
+// into two matrix-interleaved arrays: element `row` of matrix `col` lands at
+// index row_stride*row + col of new_row_tran / Ainv_tran.
+// One thread handles one matrix; threads whose global index is >= num_mats
+// do nothing.
+__global__ void
+calc_ratios1 (float *Ainv_list[], float *new_row_list[],
+              float *Ainv_tran, float *new_row_tran,
+              int N, int k, int row_stride, int num_mats)
+{
+  const int col = threadIdx.x + blockIdx.x*RATIO_BLOCK_SIZE;
+  if (col >= num_mats)
+    return;
+
+  float *Ainv    = Ainv_list[col];
+  float *new_row = new_row_list[col];
+  for (int row=0; row<N; row++) {
+    const int dst = row_stride*row + col;
+    new_row_tran[dst] = new_row[row];
+    Ainv_tran[dst]    = Ainv[row*N + k];
+  }
+}
+
+
+// For each matrix in the batch (one thread block per matrix, selected by
+// blockIdx.x) computes the dot product of column `elec` of Ainv with the
+// proposed new row and stores it in ratio[blockIdx.x].
+//
+// Must be launched with exactly RATIO_BLOCK_SIZE threads per block.  Every
+// thread takes part in every barrier and in the reduction; rows >= N simply
+// contribute zero, and rows beyond RATIO_BLOCK_SIZE are folded in by a
+// strided loop, so N need not equal the block size.  (The previous version
+// called __syncthreads() inside a thread-dependent branch and reduced over
+// shared slots that were never written when N < RATIO_BLOCK_SIZE.)
+__global__ void
+calc_ratios (float *Ainv_list[], float *new_row_list[],
+             float *ratio, int N, int row_stride, int elec)
+{
+  // N is the same for the whole block, so this early exit is uniform.  As
+  // before, nothing is written to ratio[] when there are no rows.
+  if (N <= 0)
+    return;
+
+  int tid = threadIdx.x;
+  __shared__ float *Ainv, *new_row;
+  __shared__ float Ainv_new_row[RATIO_BLOCK_SIZE];
+
+  if (tid == 0) {
+    Ainv = Ainv_list[blockIdx.x];
+    new_row = new_row_list[blockIdx.x];
+  }
+  __syncthreads();
+
+  // This read of Ainv is *highly* uncoalesced, but we just have to eat it to
+  // allow other kernels to operate quickly.
+  float partial = 0.0f;
+  for (int row=tid; row<N; row+=RATIO_BLOCK_SIZE)
+    partial += Ainv[row*row_stride + elec] * new_row[row];
+  Ainv_new_row[tid] = partial;
+  __syncthreads();
+
+  // Tree reduction of the per-thread partial sums.
+  for (unsigned int s=RATIO_BLOCK_SIZE/2; s>0; s>>=1) {
+    if (tid < s)
+      Ainv_new_row[tid] += Ainv_new_row[tid + s];
+    __syncthreads();
+  }
+  if (tid == 0)
+    ratio[blockIdx.x] = Ainv_new_row[0];
+}
+
+
+// Alternative ratio kernel: one thread block per matrix (blockIdx.x).  The new
+// row is staged through shared memory RATIO_BLOCK_SIZE elements at a time and
+// thread 0 accumulates the dot product of column `elec` of Ainv with it,
+// writing the result to ratio[blockIdx.x].
+//
+// Must be launched with RATIO_BLOCK_SIZE threads per block.  Only the first
+// (N/RATIO_BLOCK_SIZE)*RATIO_BLOCK_SIZE rows are summed, as before.
+//
+// Fixes relative to the previous version:
+//  - the Ainv row index now follows the inner loop variable; it used to be
+//    derived from tid, which is always 0 in the only thread that sums, so the
+//    result was not a dot product;
+//  - a barrier after the inner loop keeps the other threads from refilling
+//    new_row_shared while thread 0 is still reading it.
+__global__ void
+calc_ratios2 (float *Ainv_list[], float *new_row_list[],
+              float *ratio, int N, int row_stride, int elec)
+{
+  int tid = threadIdx.x;
+  __shared__ float *Ainv, *new_row;
+  __shared__ float new_row_shared[RATIO_BLOCK_SIZE];
+  if (tid == 0) {
+    Ainv = Ainv_list[blockIdx.x];
+    new_row = new_row_list[blockIdx.x];
+  }
+  __syncthreads();
+
+  int numBlocks = N/RATIO_BLOCK_SIZE;
+  float sum=0.0;
+  for (int block=0; block<numBlocks; block++) {
+    int base = block*RATIO_BLOCK_SIZE;
+    new_row_shared[tid] = new_row[base+tid];
+    __syncthreads();
+    if (tid==0)
+      for (int i=0; i<RATIO_BLOCK_SIZE; i++)
+        sum += Ainv[(base+i)*row_stride + elec] * new_row_shared[i];
+    __syncthreads();
+  }
+  if (tid==0)
+    ratio[blockIdx.x] = sum;
+}
+
+// LAPACK LU factorisation with partial pivoting (Fortran calling convention).
+extern "C" void 
+dgetrf_(int *m, int *n, double A[], int *lda, int ipiv[], int *info);
+
+// Returns the determinant of the N x N matrix A, leaving A untouched.
+// A copy of A is LU-factorised with dgetrf_; the determinant is the product
+// of the diagonal of the factorisation, negated once for every row
+// interchange recorded in ipiv.  dgetrf_ reads the copy as column-major, i.e.
+// as the transpose of a row-major A, which has the same determinant.
+// An empty matrix (N <= 0) yields 1.  A singular matrix yields 0 through the
+// zero pivot on the diagonal.  An illegal-argument report from dgetrf_
+// terminates the program.
+// The scratch storage is heap-allocated: the previous variable-length stack
+// arrays are not standard C++ and their size grew with N*N.
+double 
+Determinant (double *A, int N)
+{
+  if (N <= 0)
+    return 1.0;
+
+  std::vector<double> LU(A, A + N*N);
+  std::vector<int> ipiv(N);
+  int info = 0;
+
+  // Do LU factorization
+  dgetrf_ (&N, &N, &(LU[0]), &N, &(ipiv[0]), &info);
+  if (info < 0) {
+    fprintf (stderr, "Determinant: dgetrf_ rejected argument %d.\n", -info);
+    exit(1);
+  }
+
+  double det = 1.0;
+  int numPerm = 0;
+  for (int i=0; i<N; i++) {
+    det *= LU[i*N+i];
+    // ipiv is 1-based; an entry different from its own index is a row swap.
+    numPerm += (ipiv[i] != (i+1));
+  }
+  if (numPerm & 1)
+    det *= -1.0;
+  
+  return det;
+}
+
+
+// Replaces the n x n row-major matrix A with its inverse, in place, using
+// Gauss-Jordan elimination with full pivoting.
+//   - n <= 0 is a no-op; n == 2 uses the closed-form 2x2 inverse (no
+//     singularity check is made on that path).
+//   - n larger than maxSize (2000) is rejected with an error message and
+//     exit(1); the pivot bookkeeping arrays below have exactly maxSize
+//     entries and were previously indexed without this check.
+//   - a zero pivot (singular matrix) prints a message and calls exit(1).
+// ipiv[c] is -1 while column c has not been used as a pivot and becomes 0
+// once it has.
+template<typename T> void 
+GJInverse (T *A, int n)
+{
+  const int maxSize = 2000;
+
+  if (n <= 0)
+    return;
+  if (n > maxSize) {
+    fprintf (stderr, "GJInverse: matrix size %d exceeds the maximum of %d.\n",
+             n, maxSize);
+    exit(1);
+  }
+
+  if (n == 2) { // Special case for 2x2
+    T a=A[0]; T b=A[1];
+    T c=A[2]; T d=A[3];
+    T detInv = 1.0/(a*d-b*c);
+    A[0] = d*detInv;
+    A[1] = -b*detInv;
+    A[2] = -c*detInv;
+    A[3] =  a*detInv;
+    return;
+  }
+
+  int colIndex[maxSize], rowIndex[maxSize], ipiv[maxSize];
+  T big, pivInv;
+  // Initialised so they hold valid indices even if no entry passes the
+  // pivot comparison (e.g. a matrix full of NaNs).
+  int icol = 0, irow = 0;
+  
+  for (int j=0; j<n; j++)
+    ipiv[j] = -1;
+
+  for (int i=0; i<n; i++) {
+    // Search the not-yet-pivoted rows and columns for the largest entry.
+    big = 0.0;
+    for (int j=0; j<n; j++) 
+      if (ipiv[j] != 0)
+        for (int k=0; k<n; k++) {
+          if (ipiv[k] == -1) {
+            if (fabs(A[n*j+k]) >= big) {
+              big = fabs(A[n*j+k]);
+              irow = j; 
+              icol = k;
+            }
+          }
+          else if (ipiv[k] > 0) {
+            fprintf (stderr, "GJInverse: Singular matrix!\n");
+            exit(1);
+          }
+        }
+    ++(ipiv[icol]); 
+    
+    // Move the pivot onto the diagonal by swapping rows irow and icol.
+    if (irow != icol) 
+      for (int l=0; l<n; l++) {
+        T tmp = A[n*irow+l];
+        A[n*irow+l] = A[n*icol+l];
+        A[n*icol+l] = tmp;
+      }
+    
+    rowIndex[i] = irow;
+    colIndex[i] = icol;
+    if (A[n*icol+icol] == 0.0) { 
+      fprintf (stderr, "GJInverse: Singular matrix!\n");
+      exit(1);
+    }
+    // Normalise the pivot row, then eliminate the pivot column elsewhere.
+    pivInv = 1.0/A[n*icol+icol];
+    A[n*icol+icol] = 1.0;
+    for (int l=0; l<n; l++)
+      A[n*icol+l] *= pivInv;
+    for (int ll=0; ll<n; ll++)
+      if (ll != icol) {
+        T dum = A[n*ll+icol];
+        A[n*ll+icol] = 0.0;
+        for (int l=0; l<n; l++)
+          A[n*ll+l] -= A[n*icol+l]*dum;
+      }
+  }
+  // Now unscramble the permutations: undo the row swaps, in reverse order,
+  // as column swaps.
+  for (int l=n-1; l>=0; l--) {
+    if (rowIndex[l] != colIndex[l])
+      for (int k=0; k<n ; k++) {
+        T tmp = A[n*k+rowIndex[l]];
+        A[n*k+rowIndex[l]] = A[n*k+colIndex[l]];
+        A[n*k+colIndex[l]] = tmp;
+      }
+  }
+}
+
+
+
+
+// Check and timing of the calc_ratios kernel.
+// One random N x N matrix and one random replacement row are generated.  On
+// the host the ratio det(A with row `elec` replaced) / det(A) is computed and
+// printed.  The single-precision inverse and the row are copied to every one
+// of numWalkers device matrices, calc_ratios is launched 10*N times over the
+// whole batch, and the value produced for walker 331 is printed for
+// comparison together with the launch rate.
+// N equals RATIO_BLOCK_SIZE (128), the block size used for the launch.
+void
+test_ratio()
+{
+  const int N = 128;
+  const int numWalkers = 1024;
+  const int elec = 15;
+  float **AinvList, **uList;
+  float **AinvList_d, **uList_d, *ratio_d;
+
+  // Host arrays holding the per-walker *device* pointers.
+  AinvList = (float**) malloc(numWalkers*sizeof(float*));
+  uList    = (float**) malloc(numWalkers*sizeof(float*));
+
+  for (int i=0; i<numWalkers; i++) {
+    cudaMalloc((void**)&(AinvList[i]), N*N*sizeof(float));
+    cudaMalloc((void**)&(uList[i]),      N*sizeof(float));
+  }
+
+  fprintf (stderr, "N = %d\n", N);
+    
+  cudaMalloc((void**)&(AinvList_d), numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(uList_d),    numWalkers*sizeof(float*));
+  cudaMalloc((void**)&ratio_d,      numWalkers*sizeof(float));
+
+  cudaMemcpy (AinvList_d, AinvList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy (   uList_d,    uList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+
+  dim3 dimBlock(RATIO_BLOCK_SIZE);
+  dim3 dimGrid(numWalkers);
+
+  double *A   = (double*)malloc(N*N*sizeof(double));
+  float *Ainv = (float*) malloc(N*N*sizeof(float));
+  float *u    = (float*) malloc(N*sizeof(float));
+  for (int i=0; i<N; i++) {
+    u[i] = drand48();
+    for (int j=0; j<N; j++) 
+      A[i*N+j] = Ainv[i*N+j] = (float)drand48();
+  }
+
+  // Host reference: determinant before and after replacing row `elec`.
+  GJInverse(Ainv, N);
+  double det1 = Determinant (A, N);
+  for (int i=0; i<N; i++)
+    A[elec*N+i] = u[i];
+  double det2 = Determinant (A, N);
+  fprintf (stderr, "Host ratio = %1.8f\n", det2/det1);
+
+  for (int wi=0; wi<numWalkers; wi++) {
+    cudaMemcpy (AinvList[wi], Ainv, N*N*sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy (uList[wi],       u, 1*N*sizeof(float), cudaMemcpyHostToDevice);
+  }
+
+  clock_t start = clock();
+  for (int i=0; i<10*N; i++) 
+    calc_ratios<<<dimGrid,dimBlock>>> (AinvList_d, uList_d, ratio_d, N, N, elec);
+  // Kernel launches return immediately; wait for the device so the measured
+  // interval covers execution and not just launch overhead.
+  cudaDeviceSynchronize();
+  clock_t end = clock();
+
+  float ratio;
+  cudaMemcpy (&ratio, &(ratio_d[331]), sizeof(float), cudaMemcpyDeviceToHost);
+  fprintf (stderr, "Device ratio = %1.8f\n", ratio);
+
+  // 10*N launches were timed; N launches are counted as one generation.
+  double time = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  double rate = 10.0/time;
+  fprintf (stderr, "Rate = %1.3f generations per second.\n", rate);
+
+  // Release everything allocated above.
+  for (int i=0; i<numWalkers; i++) {
+    cudaFree (AinvList[i]);
+    cudaFree (uList[i]);
+  }
+  cudaFree (AinvList_d);
+  cudaFree (uList_d);
+  cudaFree (ratio_d);
+  free (AinvList);
+  free (uList);
+  free (A);
+  free (Ainv);
+  free (u);
+}
+
+
+// Timing of the calc_ratios1 gather kernel: numWalkers device matrices of
+// size N x N (contents left uninitialised, only throughput is measured) are
+// gathered 10*N times into two N x numWalkers float arrays.
+void
+test_ratio1()
+{
+  const int N = 128;
+  const int numWalkers = 1024;
+  float **AinvList, **uList;
+  float **AinvList_d, **uList_d, *ratio_d;
+  float *Ainv_tran, *ratio_tran;
+
+  AinvList = (float**) malloc(numWalkers*sizeof(float*));
+  uList    = (float**) malloc(numWalkers*sizeof(float*));
+  // The kernel writes N*numWalkers floats into each of these arrays; the
+  // sizes used to omit sizeof(float), so the buffers were four times too
+  // small.
+  cudaMalloc ((void**) &Ainv_tran,  N*numWalkers*sizeof(float));
+  cudaMalloc ((void**) &ratio_tran, N*numWalkers*sizeof(float));
+
+  for (int i=0; i<numWalkers; i++) {
+    cudaMalloc((void**)&(AinvList[i]), N*N*sizeof(float));
+    cudaMalloc((void**)&(uList[i]),      N*sizeof(float));
+  }
+
+  fprintf (stderr, "N = %d\n", N);
+    
+  cudaMalloc((void**)&(AinvList_d), numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(uList_d),    numWalkers*sizeof(float*));
+  cudaMalloc((void**)&ratio_d,      numWalkers*sizeof(float));
+
+  cudaMemcpy (AinvList_d, AinvList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy (   uList_d,    uList, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+
+  dim3 dimBlock(RATIO_BLOCK_SIZE);
+  dim3 dimGrid(numWalkers/RATIO_BLOCK_SIZE);
+
+  clock_t start = clock();
+  for (int i=0; i<10*N; i++) 
+    calc_ratios1<<<dimGrid,dimBlock>>> (AinvList_d, uList_d, Ainv_tran, ratio_tran,
+                                        N, 1, numWalkers, numWalkers);
+  // Kernel launches return immediately; wait for the device so the measured
+  // interval covers execution and not just launch overhead.
+  cudaDeviceSynchronize();
+  clock_t end = clock();
+  
+  double time = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  double rate = 10.0/time;
+  fprintf (stderr, "Rate = %1.3f generations per second.\n", rate);
+
+  // Release everything allocated above.
+  for (int i=0; i<numWalkers; i++) {
+    cudaFree (AinvList[i]);
+    cudaFree (uList[i]);
+  }
+  cudaFree (AinvList_d);
+  cudaFree (uList_d);
+  cudaFree (ratio_d);
+  cudaFree (Ainv_tran);
+  cudaFree (ratio_tran);
+  free (AinvList);
+  free (uList);
+}
+
+
+
+// Test entry point: runs the calc_ratios check.  C++ has no implicit int, so
+// the return type is spelled out.
+int
+main()
+{
+  test_ratio();
+  return 0;
+}
--- a/src/einspline/cuda_walker.h
+++ b/src/einspline/cuda_walker.h
@ -0,0 +1,55 @@
+#ifndef CUDA_WALKER_H
+#define CUDA_WALKER_H
+
+#include <vector>
+#include "multi_bspline_cuda_s.h"
+
+// Handles to the device-side storage of one N x N determinant.  The class
+// only stores raw pointers; there is no destructor, so nothing is released
+// automatically.
+class cuda_determinant
+{
+public:
+  // Matrix dimension.
+  int N;
+
+  // Matrix buffers.
+  float *A, *Atran, *Ainv;
+
+  // Length-N work vectors used by the inverse update.
+  float *Ainv_delta, *Ainv_colk;
+  float *new_row, *delta;
+
+  // Sets the dimension and allocates the device buffers.
+  void resize(int N);
+
+  cuda_determinant(int N);
+  cuda_determinant();
+};
+
+
+// One walker: its electron positions, a pair of determinants and an
+// acceptance flag.  Index 0 of N / dets corresponds to the `nup` argument of
+// resize(), index 1 to `ndown`.
+class cuda_walker
+{
+public:
+  // Electron count of each determinant.
+  int N[2];
+
+  // Electron coordinates, read three consecutive floats per electron.
+  float *R;
+
+  cuda_determinant dets[2];
+
+  // Walkers with this flag set are the ones whose determinants get updated.
+  bool accept;
+
+  void resize(int nup, int ndown);
+};
+
+
+// A population of walkers together with the staging buffers used to run the
+// batched spline-evaluation and determinant-update kernels over all of them.
+// Each host vector `X_vec` is the staging copy of the device array
+// `X_list_d` (or `ratios_d` / `pos_d`) and is uploaded with cudaMemcpy before
+// a kernel launch.
+class cuda_population
+{
+private:
+  // Capacity of the staging buffers, in walkers.
+  const int MaxPop;
+
+  // Device arrays holding one device pointer per walker.
+  float **A_list_d, **Ainv_list_d, **delta_list_d;
+  float **Ainv_delta_list_d, **Ainv_colk_list_d;
+
+  // Device arrays of per-walker values.
+  float *ratios_d;
+  float *pos_d;
+
+  // Host-side staging copies of the arrays above.
+  std::vector<float*> A_vec, Ainv_vec, delta_vec,
+    Ainv_delta_vec, Ainv_colk_vec;
+  std::vector<float> ratio_vec, pos_vec;
+
+  // Was spelled `vector<...>` without the std:: qualifier, which only
+  // compiles if some other header injects a using-declaration.
+  std::vector<cuda_walker> walkers;
+
+  // Number of up and down electrons
+  int num_elecs[2];
+
+  multi_UBspline_3d_s_cuda *multi_spline;
+public:
+  // Evaluates the spline for electron `elec` on every walker.
+  void calc_new_row (int elec);
+  // NOTE(review): no definition of this member is visible alongside the
+  // others in cuda_walker.cu -- confirm it is implemented somewhere.
+  void calc_ratios (int elec);
+  // Applies the inverse update for electron `elec` to the accepted walkers.
+  void update_determinants(int elec);
+
+  cuda_population();
+};
+
+#endif
--- a/src/einspline/determinant_update.cu
+++ b/src/einspline/determinant_update.cu
@ -0,0 +1,537 @@
+#define BLOCK_SIZE 64
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+
+// First pass of the batched inverse update.  blockIdx.y selects the matrix;
+// each block covers BLOCK_SIZE consecutive columns and thread `threadIdx.x`
+// owns column col = blockIdx.x*BLOCK_SIZE + threadIdx.x.
+//   - AinvT_u[col] receives  sum_row Ainv[row][col] * u[row];
+//   - the block whose column range contains k also copies column k of Ainv
+//     into Ainv_colk.
+// Must be launched with BLOCK_SIZE threads per block; only the first
+// (N/BLOCK_SIZE)*BLOCK_SIZE rows are processed.
+//
+// The two former code paths are merged.  The path for blocks that do not
+// contain column k had no barrier at the end of a tile, so a fast thread
+// could overwrite u_shared for the next tile while slower threads were still
+// reading the current one; every tile now ends with a barrier.
+__global__ static void
+update_inverse_cuda1 (float *Ainv_g[], float *u_g[], float *AinvT_u_g[],
+                      float *Ainv_colk_g[], int N, int rowstride, int k)
+{
+  __shared__ float *Ainv, *u, *AinvT_u, *Ainv_colk;
+  if (threadIdx.x==0) {
+    Ainv      = Ainv_g[blockIdx.y];
+    u         = u_g[blockIdx.y];
+    AinvT_u   = AinvT_u_g[blockIdx.y];
+    Ainv_colk = Ainv_colk_g[blockIdx.y];
+  }
+
+  __syncthreads();
+
+  __shared__ float Ainv_colk_shared[BLOCK_SIZE];
+  __shared__ float u_shared[BLOCK_SIZE];
+  int col = blockIdx.x*BLOCK_SIZE + threadIdx.x;
+  int numblocks = N / BLOCK_SIZE;
+
+  // Depends on blockIdx only, so it is uniform across the block.
+  bool owns_col_k =
+    (blockIdx.x*BLOCK_SIZE <= k && k < (blockIdx.x+1)*BLOCK_SIZE);
+
+  // Running value of (Ainv^T u)[col] for this thread's column.
+  float sum = 0.0f;
+
+  for (int block=0; block<numblocks; block++) {
+    // Stage one tile of u.
+    u_shared[threadIdx.x] = u[block*BLOCK_SIZE+threadIdx.x];
+    __syncthreads();
+    for (int i=0; i<BLOCK_SIZE; i++) {
+      int row = block*BLOCK_SIZE + i;
+      float ainv = Ainv[row*rowstride+col];
+      // Only the thread that owns column k stages it for the copy below.
+      if (col == k)
+        Ainv_colk_shared[i] = ainv;
+      sum += ainv*u_shared[i];
+    }
+    // Publishes the staged column-k tile and keeps u_shared intact until
+    // every thread has finished reading it.
+    __syncthreads();
+    if (owns_col_k)
+      Ainv_colk[block*BLOCK_SIZE+threadIdx.x] = Ainv_colk_shared[threadIdx.x];
+  }
+
+  // Write the data back to global memory
+  AinvT_u[col] = sum;
+}
+
+// Second pass of the batched inverse update.  Uses the two vectors produced
+// by update_inverse_cuda1 to apply, for every row and this thread's column,
+//   Ainv[row][col] += AinvT_u[col] * prefact * Ainv_colk[row],
+// with prefact = -1/(1 + AinvT_u[k]).
+// blockIdx.y selects the matrix, blockIdx.x the group of BLOCK_SIZE columns.
+// Must be launched with BLOCK_SIZE threads per block; only the first
+// (N/BLOCK_SIZE)*BLOCK_SIZE rows are updated.  u_g is not used.
+//
+// A barrier was added at the end of each tile: without it a fast thread
+// could refill Ainv_colk_shared for the next tile while others were still
+// reading the current one.  The initial load of Ainv_colk_shared, which was
+// overwritten before being read, has been dropped.
+__global__ static void
+update_inverse_cuda2 (float *Ainv_g[], float *u_g[], float *AinvT_u_g[],
+                      float *Ainv_colk_g[], int N, int rowstride, int k)
+{
+  __shared__ float *Ainv, *AinvT_u, *Ainv_colk;
+  if (threadIdx.x==0) {
+    Ainv      = Ainv_g[blockIdx.y];
+    AinvT_u   = AinvT_u_g[blockIdx.y];
+    Ainv_colk = Ainv_colk_g[blockIdx.y];
+  }
+  __syncthreads();
+
+  __shared__ float AinvT_u_shared[BLOCK_SIZE];
+  __shared__ float Ainv_colk_shared[BLOCK_SIZE];
+  int col = blockIdx.x*BLOCK_SIZE + threadIdx.x;
+  // Read the data back from global memory
+  AinvT_u_shared[threadIdx.x] = AinvT_u[col];
+  __shared__ float prefact;
+  if (threadIdx.x == 0)
+    prefact = -1.0f/(1.0f+AinvT_u[k]);
+  __syncthreads();
+
+  int numblocks = N / BLOCK_SIZE;
+  for (int block=0; block<numblocks; block++) {
+    // Stage one tile of the saved column k, already scaled by prefact.
+    Ainv_colk_shared[threadIdx.x] = prefact*Ainv_colk[block*BLOCK_SIZE+threadIdx.x];
+    __syncthreads();
+    for (int i=0; i<BLOCK_SIZE; i++) {
+      int row = block*BLOCK_SIZE + i;
+      Ainv[row*rowstride+col] += AinvT_u_shared[threadIdx.x]*Ainv_colk_shared[i];
+    }
+    // Nobody may refill the tile until everyone has finished reading it.
+    __syncthreads();
+  }
+}
+
+#define NMAX 128
+
+// Single-block variant of the inverse update for one matrix.  Thread
+// `threadIdx.x` owns column threadIdx.x; row k of Ainv and the vector u are
+// cached in shared memory, and each row of Ainv is then rewritten as
+//   Ainv[row][:] += prefact * (Ainv[row][:] . u) * Ainv_k[:],
+// with prefact = -1/(1 + Ainv_k . u).
+// The dot products are reduced through Ainv_u by repeatedly adding adjacent
+// pairs, starting from N/2 pairs.
+// NOTE(review): the shared arrays have NMAX entries and the reduction halves
+// N each step, so this presumably expects blockDim.x == N, N a power of two
+// and N <= NMAX -- confirm against the launch site.
+//
+// Fixes relative to the previous version:
+//  - threads outside the active half of a reduction step no longer store an
+//    uninitialised value into Ainv_u;
+//  - a barrier now separates filling Ainv_u inside the row loop from the
+//    first reduction step that reads other threads' entries.
+__global__ static void
+update_inverse_cuda (float *Ainv, float *u, int N, int rowstride, int k)
+{
+  __shared__ float A_k[NMAX], u_shared[NMAX], Ainv_u[NMAX], Ainv_shared[NMAX];
+  A_k[threadIdx.x] = Ainv[k*rowstride+threadIdx.x];
+  u_shared[threadIdx.x] = u[threadIdx.x];
+
+  // First, compute k'th element of Ainv_u
+  Ainv_u[threadIdx.x] = u_shared[threadIdx.x] * A_k[threadIdx.x];
+  __syncthreads();
+  for (int n=N>>1; n>0; n = n>>1) {
+    float a = 0.0f;
+    if (threadIdx.x < n) 
+      a = Ainv_u[2*threadIdx.x] + Ainv_u[2*threadIdx.x+1];
+    // All reads of this step finish before any entry is overwritten.
+    __syncthreads();
+    if (threadIdx.x < n)
+      Ainv_u[threadIdx.x] = a;
+    __syncthreads();
+  }
+  float prefact = -1.0f/(1.0f + Ainv_u[0]);
+
+  for (int row=0; row<N; row++) {
+    Ainv_shared[threadIdx.x] = Ainv[row*rowstride+threadIdx.x];
+    // Every thread has read Ainv_u[0] from the previous pass by now.
+    __syncthreads();
+    Ainv_u[threadIdx.x] = u_shared[threadIdx.x] * Ainv_shared[threadIdx.x];
+    __syncthreads();
+    for (int n=N>>1; n>0; n = n>>1) {
+      float a = 0.0f;
+      if (threadIdx.x < n) 
+        a = Ainv_u[2*threadIdx.x] + Ainv_u[2*threadIdx.x+1];
+      __syncthreads();
+      if (threadIdx.x < n)
+        Ainv_u[threadIdx.x] = a;
+      __syncthreads();
+    }
+    // Now Ainv_u[0] has the row'th element of Ainv_u.
+    Ainv[row*rowstride + threadIdx.x] = 
+      Ainv_shared[threadIdx.x] + prefact*Ainv_u[0]*A_k[threadIdx.x];
+  }
+
+}
+
+
+
+// __global__ static void
+// update_inverse_cuda (float *AinvT, float *u, int N, int rowstride, int k)
+// {
+//   // Store the product Ainv * u in shared memory
+//   __shared__ float Ainv_u[BLOCK_SIZE], Ainv_u_k[BLOCK_SIZE];
+//   Ainv_u[threadIdx.x] = 0.0;
+//   __syncthreads();
+
+//   for (int row=0; row < N; row++)
+//     Ainv_u[threadIdx.x] += AinvT[row*rowstride+threadIdx.x]*u[row];
+  
+//   // Compute lambda = [A^(-1)]_k dot u
+//   float lambda = 0.0;
+//   for (int i=0; i<N; i += BLOCK_SIZE) {
+//     Ainv_u_k[threadIdx.x] = AinvT[i+threadIdx.x] * u[i+threadIdx.x];
+//     __syncthreads();
+//     for (int j=BLOCK_SIZE>>1; j!=0; j >>=1) {
+//       if (threadIdx.x < j)
+// 	Ainv_u_k[threadIdx.x] = Ainv_u_k[2*threadIdx.x] + Ainv_u_k[2*threadIdx.x+1];
+//       lambda += Ainv_u_k[0];
+//     }
+//     float prefact = 1.0/(1.0+lambda);
+//   }
+
+//   // Now, subtract off outer product
+// }
+
+
+
+
+
+// Host reference implementation of the rank-one inverse update, operating in
+// place on an N x N matrix stored as AinvT.
+//   Ainv_u[i]    = sum_j AinvT[j*N+i] * u[j]
+//   Ainv_rowk[j] = AinvT[N*j+k]           (taken before any modification)
+//   AinvT[j*N+i] -= Ainv_u[i] * Ainv_rowk[j] / (1 + Ainv_u[k])
+// N <= 0 is a no-op.  k is not range-checked.
+// The work vectors are heap-allocated; they used to be fixed 128-element
+// stack arrays that were overrun for N > 128.
+void
+update_inverse (float *AinvT, float *u, int N, int k)
+{
+  if (N <= 0)
+    return;
+
+  float *Ainv_u    = (float*) malloc (N*sizeof(float));
+  float *Ainv_rowk = (float*) malloc (N*sizeof(float));
+  if (Ainv_u == NULL || Ainv_rowk == NULL) {
+    fprintf (stderr, "update_inverse: out of memory.\n");
+    exit(1);
+  }
+  
+  for (int i=0; i<N; i++) {
+    Ainv_u[i] = 0.0f;
+    Ainv_rowk[i] = AinvT[N*i+k];
+    for (int j=0; j<N; j++)
+      Ainv_u[i] += AinvT[j*N+i] * u[j];
+  }
+
+  float prefact = 1.0/(1.0+Ainv_u[k]);
+
+  for (int i=0; i<N; i++)
+    for (int j=0; j<N; j++)
+      AinvT[j*N+i] -= prefact * Ainv_u[i]*Ainv_rowk[j];
+
+  free (Ainv_rowk);
+  free (Ainv_u);
+}
+
+
+
+// Replaces A with its inverse by gauss-jordan elimination with full pivoting
+// Adapted from Numerical Recipes in C
+//   - A is an n x n row-major matrix, overwritten in place.
+//   - n <= 0 is a no-op; n == 2 uses the closed-form 2x2 inverse (no
+//     singularity check is made on that path).
+//   - n larger than maxSize (2000) is rejected with an error message and
+//     exit(1); the pivot bookkeeping arrays below have exactly maxSize
+//     entries and were previously indexed without this check.
+//   - a zero pivot (singular matrix) prints a message and calls exit(1).
+// ipiv[c] is -1 while column c has not been used as a pivot and becomes 0
+// once it has.
+void GJInverse (double *A, int n)
+{
+  const int maxSize = 2000;
+
+  if (n <= 0)
+    return;
+  if (n > maxSize) {
+    fprintf (stderr, "GJInverse: matrix size %d exceeds the maximum of %d.\n",
+             n, maxSize);
+    exit(1);
+  }
+
+  if (n == 2) { // Special case for 2x2
+    double a=A[0]; double b=A[1];
+    double c=A[2]; double d=A[3];
+    double detInv = 1.0/(a*d-b*c);
+    A[0] = d*detInv;
+    A[1] = -b*detInv;
+    A[2] = -c*detInv;
+    A[3] =  a*detInv;
+    return;
+  }
+
+  int colIndex[maxSize], rowIndex[maxSize], ipiv[maxSize];
+  double big, pivInv;
+  // Initialised so they hold valid indices even if no entry passes the
+  // pivot comparison (e.g. a matrix full of NaNs).
+  int icol = 0, irow = 0;
+  
+  for (int j=0; j<n; j++)
+    ipiv[j] = -1;
+
+  for (int i=0; i<n; i++) {
+    // Search the not-yet-pivoted rows and columns for the largest entry.
+    big = 0.0;
+    for (int j=0; j<n; j++) 
+      if (ipiv[j] != 0)
+        for (int k=0; k<n; k++) {
+          if (ipiv[k] == -1) {
+            if (fabs(A[n*j+k]) >= big) {
+              big = fabs(A[n*j+k]);
+              irow = j; 
+              icol = k;
+            }
+          }
+          else if (ipiv[k] > 0) {
+            fprintf (stderr, "GJInverse: Singular matrix!\n");
+            exit(1);
+          }
+        }
+    ++(ipiv[icol]); 
+    
+    // Move the pivot onto the diagonal by swapping rows irow and icol.
+    if (irow != icol) 
+      for (int l=0; l<n; l++) {
+        double tmp = A[n*irow+l];
+        A[n*irow+l] = A[n*icol+l];
+        A[n*icol+l] = tmp;
+      }
+    
+    rowIndex[i] = irow;
+    colIndex[i] = icol;
+    if (A[n*icol+icol] == 0.0) { 
+      fprintf (stderr, "GJInverse: Singular matrix!\n");
+      exit(1);
+    }
+    // Normalise the pivot row, then eliminate the pivot column elsewhere.
+    pivInv = 1.0/A[n*icol+icol];
+    A[n*icol+icol] = 1.0;
+    for (int l=0; l<n; l++)
+      A[n*icol+l] *= pivInv;
+    for (int ll=0; ll<n; ll++)
+      if (ll != icol) {
+        double dum = A[n*ll+icol];
+        A[n*ll+icol] = 0.0;
+        for (int l=0; l<n; l++)
+          A[n*ll+l] -= A[n*icol+l]*dum;
+      }
+  }
+  // Now unscramble the permutations: undo the row swaps, in reverse order,
+  // as column swaps.
+  for (int l=n-1; l>=0; l--) {
+    if (rowIndex[l] != colIndex[l])
+      for (int k=0; k<n ; k++) {
+        double tmp = A[n*k+rowIndex[l]];
+        A[n*k+rowIndex[l]] = A[n*k+colIndex[l]];
+        A[n*k+colIndex[l]] = tmp;
+      }
+  }
+}
+
+
+#define MAT_SIZE 128
+#define NUM_MATS 1000
+
+// Test driver for the batched two-kernel inverse update.
+// One random MAT_SIZE x MAT_SIZE matrix A and a random vector u are generated
+// and A is inverted on the host in double precision.  The single-precision
+// inverse and u are copied to NUM_MATS device matrices, the
+// update_inverse_cuda1 / update_inverse_cuda2 pair is launched over the whole
+// batch, and matrix 1 is read back.  u is then added to row `row` of A on the
+// host and the product (updated inverse) * (updated A) is compared with the
+// identity to within 1e-4; every offending element is reported.
+//
+// Changes relative to the previous version:
+//  - explicit `int` return type (C++ has no implicit int);
+//  - the timer now stops only after the device has finished, and the printed
+//    rates are derived from the number of launches actually performed (the
+//    old formulas assumed 10*N launches while the loop ran once);
+//  - the single-matrix device buffers (one of which was allocated twice) and
+//    the large block of commented-out experiments that were their only users
+//    have been removed; they remain available in version control;
+//  - all allocations are released before returning.
+int
+main()
+{
+  int N = MAT_SIZE;
+  int numMats = NUM_MATS;
+  // Number of times the kernel pair is launched.  The host check below adds
+  // u to A exactly once, so it only matches a single update.
+  const int numUpdates = 1;
+  double *A, *Ainv;
+  float *Ainv_h, *u_h;
+
+  A       = (double*)malloc (N*N*sizeof(double));
+  Ainv    = (double*)malloc (N*N*sizeof(double));
+  Ainv_h  = (float*) malloc (N*N*sizeof(float));
+  u_h     = (float*) malloc (N*sizeof(float));
+
+  // Host arrays holding the per-matrix *device* pointers.
+  float **AinvList, **Ainv_uList,
+    **Ainv_colkList, **uList;
+
+  AinvList      = (float**)malloc(NUM_MATS*sizeof(float*));
+  Ainv_uList    = (float**)malloc(NUM_MATS*sizeof(float*));
+  Ainv_colkList = (float**)malloc(NUM_MATS*sizeof(float*));
+  uList         = (float**)malloc(NUM_MATS*sizeof(float*));
+
+  // Device copies of the pointer lists, as consumed by the kernels.
+  float **AinvList_d, **Ainv_uList_d, **Ainv_colkList_d, **uList_d;
+  cudaMalloc((void**)&AinvList_d,      numMats*sizeof(float*));
+  cudaMalloc((void**)&Ainv_uList_d,    numMats*sizeof(float*));
+  cudaMalloc((void**)&Ainv_colkList_d, numMats*sizeof(float*));
+  cudaMalloc((void**)&uList_d,         numMats*sizeof(float*));
+
+  fprintf (stderr, "N = %d\n", N);
+
+  for (int mat=0; mat<numMats; mat++) {
+    cudaMalloc((void**)&(AinvList[mat])     , N*N*sizeof(float));
+    cudaMalloc((void**)&(Ainv_uList[mat])   , N*sizeof(float));
+    cudaMalloc((void**)&(Ainv_colkList[mat]), N*sizeof(float));
+    cudaMalloc((void**)&(uList[mat])        , N*sizeof(float));
+  }
+
+  fprintf (stderr, "N = %d\n", N);
+
+  cudaMemcpy (AinvList_d, AinvList, numMats*sizeof(float*),
+              cudaMemcpyHostToDevice);
+  cudaMemcpy (Ainv_uList_d, Ainv_uList, numMats*sizeof(float*),
+              cudaMemcpyHostToDevice);
+  cudaMemcpy (Ainv_colkList_d, Ainv_colkList, numMats*sizeof(float*),
+              cudaMemcpyHostToDevice);
+  cudaMemcpy (uList_d, uList, numMats*sizeof(float*),
+              cudaMemcpyHostToDevice);
+
+  srand48((long int) 12341313);
+
+  fprintf (stderr, "N = %d\n", N);
+
+  for (int mat=0; mat<numMats; mat++) {
+    // The random matrix is generated and inverted once; every device matrix
+    // receives the same data.
+    if (mat == 0 ) {
+      for (int i=0; i<N; i++) {
+        u_h[i] = drand48();
+        for (int j=0; j<N; j++)
+          A[i*N+j] = Ainv[i*N+j] = drand48();
+      }
+      GJInverse(Ainv, N);
+      for (int i=0; i<N; i++)
+        for (int j=0; j<N; j++)
+          Ainv_h[i*N+j] = (float)Ainv[i*N+j];
+    }
+
+    cudaMemcpy (AinvList[mat], Ainv_h, N*N*sizeof(float),
+                cudaMemcpyHostToDevice);
+    cudaMemcpy (uList[mat], u_h, N*sizeof(float), cudaMemcpyHostToDevice);
+  }
+
+  dim3 dimBlock2(BLOCK_SIZE);
+  dim3 dimGrid2(N/BLOCK_SIZE, NUM_MATS);
+
+  int row = 1;
+
+  fprintf (stderr, "Before updates.\n");
+  clock_t upStart = clock();
+  for (int i=0; i<numUpdates; i++) {
+    update_inverse_cuda1<<<dimGrid2,dimBlock2>>>
+      (AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
+    update_inverse_cuda2<<<dimGrid2,dimBlock2>>>
+      (AinvList_d, uList_d, Ainv_uList_d, Ainv_colkList_d, N, N, row);
+  }
+  // Kernel launches return immediately; wait for the device so the measured
+  // interval covers execution and not just launch overhead.
+  cudaDeviceSynchronize();
+  clock_t upEnd = clock();
+  double uptime = (double)(upEnd - upStart)/(double)CLOCKS_PER_SEC;
+  // Each launch pair updates every matrix once; N updates of the whole batch
+  // are counted as one generation.
+  double uprate = (double)numUpdates*NUM_MATS/uptime;
+  fprintf (stderr, "%1.2f updates per second.\n", uprate);
+  fprintf (stderr, "%1.3f generations per second.\n",
+           (double)numUpdates/(double)N/uptime);
+
+  cudaMemcpy (Ainv_h, AinvList[1], N*N*sizeof(float),cudaMemcpyDeviceToHost);
+
+  // Apply the same update to A on the host and verify Ainv_h * A == I.
+  for (int i=0; i<N; i++)
+    A[row*N+i] += u_h[i];
+  for (int i=0; i<N; i++)
+    for (int j=0; j<N; j++) {
+      double ident = 0.0;
+      for (int k=0; k<N; k++)
+        ident += Ainv_h[i*N+k]*A[k*N+j];
+      if ((i==j && fabs(ident - 1.0) > 1.0e-4) ||
+          (i!=j && fabs(ident) > 1.0e-4))
+        fprintf (stderr, "Error in matrix inverse, (%d, %d) = %1.8f\n", i, j, ident);
+    }
+  fprintf (stderr, "Finished.\n");
+
+  // Release everything allocated above.
+  for (int mat=0; mat<numMats; mat++) {
+    cudaFree (AinvList[mat]);
+    cudaFree (Ainv_uList[mat]);
+    cudaFree (Ainv_colkList[mat]);
+    cudaFree (uList[mat]);
+  }
+  cudaFree (AinvList_d);
+  cudaFree (Ainv_uList_d);
+  cudaFree (Ainv_colkList_d);
+  cudaFree (uList_d);
+  free (AinvList);
+  free (Ainv_uList);
+  free (Ainv_colkList);
+  free (uList);
+  free (A);
+  free (Ainv);
+  free (Ainv_h);
+  free (u_h);
+
+  return 0;
+}
--- a/src/einspline/determinant_update.h
+++ b/src/einspline/determinant_update.h
@ -0,0 +1,111 @@
+#define DET_BLOCK_SIZE 64
+
+#include <unistd.h>
+#include <stdlib.h>
+
+
+// Kernel 1 of 2 for updating Ainv after row k of A is replaced by u.
+// One matrix is handled per blockIdx.y; the *_g arguments are arrays of
+// per-matrix pointers.  With delta = u - A[k,:], each thread accumulates
+//   Ainv_delta[col] = sum_row Ainv[row*rowstride+col] * delta[row]
+// for its column col = blockIdx.x*DET_BLOCK_SIZE + threadIdx.x.  The thread
+// block whose column range contains k additionally copies column k of Ainv
+// into Ainv_colk.  Row k of A is overwritten with u as a side effect.
+//
+// Only N / DET_BLOCK_SIZE full tiles are processed and no bounds checks are
+// made, so this presumably expects N to be a multiple of DET_BLOCK_SIZE and
+// blockDim.x == DET_BLOCK_SIZE -- TODO confirm against the launch site.
+//
+// NOTE(review): every blockIdx.x block of a matrix both reads the old row k
+// of A and overwrites it with u.  Thread blocks are not ordered with respect
+// to each other, so a block that runs late may read the already-updated row
+// and see delta == 0.  Confirm how callers avoid this.
+__global__ static void
+update_inverse_cuda1 (float *A_g[], float *Ainv_g[], float *u_g[], 
+                      float *Ainv_delta_g[], float *Ainv_colk_g[], 
+                      int N, int rowstride, int k)
+{
+  // Thread 0 looks up this matrix's pointers; the barrier publishes them.
+  __shared__ float *A, *Ainv, *u, *Ainv_delta, *Ainv_colk;
+  if (threadIdx.x==0) {
+    A        = A_g[blockIdx.y];
+    Ainv     = Ainv_g[blockIdx.y];
+    u         = u_g[blockIdx.y];
+    Ainv_delta    = Ainv_delta_g[blockIdx.y];
+    Ainv_colk = Ainv_colk_g[blockIdx.y];
+  }
+
+  __syncthreads();
+
+  // Per-thread accumulator plus staging tiles for u, the old row of A, and
+  // column k of Ainv.
+  __shared__ float Ainv_delta_shared[DET_BLOCK_SIZE], 
+    Ainv_colk_shared[DET_BLOCK_SIZE], u_shared[DET_BLOCK_SIZE],
+    uold_shared[DET_BLOCK_SIZE];
+  Ainv_delta_shared[threadIdx.x] = 0.0;
+  int col = blockIdx.x*DET_BLOCK_SIZE + threadIdx.x;
+  int numblocks = N / DET_BLOCK_SIZE;
+
+  // Both branches below depend only on blockIdx.x and k, so every thread of
+  // a block takes the same branch and the barriers inside are reached by all.
+
+  // If the column I need to pull from Ainv is in this thread block
+  // domain, do the following
+  if (blockIdx.x*DET_BLOCK_SIZE <= k && k < (blockIdx.x+1)*DET_BLOCK_SIZE) {
+    for (int block=0; block<numblocks; block++) {
+      u_shared[threadIdx.x] = u[block*DET_BLOCK_SIZE+threadIdx.x];
+      uold_shared[threadIdx.x] = 
+        A[k*rowstride + block*DET_BLOCK_SIZE+threadIdx.x];
+      // Write new row into A matrix
+      A[k*rowstride + block*DET_BLOCK_SIZE+threadIdx.x] = u_shared[threadIdx.x];
+      __syncthreads();
+      for (int i=0; i<DET_BLOCK_SIZE; i++) {
+        int row = block*DET_BLOCK_SIZE + i;
+        
+        float a = Ainv[row*rowstride+col];
+        // Only the thread that owns column k stages that column's entries.
+        if (col == k)
+          Ainv_colk_shared[i] = a;
+        Ainv_delta_shared[threadIdx.x] += a*(u_shared[i]-uold_shared[i]);
+      }
+      // Makes the staged column visible and keeps the next iteration from
+      // overwriting the tiles while other threads are still reading them.
+      __syncthreads();
+      Ainv_colk[block*DET_BLOCK_SIZE+threadIdx.x] = Ainv_colk_shared[threadIdx.x];
+    }
+  }
+  else {
+    for (int block=0; block<numblocks; block++) {
+      u_shared[threadIdx.x] = u[block*DET_BLOCK_SIZE+threadIdx.x];
+      uold_shared[threadIdx.x] = 
+        A[k*rowstride + block*DET_BLOCK_SIZE+threadIdx.x];
+      // Write new row into A matrix
+      A[k*rowstride + block*DET_BLOCK_SIZE+threadIdx.x] = u_shared[threadIdx.x];
+      __syncthreads();
+      for (int i=0; i<DET_BLOCK_SIZE; i++) {
+        int row = block*DET_BLOCK_SIZE + i;
+        Ainv_delta_shared[threadIdx.x] += 
+          Ainv[row*rowstride+col]*(u_shared[i]- uold_shared[i]);
+      }
+      // Fix: without this barrier a fast thread could start the next
+      // iteration and overwrite u_shared/uold_shared while slower threads
+      // are still reading the current tile.
+      __syncthreads();
+    }
+  }
+
+  __syncthreads();
+  
+  // Write the data back to global memory
+  Ainv_delta[col]    = Ainv_delta_shared[threadIdx.x];
+}
+
+// Kernel 2 of 2: applies the correction to Ainv using the vectors written by
+// update_inverse_cuda1.  For each column col owned by this thread,
+//   Ainv[row*rowstride+col] += Ainv_delta[col] * prefact * Ainv_colk[row]
+// with prefact = -1/(1 + Ainv_delta[k]), which is the Sherman-Morrison form
+// of a rank-1 inverse update.  One matrix is handled per blockIdx.y.
+// u_g is accepted but not read here.
+//
+// Only N / DET_BLOCK_SIZE full tiles are processed and no bounds checks are
+// made, so this presumably expects N to be a multiple of DET_BLOCK_SIZE and
+// blockDim.x == DET_BLOCK_SIZE -- TODO confirm against the launch site.
+__global__ static void
+update_inverse_cuda2 (float *Ainv_g[], float *u_g[], float *Ainv_delta_g[],
+                      float *Ainv_colk_g[], int N, int rowstride, int k)
+{
+  // Thread 0 looks up this matrix's pointers; the barrier publishes them.
+  __shared__ float *Ainv, *Ainv_delta, *Ainv_colk;
+  if (threadIdx.x==0) {
+    Ainv     = Ainv_g[blockIdx.y];
+    Ainv_delta    = Ainv_delta_g[blockIdx.y];
+    Ainv_colk = Ainv_colk_g[blockIdx.y];
+  }
+  __syncthreads();
+
+  __shared__ float Ainv_delta_shared[DET_BLOCK_SIZE];
+  __shared__ float  Ainv_colk_shared[DET_BLOCK_SIZE];
+  int col = blockIdx.x*DET_BLOCK_SIZE + threadIdx.x;
+  // Read the data back from global memory.  Ainv_colk_shared is filled per
+  // tile inside the loop below, so it is not preloaded here.
+  Ainv_delta_shared[threadIdx.x] = Ainv_delta[col];
+  __shared__ float prefact;
+  if (threadIdx.x == 0)
+    prefact = -1.0f/(1.0f+Ainv_delta[k]);
+  __syncthreads();
+                   
+  int numblocks = N / DET_BLOCK_SIZE;
+  for (int block=0; block<numblocks; block++) {
+    // Stage one tile of the scaled column k.
+    Ainv_colk_shared[threadIdx.x] = 
+      prefact*Ainv_colk[block*DET_BLOCK_SIZE+threadIdx.x];
+    __syncthreads();
+    for (int i=0; i<DET_BLOCK_SIZE; i++) {
+      int row = block*DET_BLOCK_SIZE + i;
+      Ainv[row*rowstride+col] += 
+        Ainv_delta_shared[threadIdx.x]*Ainv_colk_shared[i];
+    }
+    // Fix: keep a fast thread from overwriting Ainv_colk_shared for the next
+    // tile while slower threads are still reading the current one.
+    __syncthreads();
+  }
+}
--- a/src/einspline/fbspline.c
+++ b/src/einspline/fbspline.c
@ -0,0 +1,911 @@
+#include "bspline_create.h"
+#include "bspline.h"
+#include "fbspline.h"
+#include "config.h"
+
+/* CFUNC gives the wrappers below C linkage when this file is built as C++.
+   It expands to the same tokens as the definition in fbspline.h; a bare "C"
+   without `extern` is not a valid linkage specification. */
+#ifdef __cplusplus
+#define CFUNC extern "C" /* Avoid name mangling in C++ */
+#else
+#define CFUNC
+#endif
+
+
+///////////////////////
+// Creation routines //
+///////////////////////
+
+////////
+// 1D //
+////////
+// Fortran-callable constructor for a 1D single-precision real spline.
+// Packs the by-reference grid limits and boundary conditions into
+// Ugrid/BCtype_s and stores the result of create_UBspline_1d_s in *spline.
+CFUNC void
+F77_FUNC_(fcreate_ubspline_1d_s,FCREATE_UBSPLINE_1D_S)
+  (double *x0,   double    *x1, int   *num_x, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   float *data,  UBspline_1d_s **spline)
+{
+  Ugrid grid_x;
+  BCtype_s bc_x;
+
+  // Boundary conditions: integer codes are cast to bc_code.
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // Grid extent and point count.
+  grid_x.num   = *num_x;
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+
+  *spline = create_UBspline_1d_s (grid_x, bc_x, data);
+}
+
+// Fortran-callable constructor for a 1D double-precision real spline.
+// Packs the by-reference grid limits and boundary conditions into
+// Ugrid/BCtype_d and stores the result of create_UBspline_1d_d in *spline.
+CFUNC void
+F77_FUNC_(fcreate_ubspline_1d_d,FCREATE_UBSPLINE_1D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   double *data, UBspline_1d_d **spline)
+{
+  Ugrid grid_x;
+  BCtype_d bc_x;
+
+  // Boundary conditions: integer codes are cast to bc_code.
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // Grid extent and point count.
+  grid_x.num   = *num_x;
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+
+  *spline = create_UBspline_1d_d (grid_x, bc_x, data);
+}
+
+// Fortran-callable constructor for a 1D single-precision complex spline.
+// Complex boundary values are split into real/imaginary parts of BCtype_c
+// before calling create_UBspline_1d_c; the result is stored in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_1d_c,FCREATE_UBSPLINE_1D_C)
+  (double *x0, double *x1, int *num_x, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   complex_float *data, UBspline_1d_c **spline)
+{
+  Ugrid grid_x;
+  BCtype_c bc_x;
+
+  // Left boundary.
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf(*x0_val);
+  bc_x.lVal_i = cimagf(*x0_val);
+  // Right boundary.
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf(*x1_val);
+  bc_x.rVal_i = cimagf(*x1_val);
+
+  // Grid extent and point count.
+  grid_x.num   = *num_x;
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+
+  *spline = create_UBspline_1d_c (grid_x, bc_x, data);
+}
+
+// Fortran-callable constructor for a 1D double-precision complex spline.
+// Complex boundary values are split into real/imaginary parts of BCtype_z
+// before calling create_UBspline_1d_z; the result is stored in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_1d_z,FCREATE_UBSPLINE_1D_Z)
+  (double   *x0, double     *x1, int   *num_x, 
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   complex_double *data, UBspline_1d_z **spline)
+{
+  Ugrid grid_x;
+  BCtype_z bc_x;
+
+  // Left boundary.
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = creal(*x0_val);
+  bc_x.lVal_i = cimag(*x0_val);
+  // Right boundary.
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = creal(*x1_val);
+  bc_x.rVal_i = cimag(*x1_val);
+
+  // Grid extent and point count.
+  grid_x.num   = *num_x;
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+
+  *spline = create_UBspline_1d_z (grid_x, bc_x, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_1d_s.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_s,FRECOMPUTE_UBSPLINE_1D_S)
+  (UBspline_1d_s **spline, float *data)
+{
+  UBspline_1d_s *handle = *spline;
+  recompute_UBspline_1d_s (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_1d_d.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_d,FRECOMPUTE_UBSPLINE_1D_D)
+  (UBspline_1d_d **spline, double *data) 
+{
+  UBspline_1d_d *handle = *spline;
+  recompute_UBspline_1d_d (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_1d_c.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_c,FRECOMPUTE_UBSPLINE_1D_C)
+  (UBspline_1d_c **spline, complex_float *data)
+{
+  UBspline_1d_c *handle = *spline;
+  recompute_UBspline_1d_c (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_1d_z.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_z,FRECOMPUTE_UBSPLINE_1D_Z)
+  (UBspline_1d_z **spline, complex_double *data) 
+{
+  UBspline_1d_z *handle = *spline;
+  recompute_UBspline_1d_z (handle, data);
+}
+
+////////
+// 2D //
+////////
+// Fortran-callable constructor for a 2D single-precision real spline.
+// Fills one Ugrid and one BCtype_s per axis from the by-reference arguments
+// and stores the result of create_UBspline_2d_s in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_s,FCREATE_UBSPLINE_2D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   float *data, UBspline_2d_s **spline)
+{
+  Ugrid  grid_x, grid_y;
+  BCtype_s bc_x, bc_y;
+
+  // x axis: grid, then boundary conditions.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis: grid, then boundary conditions.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  *spline = create_UBspline_2d_s (grid_x, grid_y, bc_x, bc_y, data);
+}
+
+
+// Fortran-callable constructor for a 2D double-precision real spline.
+// Fills one Ugrid and one BCtype_d per axis from the by-reference arguments
+// and stores the result of create_UBspline_2d_d in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_d,FCREATE_UBSPLINE_2D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   double   *y0, double     *y1, int   *num_y, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   double *data, UBspline_2d_d **spline)
+{
+  Ugrid  grid_x, grid_y;
+  BCtype_d bc_x, bc_y;
+
+  // x axis: grid, then boundary conditions.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis: grid, then boundary conditions.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  *spline = create_UBspline_2d_d (grid_x, grid_y, bc_x, bc_y, data);
+}
+
+// Fortran-callable constructor for a 2D single-precision complex spline.
+// Per axis, fills a Ugrid and a BCtype_c (complex boundary values split
+// into real/imaginary parts), then stores the result of
+// create_UBspline_2d_c in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_c,FCREATE_UBSPLINE_2D_C)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   complex_float *data, UBspline_2d_c **spline)
+{
+  Ugrid  grid_x, grid_y;
+  BCtype_c bc_x, bc_y;
+
+  // x axis.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf(*x0_val);
+  bc_x.lVal_i = cimagf(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf(*x1_val);
+  bc_x.rVal_i = cimagf(*x1_val);
+
+  // y axis.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = crealf(*y0_val);
+  bc_y.lVal_i = cimagf(*y0_val);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = crealf(*y1_val);
+  bc_y.rVal_i = cimagf(*y1_val);
+
+  *spline = create_UBspline_2d_c (grid_x, grid_y, bc_x, bc_y, data);
+}
+
+// Fortran-callable constructor for a 2D double-precision complex spline.
+// Per axis, fills a Ugrid and a BCtype_z (complex boundary values split
+// into real/imaginary parts), then stores the result of
+// create_UBspline_2d_z in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_z,FCREATE_UBSPLINE_2D_Z)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   complex_double *data, UBspline_2d_z **spline)
+{
+  Ugrid  xgrid, ygrid;
+  BCtype_z xBC, yBC;
+  xgrid.start = *x0;
+  xgrid.end   = *x1;
+  xgrid.num   = *num_x;
+  ygrid.start = *y0;
+  ygrid.end   = *y1;
+  ygrid.num   = *num_y;
+ 
+  // Use the double-precision accessors for the x boundary values as well:
+  // crealf/cimagf would round the complex_double inputs to float precision.
+  xBC.lCode = (bc_code) *x0_code;
+  xBC.rCode = (bc_code) *x1_code;
+  xBC.lVal_r  = creal(*x0_val);
+  xBC.lVal_i  = cimag(*x0_val);
+  xBC.rVal_r  = creal(*x1_val);
+  xBC.rVal_i  = cimag(*x1_val);
+  yBC.lCode = (bc_code) *y0_code;
+  yBC.rCode = (bc_code) *y1_code;
+  yBC.lVal_r  = creal(*y0_val);
+  yBC.lVal_i  = cimag(*y0_val);
+  yBC.rVal_r  = creal(*y1_val);
+  yBC.rVal_i  = cimag(*y1_val);
+
+  *spline = create_UBspline_2d_z (xgrid, ygrid, xBC, yBC, data);
+}
+
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_2d_s.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_s,FRECOMPUTE_UBSPLINE_2D_S)
+  (UBspline_2d_s **spline, float *data)
+{
+  UBspline_2d_s *handle = *spline;
+  recompute_UBspline_2d_s (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_2d_d.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_d,FRECOMPUTE_UBSPLINE_2D_D)
+  (UBspline_2d_d **spline, double *data) 
+{
+  UBspline_2d_d *handle = *spline;
+  recompute_UBspline_2d_d (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_2d_c.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_c,FRECOMPUTE_UBSPLINE_2D_C)
+  (UBspline_2d_c **spline, complex_float *data)
+{
+  UBspline_2d_c *handle = *spline;
+  recompute_UBspline_2d_c (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_2d_z.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_z,FRECOMPUTE_UBSPLINE_2D_Z)
+  (UBspline_2d_z **spline, complex_double *data) 
+{
+  UBspline_2d_z *handle = *spline;
+  recompute_UBspline_2d_z (handle, data);
+}
+
+
+////////
+// 3D //
+////////
+// Fortran-callable constructor for a 3D single-precision real spline.
+// Fills one Ugrid and one BCtype_s per axis from the by-reference arguments
+// and stores the result of create_UBspline_3d_s in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_s,FCREATE_UBSPLINE_3D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   double   *z0, double    *z1, int   *num_z, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int *z0_code, float *z0_val, int *z1_code, float *z1_val,
+   float *data, UBspline_3d_s **spline)
+{
+  Ugrid  grid_x, grid_y, grid_z;
+  BCtype_s bc_x, bc_y, bc_z;
+
+  // x axis: grid, then boundary conditions.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  // z axis.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode = (bc_code) *z0_code;
+  bc_z.lVal  = *z0_val;
+  bc_z.rCode = (bc_code) *z1_code;
+  bc_z.rVal  = *z1_val;
+
+  *spline = create_UBspline_3d_s (grid_x, grid_y, grid_z, bc_x, bc_y, bc_z, data);
+}
+
+// Fortran-callable constructor for a 3D double-precision real spline.
+// Fills one Ugrid and one BCtype_d per axis from the by-reference arguments
+// and stores the result of create_UBspline_3d_d in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_d,FCREATE_UBSPLINE_3D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   double   *y0, double     *y1, int   *num_y, 
+   double   *z0, double     *z1, int   *num_z, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int *z0_code, double *z0_val, int *z1_code, double *z1_val,
+   double *data, UBspline_3d_d **spline)
+{
+  Ugrid  grid_x, grid_y, grid_z;
+  BCtype_d bc_x, bc_y, bc_z;
+
+  // x axis: grid, then boundary conditions.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  // z axis.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode = (bc_code) *z0_code;
+  bc_z.lVal  = *z0_val;
+  bc_z.rCode = (bc_code) *z1_code;
+  bc_z.rVal  = *z1_val;
+
+  *spline = create_UBspline_3d_d (grid_x, grid_y, grid_z, bc_x, bc_y, bc_z, data);
+}
+
+// Fortran-callable constructor for a 3D single-precision complex spline.
+// Per axis, fills a Ugrid and a BCtype_c (complex boundary values split
+// into real/imaginary parts), then stores the result of
+// create_UBspline_3d_c in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_c,FCREATE_UBSPLINE_3D_C)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   double *z0, double *z1, int *num_z, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
+   complex_float *data, UBspline_3d_c **spline)
+{
+  Ugrid  grid_x, grid_y, grid_z;
+  BCtype_c bc_x, bc_y, bc_z;
+
+  // x axis.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf(*x0_val);
+  bc_x.lVal_i = cimagf(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf(*x1_val);
+  bc_x.rVal_i = cimagf(*x1_val);
+
+  // y axis.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = crealf(*y0_val);
+  bc_y.lVal_i = cimagf(*y0_val);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = crealf(*y1_val);
+  bc_y.rVal_i = cimagf(*y1_val);
+
+  // z axis.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode  = (bc_code) *z0_code;
+  bc_z.lVal_r = crealf(*z0_val);
+  bc_z.lVal_i = cimagf(*z0_val);
+  bc_z.rCode  = (bc_code) *z1_code;
+  bc_z.rVal_r = crealf(*z1_val);
+  bc_z.rVal_i = cimagf(*z1_val);
+
+  *spline = create_UBspline_3d_c (grid_x, grid_y, grid_z, bc_x, bc_y, bc_z, data);
+}
+
+// Fortran-callable constructor for a 3D double-precision complex spline.
+// Per axis, fills a Ugrid and a BCtype_z (complex boundary values split
+// into real/imaginary parts), then stores the result of
+// create_UBspline_3d_z in *spline.
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_z,FCREATE_UBSPLINE_3D_Z)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   double *z0, double *z1, int *num_z, 
+   int *x0_code,  complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code,  complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   int *z0_code,  complex_double *z0_val, int *z1_code, complex_double *z1_val,
+   complex_double *data, UBspline_3d_z **spline)
+{
+  Ugrid  grid_x, grid_y, grid_z;
+  BCtype_z bc_x, bc_y, bc_z;
+
+  // x axis.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = creal(*x0_val);
+  bc_x.lVal_i = cimag(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = creal(*x1_val);
+  bc_x.rVal_i = cimag(*x1_val);
+
+  // y axis.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = creal(*y0_val);
+  bc_y.lVal_i = cimag(*y0_val);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = creal(*y1_val);
+  bc_y.rVal_i = cimag(*y1_val);
+
+  // z axis.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode  = (bc_code) *z0_code;
+  bc_z.lVal_r = creal(*z0_val);
+  bc_z.lVal_i = cimag(*z0_val);
+  bc_z.rCode  = (bc_code) *z1_code;
+  bc_z.rVal_r = creal(*z1_val);
+  bc_z.rVal_i = cimag(*z1_val);
+
+  *spline = create_UBspline_3d_z (grid_x, grid_y, grid_z, bc_x, bc_y, bc_z, data);
+}
+
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_3d_s.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_s,FRECOMPUTE_UBSPLINE_3D_S)
+  (UBspline_3d_s **spline, float *data)
+{
+  UBspline_3d_s *handle = *spline;
+  recompute_UBspline_3d_s (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_3d_d.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_d,FRECOMPUTE_UBSPLINE_3D_D)
+  (UBspline_3d_d **spline, double *data) 
+{
+  UBspline_3d_d *handle = *spline;
+  recompute_UBspline_3d_d (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_3d_c.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_c,FRECOMPUTE_UBSPLINE_3D_C)
+  (UBspline_3d_c **spline, complex_float *data)
+{
+  UBspline_3d_c *handle = *spline;
+  recompute_UBspline_3d_c (handle, data);
+}
+
+// Fortran-callable wrapper: unwraps the spline handle and forwards it with
+// data to recompute_UBspline_3d_z.
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_z,FRECOMPUTE_UBSPLINE_3D_Z)
+  (UBspline_3d_z **spline, complex_double *data) 
+{
+  UBspline_3d_z *handle = *spline;
+  recompute_UBspline_3d_z (handle, data);
+}
+
+
+
+// Fortran-callable wrapper: passes the spline handle stored in *spline to
+// destroy_Bspline.  *spline itself is left unchanged.
+CFUNC void
+F77_FUNC_(fdestroy_bspline,FDESTROY_BSPLINE)
+  (Bspline **spline)
+{
+  Bspline *handle = *spline;
+  destroy_Bspline (handle);
+}
+/////////////////////////
+// Evaluation routines //
+/////////////////////////
+
+//////////////////////////////
+// 1D single-precision real //
+//////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_1d_s (value only); unwraps the
+// by-reference handle and coordinate.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s,FEVAL_UBSPLINE_1D_S)
+  (UBspline_1d_s **spline, double *x, float *val)
+{
+  UBspline_1d_s *handle = *spline;
+  eval_UBspline_1d_s (handle, *x, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_s_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s_vg,FEVAL_UBSPLINE_1D_S_VG)
+  (UBspline_1d_s **spline, double *x, float *val, float *grad)
+{
+  UBspline_1d_s *handle = *spline;
+  eval_UBspline_1d_s_vg (handle, *x, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_s_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s_vgl,FEVAL_UBSPLINE_1D_S_VGL)
+  (UBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *lapl)
+{
+  UBspline_1d_s *handle = *spline;
+  eval_UBspline_1d_s_vgl (handle, *x, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_s_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s_vgh,FEVAL_UBSPLINE_1D_S_VGH)
+  (UBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *hess)
+{
+  UBspline_1d_s *handle = *spline;
+  eval_UBspline_1d_s_vgh (handle, *x, val, grad, hess);
+}
+
+//////////////////////////////
+// 1D double-precision real //
+//////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_1d_d (value only); unwraps the
+// by-reference handle and coordinate.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d,FEVAL_UBSPLINE_1D_D)
+  (UBspline_1d_d **spline, double *x, double *val)
+{
+  UBspline_1d_d *handle = *spline;
+  eval_UBspline_1d_d (handle, *x, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_d_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d_vg,FEVAL_UBSPLINE_1D_D_VG)
+  (UBspline_1d_d **spline, double *x, 
+   double *val, double *grad)
+{
+  UBspline_1d_d *handle = *spline;
+  eval_UBspline_1d_d_vg (handle, *x, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_d_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d_vgl,FEVAL_UBSPLINE_1D_D_VGL)
+  (UBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *lapl)
+{
+  UBspline_1d_d *handle = *spline;
+  eval_UBspline_1d_d_vgl (handle, *x, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_d_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d_vgh,FEVAL_UBSPLINE_1D_D_VGH)
+  (UBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *hess)
+{
+  UBspline_1d_d *handle = *spline;
+  eval_UBspline_1d_d_vgh (handle, *x, val, grad, hess);
+}
+
+/////////////////////////////////
+// 1D single-precision complex //
+/////////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_1d_c (value only); unwraps the
+// by-reference handle and coordinate.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c,FEVAL_UBSPLINE_1D_C)
+  (UBspline_1d_c **spline, double *x, complex_float *val)
+{
+  UBspline_1d_c *handle = *spline;
+  eval_UBspline_1d_c (handle, *x, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_c_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c_vg,FEVAL_UBSPLINE_1D_C_VG)
+  (UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad)
+{
+  UBspline_1d_c *handle = *spline;
+  eval_UBspline_1d_c_vg (handle, *x, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_c_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c_vgl,FEVAL_UBSPLINE_1D_C_VGL)
+  (UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  UBspline_1d_c *handle = *spline;
+  eval_UBspline_1d_c_vgl (handle, *x, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_c_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c_vgh,FEVAL_UBSPLINE_1D_C_VGH)
+  (UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  UBspline_1d_c *handle = *spline;
+  eval_UBspline_1d_c_vgh (handle, *x, val, grad, hess);
+}
+
+/////////////////////////////////
+// 1D double-precision complex //
+/////////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_1d_z (value only); unwraps the
+// by-reference handle and coordinate.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z,FEVAL_UBSPLINE_1D_Z)
+  (UBspline_1d_z **spline, double *x, complex_double *val)
+{
+  UBspline_1d_z *handle = *spline;
+  eval_UBspline_1d_z (handle, *x, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_z_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z_vg,FEVAL_UBSPLINE_1D_Z_VG)
+  (UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad)
+{
+  UBspline_1d_z *handle = *spline;
+  eval_UBspline_1d_z_vg (handle, *x, val, grad);
+}
+ 
+// Fortran-callable wrapper for eval_UBspline_1d_z_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z_vgl,FEVAL_UBSPLINE_1D_Z_VGL)
+  (UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  UBspline_1d_z *handle = *spline;
+  eval_UBspline_1d_z_vgl (handle, *x, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_1d_z_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z_vgh,FEVAL_UBSPLINE_1D_Z_VGH)
+  (UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  UBspline_1d_z *handle = *spline;
+  eval_UBspline_1d_z_vgh (handle, *x, val, grad, hess);
+}
+
+//////////////////////////////
+// 2D single-precision real //
+//////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_2d_s (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s,FEVAL_UBSPLINE_2D_S)
+  (UBspline_2d_s **spline, double *x, double *y, float *val)
+{
+  UBspline_2d_s *handle = *spline;
+  eval_UBspline_2d_s (handle, *x, *y, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_s_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s_vg,FEVAL_UBSPLINE_2D_S_VG)
+  (UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad)
+{
+  UBspline_2d_s *handle = *spline;
+  eval_UBspline_2d_s_vg (handle, *x, *y, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_s_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s_vgl,FEVAL_UBSPLINE_2D_S_VGL)
+  (UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float* lapl)
+{
+  UBspline_2d_s *handle = *spline;
+  eval_UBspline_2d_s_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_s_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s_vgh,FEVAL_UBSPLINE_2D_S_VGH)
+  (UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float *hess)
+{
+  UBspline_2d_s *handle = *spline;
+  eval_UBspline_2d_s_vgh (handle, *x, *y, val, grad, hess);
+}
+
+//////////////////////////////
+// 2D double-precision real //
+//////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_2d_d (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d,FEVAL_UBSPLINE_2D_D)
+  (UBspline_2d_d **spline, double *x, double *y, double *val)
+{
+  UBspline_2d_d *handle = *spline;
+  eval_UBspline_2d_d (handle, *x, *y, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_d_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d_vg,FEVAL_UBSPLINE_2D_D_VG)
+  (UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad)
+{
+  UBspline_2d_d *handle = *spline;
+  eval_UBspline_2d_d_vg (handle, *x, *y, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_d_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d_vgl,FEVAL_UBSPLINE_2D_D_VGL)
+  (UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *lapl)
+{
+  UBspline_2d_d *handle = *spline;
+  eval_UBspline_2d_d_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_d_vgh (value, gradient and
+// Hessian); unwraps the by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d_vgh,FEVAL_UBSPLINE_2D_D_VGH)
+  (UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *hess)
+{
+  // Must dispatch to the _vgh evaluator: the _vgl variant would treat hess
+  // as a Laplacian output.
+  eval_UBspline_2d_d_vgh (*spline, *x, *y, val, grad, hess);
+}
+
+/////////////////////////////////
+// 2D single-precision complex //
+/////////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_2d_c (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c,FEVAL_UBSPLINE_2D_C)
+  (UBspline_2d_c **spline, double *x, double *y, complex_float *val)
+{
+  UBspline_2d_c *handle = *spline;
+  eval_UBspline_2d_c (handle, *x, *y, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_c_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c_vg,FEVAL_UBSPLINE_2D_C_VG)
+  (UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad)
+{
+  UBspline_2d_c *handle = *spline;
+  eval_UBspline_2d_c_vg (handle, *x, *y, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_c_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c_vgl,FEVAL_UBSPLINE_2D_C_VGL)
+  (UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  UBspline_2d_c *handle = *spline;
+  eval_UBspline_2d_c_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_c_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c_vgh,FEVAL_UBSPLINE_2D_C_VGH)
+  (UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  UBspline_2d_c *handle = *spline;
+  eval_UBspline_2d_c_vgh (handle, *x, *y, val, grad, hess);
+}
+
+/////////////////////////////////
+// 2D double-precision complex //
+/////////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_2d_z (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z,FEVAL_UBSPLINE_2D_Z)
+  (UBspline_2d_z **spline, double *x, double *y, complex_double *val)
+{
+  UBspline_2d_z *handle = *spline;
+  eval_UBspline_2d_z (handle, *x, *y, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_z_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z_vg,FEVAL_UBSPLINE_2D_Z_VG)
+  (UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad)
+{
+  UBspline_2d_z *handle = *spline;
+  eval_UBspline_2d_z_vg (handle, *x, *y, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_z_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z_vgl,FEVAL_UBSPLINE_2D_Z_VGL)
+  (UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  UBspline_2d_z *handle = *spline;
+  eval_UBspline_2d_z_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_2d_z_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z_vgh,FEVAL_UBSPLINE_2D_Z_VGH)
+  (UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  UBspline_2d_z *handle = *spline;
+  eval_UBspline_2d_z_vgh (handle, *x, *y, val, grad, hess);
+}
+
+
+
+//////////////////////////////
+// 3D single-precision real //
+//////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_3d_s (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s,FEVAL_UBSPLINE_3D_S)
+  (UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val)
+{
+  UBspline_3d_s *handle = *spline;
+  eval_UBspline_3d_s (handle, *x, *y, *z, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_s_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s_vg,FEVAL_UBSPLINE_3D_S_VG)
+  (UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad)
+{
+  UBspline_3d_s *handle = *spline;
+  eval_UBspline_3d_s_vg (handle, *x, *y, *z, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_s_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s_vgl,FEVAL_UBSPLINE_3D_S_VGL)
+  (UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad, float* lapl)
+{
+  UBspline_3d_s *handle = *spline;
+  eval_UBspline_3d_s_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_s_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s_vgh,FEVAL_UBSPLINE_3D_S_VGH)
+  (UBspline_3d_s **spline, double *x, double *y, double *z, 
+   float *val, float *grad, float *hess)
+{
+  UBspline_3d_s *handle = *spline;
+  eval_UBspline_3d_s_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+//////////////////////////////
+// 3D double-precision real //
+//////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_3d_d (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d,FEVAL_UBSPLINE_3D_D)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val)
+{
+  UBspline_3d_d *handle = *spline;
+  eval_UBspline_3d_d (handle, *x, *y, *z, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_d_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d_vg,FEVAL_UBSPLINE_3D_D_VG)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad)
+{
+  UBspline_3d_d *handle = *spline;
+  eval_UBspline_3d_d_vg (handle, *x, *y, *z, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_d_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d_vgl,FEVAL_UBSPLINE_3D_D_VGL)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,  
+   double *val, double *grad, double *lapl)
+{
+  UBspline_3d_d *handle = *spline;
+  eval_UBspline_3d_d_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_d_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d_vgh,FEVAL_UBSPLINE_3D_D_VGH)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad, double *hess)
+{
+  UBspline_3d_d *handle = *spline;
+  eval_UBspline_3d_d_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+/////////////////////////////////
+// 3D single-precision complex //
+/////////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_3d_c (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c,FEVAL_UBSPLINE_3D_C)
+  (UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val)
+{
+  UBspline_3d_c *handle = *spline;
+  eval_UBspline_3d_c (handle, *x, *y, *z, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_c_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c_vg,FEVAL_UBSPLINE_3D_C_VG)
+  (UBspline_3d_c **spline, double *x, double *y, double *z, 
+   complex_float *val, complex_float *grad)
+{
+  UBspline_3d_c *handle = *spline;
+  eval_UBspline_3d_c_vg (handle, *x, *y, *z, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_c_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c_vgl,FEVAL_UBSPLINE_3D_C_VGL)
+  (UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  UBspline_3d_c *handle = *spline;
+  eval_UBspline_3d_c_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_c_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c_vgh,FEVAL_UBSPLINE_3D_C_VGH)
+  (UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  UBspline_3d_c *handle = *spline;
+  eval_UBspline_3d_c_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+/////////////////////////////////
+// 3D double-precision complex //
+/////////////////////////////////
+// Fortran-callable wrapper for eval_UBspline_3d_z (value only); unwraps the
+// by-reference handle and coordinates.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z,FEVAL_UBSPLINE_3D_Z)
+  (UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val)
+{
+  UBspline_3d_z *handle = *spline;
+  eval_UBspline_3d_z (handle, *x, *y, *z, val);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_z_vg (value and gradient).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z_vg,FEVAL_UBSPLINE_3D_Z_VG)
+  (UBspline_3d_z **spline, double *x, double *y, double *z, 
+   complex_double *val, complex_double *grad)
+{
+  UBspline_3d_z *handle = *spline;
+  eval_UBspline_3d_z_vg (handle, *x, *y, *z, val, grad);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_z_vgl (value, gradient and
+// Laplacian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z_vgl,FEVAL_UBSPLINE_3D_Z_VGL)
+  (UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  UBspline_3d_z *handle = *spline;
+  eval_UBspline_3d_z_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+// Fortran-callable wrapper for eval_UBspline_3d_z_vgh (value, gradient and
+// Hessian).
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z_vgh,FEVAL_UBSPLINE_3D_Z_VGH)
+  (UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  UBspline_3d_z *handle = *spline;
+  eval_UBspline_3d_z_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+
--- a/src/einspline/fbspline.h
+++ b/src/einspline/fbspline.h
@ -0,0 +1,440 @@
+#ifndef F_BSPLINE_H
+#define F_BSPLINE_H
+
+#include "config.h"
+#include "bspline_base.h"
+#include "bspline_create.h"
+
+// CFUNC prefixes every prototype in this header. Under a C++ compiler it
+// expands to extern "C" so the declared names get C linkage; under a C
+// compiler it expands to nothing. It is #undef'd again at the end of the
+// header.
+#ifdef __cplusplus
+#define CFUNC extern "C" /* Avoid name mangling in C++ */
+#else
+#define CFUNC
+#endif
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                       Creation routines                      ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+////////
+// 1D //
+////////
+// 1D creation and recompute entry points, one per data type:
+//   _s = float, _d = double, _c = complex_float, _z = complex_double.
+// Every argument is passed by pointer. fcreate_* take the grid range
+// (x0, x1, num_x), a boundary code and boundary value for each end, and the
+// data array; the last argument is a pointer to the spline handle
+// (presumably where the new spline is returned - implementation is in
+// fbspline.c, confirm there).
+// frecompute_* take an existing spline handle and a data array
+// (presumably refitting the spline to the new data - confirm in fbspline.c).
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_1d_s,FCREATE_UBSPLINE_1D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   float *data, UBspline_1d_s **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_1d_d,FCREATE_UBSPLINE_1D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   double *data, UBspline_1d_d **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_1d_c,FCREATE_UBSPLINE_1D_C)
+  (double   *x0, double    *x1, int   *num_x, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   complex_float *data, UBspline_1d_c **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_1d_z,FCREATE_UBSPLINE_1D_Z)
+  (double   *x0, double     *x1, int   *num_x, 
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   complex_double *data, UBspline_1d_z **spline);
+
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_s,FRECOMPUTE_UBSPLINE_1D_S)
+  (UBspline_1d_s **spline, float *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_d,FRECOMPUTE_UBSPLINE_1D_D)
+  (UBspline_1d_d **spline, double *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_c,FRECOMPUTE_UBSPLINE_1D_C)
+  (UBspline_1d_c **spline, complex_float *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_1d_z,FRECOMPUTE_UBSPLINE_1D_Z)
+  (UBspline_1d_z **spline, complex_double *data);
+
+////////
+// 2D //
+////////
+// 2D creation and recompute entry points, one per data type:
+//   _s = float, _d = double, _c = complex_float, _z = complex_double.
+// Every argument is passed by pointer. fcreate_* take the grid range and
+// point count for x and y, a boundary code and boundary value for each end
+// of each axis, and the data array; the last argument is a pointer to the
+// spline handle (presumably where the new spline is returned -
+// implementation is in fbspline.c, confirm there).
+// frecompute_* take an existing spline handle and a data array
+// (presumably refitting the spline to the new data - confirm in fbspline.c).
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_s,FCREATE_UBSPLINE_2D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   float *data, UBspline_2d_s **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_d,FCREATE_UBSPLINE_2D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   double   *y0, double     *y1, int   *num_y, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   double *data, UBspline_2d_d **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_c,FCREATE_UBSPLINE_2D_C)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   complex_float *data, UBspline_2d_c **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_2d_z,FCREATE_UBSPLINE_2D_Z)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   complex_double *data, UBspline_2d_z **spline);
+
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_s,FRECOMPUTE_UBSPLINE_2D_S)
+  (UBspline_2d_s **spline, float *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_d,FRECOMPUTE_UBSPLINE_2D_D)
+  (UBspline_2d_d **spline, double *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_c,FRECOMPUTE_UBSPLINE_2D_C)
+  (UBspline_2d_c **spline, complex_float *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_2d_z,FRECOMPUTE_UBSPLINE_2D_Z)
+  (UBspline_2d_z **spline, complex_double *data);
+
+////////
+// 3D //
+////////
+// 3D creation and recompute entry points, one per data type:
+//   _s = float, _d = double, _c = complex_float, _z = complex_double.
+// Every argument is passed by pointer. fcreate_* take the grid range and
+// point count for x, y and z, a boundary code and boundary value for each
+// end of each axis, and the data array; the last argument is a pointer to
+// the spline handle (presumably where the new spline is returned -
+// implementation is in fbspline.c, confirm there).
+// frecompute_* take an existing spline handle and a data array
+// (presumably refitting the spline to the new data - confirm in fbspline.c).
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_s,FCREATE_UBSPLINE_3D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   double   *z0, double    *z1, int   *num_z, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int *z0_code, float *z0_val, int *z1_code, float *z1_val,
+   float *data, UBspline_3d_s **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_d,FCREATE_UBSPLINE_3D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   double   *y0, double     *y1, int   *num_y, 
+   double   *z0, double     *z1, int   *num_z, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int *z0_code, double *z0_val, int *z1_code, double *z1_val,
+   double *data, UBspline_3d_d **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_c,FCREATE_UBSPLINE_3D_C)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   double *z0, double *z1, int *num_z, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
+   complex_float *data, UBspline_3d_c **spline);
+CFUNC void 
+F77_FUNC_(fcreate_ubspline_3d_z,FCREATE_UBSPLINE_3D_Z)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   double *z0, double *z1, int *num_z, 
+   int *x0_code,  complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code,  complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   int *z0_code,  complex_double *z0_val, int *z1_code, complex_double *z1_val,
+   complex_double *data, UBspline_3d_z **spline);
+
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_s,FRECOMPUTE_UBSPLINE_3D_S)
+  (UBspline_3d_s **spline, float *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_d,FRECOMPUTE_UBSPLINE_3D_D)
+  (UBspline_3d_d **spline, double *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_c,FRECOMPUTE_UBSPLINE_3D_C)
+  (UBspline_3d_c **spline, complex_float *data);
+CFUNC void 
+F77_FUNC_(frecompute_ubspline_3d_z,FRECOMPUTE_UBSPLINE_3D_Z)
+  (UBspline_3d_z **spline, complex_double *data);
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                      Destruction routine                     ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+// Single destruction entry point for all spline types: takes a pointer to a
+// generic Bspline handle. (Presumably releases the spline's storage -
+// implementation is in fbspline.c, confirm there.)
+CFUNC void
+F77_FUNC_(fdestroy_bspline,FDESTROY_BSPLINE)
+  (Bspline **spline);
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                      Evaluation routines                     ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+//////////////////////////////
+// 1D single-precision real //
+//////////////////////////////
+// Evaluation entry points for UBspline_1d_s (float outputs). All arguments
+// are pointers: the spline handle, the coordinate x, then the outputs.
+// Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_1d_s* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s,FEVAL_UBSPLINE_1D_S)
+  (UBspline_1d_s **spline, double *x, float *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s_vg,FEVAL_UBSPLINE_1D_S_VG)
+  (UBspline_1d_s **spline, double *x, float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s_vgl,FEVAL_UBSPLINE_1D_S_VGL)
+  (UBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_s_vgh,FEVAL_UBSPLINE_1D_S_VGH)
+  (UBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 1D double-precision real //
+//////////////////////////////
+// Evaluation entry points for UBspline_1d_d (double outputs). All arguments
+// are pointers: the spline handle, the coordinate x, then the outputs.
+// Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_1d_d* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d,FEVAL_UBSPLINE_1D_D)
+  (UBspline_1d_d **spline, double *x, double *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d_vg,FEVAL_UBSPLINE_1D_D_VG)
+  (UBspline_1d_d **spline, double *x, 
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d_vgl,FEVAL_UBSPLINE_1D_D_VGL)
+  (UBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_d_vgh,FEVAL_UBSPLINE_1D_D_VGH)
+  (UBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 1D single-precision complex //
+/////////////////////////////////
+// Evaluation entry points for UBspline_1d_c (complex_float outputs). All
+// arguments are pointers: the spline handle, the coordinate x, then the
+// outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_1d_c* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c,FEVAL_UBSPLINE_1D_C)
+  (UBspline_1d_c **spline, double *x, complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c_vg,FEVAL_UBSPLINE_1D_C_VG)
+  (UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c_vgl,FEVAL_UBSPLINE_1D_C_VGL)
+  (UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_c_vgh,FEVAL_UBSPLINE_1D_C_VGH)
+  (UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 1D double-precision complex //
+/////////////////////////////////
+// Evaluation entry points for UBspline_1d_z (complex_double outputs). All
+// arguments are pointers: the spline handle, the coordinate x, then the
+// outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_1d_z* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z,FEVAL_UBSPLINE_1D_Z)
+  (UBspline_1d_z **spline, double *x, complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z_vg,FEVAL_UBSPLINE_1D_Z_VG)
+  (UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad);
+ 
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z_vgl,FEVAL_UBSPLINE_1D_Z_VGL)
+  (UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_1d_z_vgh,FEVAL_UBSPLINE_1D_Z_VGH)
+  (UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+//////////////////////////////
+// 2D single-precision real //
+//////////////////////////////
+// Evaluation entry points for UBspline_2d_s (float outputs). All arguments
+// are pointers: the spline handle, the coordinates x and y, then the
+// outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_2d_s* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s,FEVAL_UBSPLINE_2D_S)
+  (UBspline_2d_s **spline, double *x, double *y, float *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s_vg,FEVAL_UBSPLINE_2D_S_VG)
+  (UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s_vgl,FEVAL_UBSPLINE_2D_S_VGL)
+  (UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float* lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_s_vgh,FEVAL_UBSPLINE_2D_S_VGH)
+  (UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 2D double-precision real //
+//////////////////////////////
+// Evaluation entry points for UBspline_2d_d (double outputs). All arguments
+// are pointers: the spline handle, the coordinates x and y, then the
+// outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_2d_d* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d,FEVAL_UBSPLINE_2D_D)
+  (UBspline_2d_d **spline, double *x, double *y, double *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d_vg,FEVAL_UBSPLINE_2D_D_VG)
+  (UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d_vgl,FEVAL_UBSPLINE_2D_D_VGL)
+  (UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_d_vgh,FEVAL_UBSPLINE_2D_D_VGH)
+  (UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 2D single-precision complex //
+/////////////////////////////////
+// Evaluation entry points for UBspline_2d_c (complex_float outputs). All
+// arguments are pointers: the spline handle, the coordinates x and y, then
+// the outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_2d_c* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c,FEVAL_UBSPLINE_2D_C)
+  (UBspline_2d_c **spline, double *x, double *y, complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c_vg,FEVAL_UBSPLINE_2D_C_VG)
+  (UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c_vgl,FEVAL_UBSPLINE_2D_C_VGL)
+  (UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_c_vgh,FEVAL_UBSPLINE_2D_C_VGH)
+  (UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 2D double-precision complex //
+/////////////////////////////////
+// Evaluation entry points for UBspline_2d_z (complex_double outputs). All
+// arguments are pointers: the spline handle, the coordinates x and y, then
+// the outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_2d_z* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z,FEVAL_UBSPLINE_2D_Z)
+  (UBspline_2d_z **spline, double *x, double *y, complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z_vg,FEVAL_UBSPLINE_2D_Z_VG)
+  (UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z_vgl,FEVAL_UBSPLINE_2D_Z_VGL)
+  (UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_2d_z_vgh,FEVAL_UBSPLINE_2D_Z_VGH)
+  (UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+
+//////////////////////////////
+// 3D single-precision real //
+//////////////////////////////
+// Evaluation entry points for UBspline_3d_s (float outputs). All arguments
+// are pointers: the spline handle, the coordinates x, y and z, then the
+// outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_3d_s* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s,FEVAL_UBSPLINE_3D_S)
+  (UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s_vg,FEVAL_UBSPLINE_3D_S_VG)
+  (UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s_vgl,FEVAL_UBSPLINE_3D_S_VGL)
+  (UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad, float* lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_s_vgh,FEVAL_UBSPLINE_3D_S_VGH)
+  (UBspline_3d_s **spline, double *x, double *y, double *z, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 3D double-precision real //
+//////////////////////////////
+// Evaluation entry points for UBspline_3d_d (double outputs). All arguments
+// are pointers: the spline handle, the coordinates x, y and z, then the
+// outputs. Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+// NOTE(review): required output array sizes are not visible here - confirm
+// against the underlying eval_UBspline_3d_d* routines.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d,FEVAL_UBSPLINE_3D_D)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d_vg,FEVAL_UBSPLINE_3D_D_VG)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d_vgl,FEVAL_UBSPLINE_3D_D_VGL)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,  
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_d_vgh,FEVAL_UBSPLINE_3D_D_VGH)
+  (UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 3D single-precision complex //
+/////////////////////////////////
+// Evaluation entry points for UBspline_3d_c (complex_float outputs). All
+// arguments are pointers: the spline handle, the coordinates x, y and z,
+// then the outputs. The definitions in fbspline.c dereference spline, x, y
+// and z and forward to the matching eval_UBspline_3d_c* routine, passing
+// the output pointers through unchanged.
+// Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c,FEVAL_UBSPLINE_3D_C)
+  (UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c_vg,FEVAL_UBSPLINE_3D_C_VG)
+  (UBspline_3d_c **spline, double *x, double *y, double *z, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c_vgl,FEVAL_UBSPLINE_3D_C_VGL)
+  (UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_c_vgh,FEVAL_UBSPLINE_3D_C_VGH)
+  (UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 3D double-precision complex //
+/////////////////////////////////
+// Evaluation entry points for UBspline_3d_z (complex_double outputs). All
+// arguments are pointers: the spline handle, the coordinates x, y and z,
+// then the outputs. The definitions in fbspline.c dereference spline, x, y
+// and z and forward to the matching eval_UBspline_3d_z* routine, passing
+// the output pointers through unchanged.
+// Variants by suffix: none = val only, _vg = val + grad,
+// _vgl = val + grad + lapl, _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z,FEVAL_UBSPLINE_3D_Z)
+  (UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z_vg,FEVAL_UBSPLINE_3D_Z_VG)
+  (UBspline_3d_z **spline, double *x, double *y, double *z, 
+   complex_double *val, complex_double *grad);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z_vgl,FEVAL_UBSPLINE_3D_Z_VGL)
+  (UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_ubspline_3d_z_vgh,FEVAL_UBSPLINE_3D_Z_VGH)
+  (UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+
+#undef CFUNC
+#endif
--- a/src/einspline/fmulti_bspline.c
+++ b/src/einspline/fmulti_bspline.c
@ -0,0 +1,908 @@
+#include "multi_bspline_create.h"
+#include "multi_bspline.h"
+#include "fmulti_bspline.h"
+#include "config.h"
+
+// CFUNC prefixes every wrapper definition in this file. Under a C++
+// compiler it must expand to a complete linkage specification,
+// extern "C" (the same spelling fbspline.h uses); a bare "C" string literal
+// in front of a function definition does not compile. Under a C compiler it
+// expands to nothing.
+#ifdef __cplusplus
+#define CFUNC extern "C" /* Avoid name mangling in C++ */
+#else
+#define CFUNC
+#endif
+
+
+///////////////////////
+// Creation routines //
+///////////////////////
+
+////////
+// 1D //
+////////
+// Builds a Ugrid from x0/x1/num_x and a BCtype_s from the boundary codes
+// and values (all read through pointers), then stores the result of
+// create_multi_UBspline_1d_s in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_1d_s,FCREATE_MULTI_UBSPLINE_1D_S)
+  (double *x0, double *x1, int *num_x,
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *num_splines, multi_UBspline_1d_s **spline)
+{
+  Ugrid grid_x;
+  BCtype_s bc_x;
+
+  // Boundary condition: left end, then right end.
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // Grid range and point count.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+
+  *spline = create_multi_UBspline_1d_s (grid_x, bc_x, *num_splines);
+}
+
+// Builds a Ugrid from x0/x1/num_x and a BCtype_d from the boundary codes
+// and values (all read through pointers), then stores the result of
+// create_multi_UBspline_1d_d in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_1d_d,FCREATE_MULTI_UBSPLINE_1D_D)
+  (double *x0, double *x1, int *num_x,
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *num_splines, multi_UBspline_1d_d **spline)
+{
+  Ugrid grid_x;
+  BCtype_d bc_x;
+
+  // Boundary condition: left end, then right end.
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // Grid range and point count.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+
+  *spline = create_multi_UBspline_1d_d (grid_x, bc_x, *num_splines);
+}
+
+// Builds a Ugrid from x0/x1/num_x and a BCtype_c from the boundary codes
+// and complex boundary values (split into real/imaginary parts with
+// crealf/cimagf), then stores the result of create_multi_UBspline_1d_c in
+// *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_1d_c,FCREATE_MULTI_UBSPLINE_1D_C)
+  (double *x0, double *x1, int *num_x,
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *num_splines, multi_UBspline_1d_c **spline)
+{
+  Ugrid grid_x;
+  BCtype_c bc_x;
+
+  // Boundary condition: left end, then right end.
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf(*x0_val);
+  bc_x.lVal_i = cimagf(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf(*x1_val);
+  bc_x.rVal_i = cimagf(*x1_val);
+
+  // Grid range and point count.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+
+  *spline = create_multi_UBspline_1d_c (grid_x, bc_x, *num_splines);
+}
+
+// Builds a Ugrid from x0/x1/num_x and a BCtype_z from the boundary codes
+// and complex boundary values (split into real/imaginary parts with
+// creal/cimag), then stores the result of create_multi_UBspline_1d_z in
+// *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_1d_z,FCREATE_MULTI_UBSPLINE_1D_Z)
+  (double *x0, double *x1, int *num_x,
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *num_splines, multi_UBspline_1d_z **spline)
+{
+  Ugrid grid_x;
+  BCtype_z bc_x;
+
+  // Boundary condition: left end, then right end.
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = creal(*x0_val);
+  bc_x.lVal_i = cimag(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = creal(*x1_val);
+  bc_x.rVal_i = cimag(*x1_val);
+
+  // Grid range and point count.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+
+  *spline = create_multi_UBspline_1d_z (grid_x, bc_x, *num_splines);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_1d_s.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_1d_s,FSET_MULTI_UBSPLINE_1D_S)
+  (multi_UBspline_1d_s **spline, int *spline_num, float *data)
+{
+  multi_UBspline_1d_s *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_1d_s (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_1d_d.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_1d_d,FSET_MULTI_UBSPLINE_1D_D)
+  (multi_UBspline_1d_d **spline, int *spline_num, double *data)
+{
+  multi_UBspline_1d_d *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_1d_d (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_1d_c.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_1d_c,FSET_MULTI_UBSPLINE_1D_C)
+  (multi_UBspline_1d_c **spline, int *spline_num, complex_float *data)
+{
+  multi_UBspline_1d_c *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_1d_c (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_1d_z.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_1d_z,FSET_MULTI_UBSPLINE_1D_Z)
+  (multi_UBspline_1d_z **spline, int *spline_num, complex_double *data)
+{
+  multi_UBspline_1d_z *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_1d_z (handle, index, data);
+}
+
+////////
+// 2D //
+////////
+// Builds one Ugrid and one BCtype_s per axis from the by-pointer arguments,
+// then stores the result of create_multi_UBspline_2d_s in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_2d_s,FCREATE_MULTI_UBSPLINE_2D_S)
+  (double *x0, double *x1, int *num_x,
+   double *y0, double *y1, int *num_y,
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int *num_splines, multi_UBspline_2d_s **spline)
+{
+  Ugrid grid_x, grid_y;
+  BCtype_s bc_x, bc_y;
+
+  // x axis: grid, then boundary condition.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis: grid, then boundary condition.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  *spline = create_multi_UBspline_2d_s (grid_x, grid_y, bc_x, bc_y,
+                                        *num_splines);
+}
+
+
+// Builds one Ugrid and one BCtype_d per axis from the by-pointer arguments,
+// then stores the result of create_multi_UBspline_2d_d in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_2d_d,FCREATE_MULTI_UBSPLINE_2D_D)
+  (double *x0, double *x1, int *num_x,
+   double *y0, double *y1, int *num_y,
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int *num_splines, multi_UBspline_2d_d **spline)
+{
+  Ugrid grid_x, grid_y;
+  BCtype_d bc_x, bc_y;
+
+  // x axis: grid, then boundary condition.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis: grid, then boundary condition.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  *spline = create_multi_UBspline_2d_d (grid_x, grid_y, bc_x, bc_y,
+                                        *num_splines);
+}
+
+// Builds one Ugrid and one BCtype_c per axis from the by-pointer arguments
+// (complex boundary values are split with crealf/cimagf), then stores the
+// result of create_multi_UBspline_2d_c in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_2d_c,FCREATE_MULTI_UBSPLINE_2D_C)
+  (double *x0, double *x1, int *num_x,
+   double *y0, double *y1, int *num_y,
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   int *num_splines, multi_UBspline_2d_c **spline)
+{
+  Ugrid grid_x, grid_y;
+  BCtype_c bc_x, bc_y;
+
+  // x axis: grid, then boundary condition.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf(*x0_val);
+  bc_x.lVal_i = cimagf(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf(*x1_val);
+  bc_x.rVal_i = cimagf(*x1_val);
+
+  // y axis: grid, then boundary condition.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = crealf(*y0_val);
+  bc_y.lVal_i = cimagf(*y0_val);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = crealf(*y1_val);
+  bc_y.rVal_i = cimagf(*y1_val);
+
+  *spline = create_multi_UBspline_2d_c (grid_x, grid_y, bc_x, bc_y,
+                                        *num_splines);
+}
+
+// Builds one Ugrid and one BCtype_z per axis from the by-pointer arguments,
+// then stores the result of create_multi_UBspline_2d_z in *spline.
+//   x0,x1,num_x / y0,y1,num_y : grid start, end and point count per axis
+//   *_code, *_val             : boundary code and complex boundary value
+//                               for each end of each axis
+//   num_splines               : forwarded to create_multi_UBspline_2d_z
+//   spline                    : receives the created handle
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_2d_z,FCREATE_MULTI_UBSPLINE_2D_Z)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   int *num_splines, multi_UBspline_2d_z **spline)
+{
+  Ugrid  xgrid, ygrid;
+  BCtype_z xBC, yBC;
+  xgrid.start = *x0;
+  xgrid.end   = *x1;
+  xgrid.num   = *num_x;
+  ygrid.start = *y0;
+  ygrid.end   = *y1;
+  ygrid.num   = *num_y;
+ 
+  xBC.lCode = (bc_code) *x0_code;
+  xBC.rCode = (bc_code) *x1_code;
+  // The boundary values are complex_double: use creal/cimag rather than the
+  // float variants crealf/cimagf, which would narrow them to float. This
+  // matches the y axis below and the 1D/3D _z creators.
+  xBC.lVal_r  = creal(*x0_val);
+  xBC.lVal_i  = cimag(*x0_val);
+  xBC.rVal_r  = creal(*x1_val);
+  xBC.rVal_i  = cimag(*x1_val);
+  yBC.lCode = (bc_code) *y0_code;
+  yBC.rCode = (bc_code) *y1_code;
+  yBC.lVal_r  = creal(*y0_val);
+  yBC.lVal_i  = cimag(*y0_val);
+  yBC.rVal_r  = creal(*y1_val);
+  yBC.rVal_i  = cimag(*y1_val);
+
+  *spline = create_multi_UBspline_2d_z (xgrid, ygrid, xBC, yBC, *num_splines);
+}
+
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_2d_s.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_2d_s,FSET_MULTI_UBSPLINE_2D_S)
+  (multi_UBspline_2d_s **spline, int *spline_num, float *data)
+{
+  multi_UBspline_2d_s *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_2d_s (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_2d_d.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_2d_d,FSET_MULTI_UBSPLINE_2D_D)
+  (multi_UBspline_2d_d **spline, int *spline_num, double *data)
+{
+  multi_UBspline_2d_d *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_2d_d (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_2d_c.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_2d_c,FSET_MULTI_UBSPLINE_2D_C)
+  (multi_UBspline_2d_c **spline, int *spline_num, complex_float *data)
+{
+  multi_UBspline_2d_c *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_2d_c (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_2d_z.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_2d_z,FSET_MULTI_UBSPLINE_2D_Z)
+  (multi_UBspline_2d_z **spline, int *spline_num, complex_double *data)
+{
+  multi_UBspline_2d_z *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_2d_z (handle, index, data);
+}
+
+
+////////
+// 3D //
+////////
+// Builds one Ugrid and one BCtype_s per axis from the by-pointer arguments,
+// then stores the result of create_multi_UBspline_3d_s in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_3d_s,FCREATE_MULTI_UBSPLINE_3D_S)
+  (double *x0, double *x1, int *num_x,
+   double *y0, double *y1, int *num_y,
+   double *z0, double *z1, int *num_z,
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int *z0_code, float *z0_val, int *z1_code, float *z1_val,
+   int *num_splines, multi_UBspline_3d_s **spline)
+{
+  Ugrid grid_x, grid_y, grid_z;
+  BCtype_s bc_x, bc_y, bc_z;
+
+  // x axis: grid, then boundary condition.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis: grid, then boundary condition.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  // z axis: grid, then boundary condition.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode = (bc_code) *z0_code;
+  bc_z.lVal  = *z0_val;
+  bc_z.rCode = (bc_code) *z1_code;
+  bc_z.rVal  = *z1_val;
+
+  *spline = create_multi_UBspline_3d_s (grid_x, grid_y, grid_z,
+                                        bc_x, bc_y, bc_z, *num_splines);
+}
+
+// Builds one Ugrid and one BCtype_d per axis from the by-pointer arguments,
+// then stores the result of create_multi_UBspline_3d_d in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_3d_d,FCREATE_MULTI_UBSPLINE_3D_D)
+  (double *x0, double *x1, int *num_x,
+   double *y0, double *y1, int *num_y,
+   double *z0, double *z1, int *num_z,
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int *z0_code, double *z0_val, int *z1_code, double *z1_val,
+   int *num_splines, multi_UBspline_3d_d **spline)
+{
+  Ugrid grid_x, grid_y, grid_z;
+  BCtype_d bc_x, bc_y, bc_z;
+
+  // x axis: grid, then boundary condition.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y axis: grid, then boundary condition.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  // z axis: grid, then boundary condition.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode = (bc_code) *z0_code;
+  bc_z.lVal  = *z0_val;
+  bc_z.rCode = (bc_code) *z1_code;
+  bc_z.rVal  = *z1_val;
+
+  *spline = create_multi_UBspline_3d_d (grid_x, grid_y, grid_z,
+                                        bc_x, bc_y, bc_z, *num_splines);
+}
+
+// Builds one Ugrid and one BCtype_c per axis from the by-pointer arguments
+// (complex boundary values are split with crealf/cimagf), then stores the
+// result of create_multi_UBspline_3d_c in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_3d_c,FCREATE_MULTI_UBSPLINE_3D_C)
+  (double *x0, double *x1, int *num_x,
+   double *y0, double *y1, int *num_y,
+   double *z0, double *z1, int *num_z,
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
+   int *num_splines, multi_UBspline_3d_c **spline)
+{
+  Ugrid grid_x, grid_y, grid_z;
+  BCtype_c bc_x, bc_y, bc_z;
+
+  // x axis: grid, then boundary condition.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf(*x0_val);
+  bc_x.lVal_i = cimagf(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf(*x1_val);
+  bc_x.rVal_i = cimagf(*x1_val);
+
+  // y axis: grid, then boundary condition.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = crealf(*y0_val);
+  bc_y.lVal_i = cimagf(*y0_val);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = crealf(*y1_val);
+  bc_y.rVal_i = cimagf(*y1_val);
+
+  // z axis: grid, then boundary condition.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode  = (bc_code) *z0_code;
+  bc_z.lVal_r = crealf(*z0_val);
+  bc_z.lVal_i = cimagf(*z0_val);
+  bc_z.rCode  = (bc_code) *z1_code;
+  bc_z.rVal_r = crealf(*z1_val);
+  bc_z.rVal_i = cimagf(*z1_val);
+
+  *spline = create_multi_UBspline_3d_c (grid_x, grid_y, grid_z,
+                                        bc_x, bc_y, bc_z, *num_splines);
+}
+
+// Builds one Ugrid and one BCtype_z per axis from the by-pointer arguments
+// (complex boundary values are split with creal/cimag), then stores the
+// result of create_multi_UBspline_3d_z in *spline.
+CFUNC void
+F77_FUNC_(fcreate_multi_ubspline_3d_z,FCREATE_MULTI_UBSPLINE_3D_Z)
+  (double *x0, double *x1, int *num_x,
+   double *y0, double *y1, int *num_y,
+   double *z0, double *z1, int *num_z,
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   int *z0_code, complex_double *z0_val, int *z1_code, complex_double *z1_val,
+   int *num_splines, multi_UBspline_3d_z **spline)
+{
+  Ugrid grid_x, grid_y, grid_z;
+  BCtype_z bc_x, bc_y, bc_z;
+
+  // x axis: grid, then boundary condition.
+  grid_x.start = *x0;
+  grid_x.end   = *x1;
+  grid_x.num   = *num_x;
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = creal(*x0_val);
+  bc_x.lVal_i = cimag(*x0_val);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = creal(*x1_val);
+  bc_x.rVal_i = cimag(*x1_val);
+
+  // y axis: grid, then boundary condition.
+  grid_y.start = *y0;
+  grid_y.end   = *y1;
+  grid_y.num   = *num_y;
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = creal(*y0_val);
+  bc_y.lVal_i = cimag(*y0_val);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = creal(*y1_val);
+  bc_y.rVal_i = cimag(*y1_val);
+
+  // z axis: grid, then boundary condition.
+  grid_z.start = *z0;
+  grid_z.end   = *z1;
+  grid_z.num   = *num_z;
+  bc_z.lCode  = (bc_code) *z0_code;
+  bc_z.lVal_r = creal(*z0_val);
+  bc_z.lVal_i = cimag(*z0_val);
+  bc_z.rCode  = (bc_code) *z1_code;
+  bc_z.rVal_r = creal(*z1_val);
+  bc_z.rVal_i = cimag(*z1_val);
+
+  *spline = create_multi_UBspline_3d_z (grid_x, grid_y, grid_z,
+                                        bc_x, bc_y, bc_z, *num_splines);
+}
+
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_3d_s.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_3d_s,FSET_MULTI_UBSPLINE_3D_S)
+  (multi_UBspline_3d_s **spline, int *spline_num, float *data)
+{
+  multi_UBspline_3d_s *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_3d_s (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_3d_d.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_3d_d,FSET_MULTI_UBSPLINE_3D_D)
+  (multi_UBspline_3d_d **spline, int *spline_num, double *data)
+{
+  multi_UBspline_3d_d *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_3d_d (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_3d_c.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_3d_c,FSET_MULTI_UBSPLINE_3D_C)
+  (multi_UBspline_3d_c **spline, int *spline_num, complex_float *data)
+{
+  multi_UBspline_3d_c *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_3d_c (handle, index, data);
+}
+
+// Unpacks the by-pointer handle and spline index and forwards them, with
+// the data pointer, to set_multi_UBspline_3d_z.
+CFUNC void
+F77_FUNC_(fset_multi_ubspline_3d_z,FSET_MULTI_UBSPLINE_3D_Z)
+  (multi_UBspline_3d_z **spline, int *spline_num, complex_double *data)
+{
+  multi_UBspline_3d_z *handle = *spline;
+  const int index = *spline_num;
+
+  set_multi_UBspline_3d_z (handle, index, data);
+}
+
+
+/////////////////////////
+// Evaluation routines //
+/////////////////////////
+
+//////////////////////////////
+// 1D single-precision real //
+//////////////////////////////
+// Unpacks the by-pointer handle and coordinate and forwards them, with the
+// val output pointer, to eval_multi_UBspline_1d_s.
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_s,FEVAL_MULTI_UBSPLINE_1D_S)
+  (multi_UBspline_1d_s **spline, double *x, float *val)
+{
+  multi_UBspline_1d_s *handle = *spline;
+  const double px = *x;
+
+  eval_multi_UBspline_1d_s (handle, px, val);
+}
+
+// Unpacks the by-pointer handle and coordinate and forwards them, with the
+// val/grad output pointers, to eval_multi_UBspline_1d_s_vg.
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_s_vg,FEVAL_MULTI_UBSPLINE_1D_S_VG)
+  (multi_UBspline_1d_s **spline, double *x, float *val, float *grad)
+{
+  multi_UBspline_1d_s *handle = *spline;
+  const double px = *x;
+
+  eval_multi_UBspline_1d_s_vg (handle, px, val, grad);
+}
+
+// Unpacks the by-pointer handle and coordinate and forwards them, with the
+// val/grad/lapl output pointers, to eval_multi_UBspline_1d_s_vgl.
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_s_vgl,FEVAL_MULTI_UBSPLINE_1D_S_VGL)
+  (multi_UBspline_1d_s **spline, double *x,
+   float *val, float *grad, float *lapl)
+{
+  multi_UBspline_1d_s *handle = *spline;
+  const double px = *x;
+
+  eval_multi_UBspline_1d_s_vgl (handle, px, val, grad, lapl);
+}
+
+// Unpacks the by-pointer handle and coordinate and forwards them, with the
+// val/grad/hess output pointers, to eval_multi_UBspline_1d_s_vgh.
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_s_vgh,FEVAL_MULTI_UBSPLINE_1D_S_VGH)
+  (multi_UBspline_1d_s **spline, double *x,
+   float *val, float *grad, float *hess)
+{
+  multi_UBspline_1d_s *handle = *spline;
+  const double px = *x;
+
+  eval_multi_UBspline_1d_s_vgh (handle, px, val, grad, hess);
+}
+
+//////////////////////////////
+// 1D double-precision real //
+//////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_d,FEVAL_MULTI_UBSPLINE_1D_D)
+  (multi_UBspline_1d_d **spline, double *x, double *val)
+{
+  multi_UBspline_1d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_d (handle, *x, val);   /* x is read through its pointer; val is forwarded */
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_d_vg,FEVAL_MULTI_UBSPLINE_1D_D_VG)
+  (multi_UBspline_1d_d **spline, double *x,
+   double *val, double *grad)
+{
+  multi_UBspline_1d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_d_vg (handle, *x, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_d_vgl,FEVAL_MULTI_UBSPLINE_1D_D_VGL)
+  (multi_UBspline_1d_d **spline, double *x,
+   double *val, double *grad, double *lapl)
+{
+  multi_UBspline_1d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_d_vgl (handle, *x, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_d_vgh,FEVAL_MULTI_UBSPLINE_1D_D_VGH)
+  (multi_UBspline_1d_d **spline, double *x,
+   double *val, double *grad, double *hess)
+{
+  multi_UBspline_1d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_d_vgh (handle, *x, val, grad, hess);
+}
+
+/////////////////////////////////
+// 1D single-precision complex //
+/////////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_c,FEVAL_MULTI_UBSPLINE_1D_C)
+  (multi_UBspline_1d_c **spline, double *x, complex_float *val)
+{
+  multi_UBspline_1d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_c (handle, *x, val);   /* x is read through its pointer; val is forwarded */
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_c_vg,FEVAL_MULTI_UBSPLINE_1D_C_VG)
+  (multi_UBspline_1d_c **spline, double *x,
+   complex_float *val, complex_float *grad)
+{
+  multi_UBspline_1d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_c_vg (handle, *x, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_c_vgl,FEVAL_MULTI_UBSPLINE_1D_C_VGL)
+  (multi_UBspline_1d_c **spline, double *x,
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  multi_UBspline_1d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_c_vgl (handle, *x, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_c_vgh,FEVAL_MULTI_UBSPLINE_1D_C_VGH)
+  (multi_UBspline_1d_c **spline, double *x,
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  multi_UBspline_1d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_c_vgh (handle, *x, val, grad, hess);
+}
+
+/////////////////////////////////
+// 1D double-precision complex //
+/////////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_z,FEVAL_MULTI_UBSPLINE_1D_Z)
+  (multi_UBspline_1d_z **spline, double *x, complex_double *val)
+{
+  multi_UBspline_1d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_z (handle, *x, val);   /* x is read through its pointer; val is forwarded */
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_z_vg,FEVAL_MULTI_UBSPLINE_1D_Z_VG)
+  (multi_UBspline_1d_z **spline, double *x,
+   complex_double *val, complex_double *grad)
+{
+  multi_UBspline_1d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_z_vg (handle, *x, val, grad);
+}
+ 
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_z_vgl,FEVAL_MULTI_UBSPLINE_1D_Z_VGL)
+  (multi_UBspline_1d_z **spline, double *x,
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  multi_UBspline_1d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_z_vgl (handle, *x, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_1d_z_vgh,FEVAL_MULTI_UBSPLINE_1D_Z_VGH)
+  (multi_UBspline_1d_z **spline, double *x,
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  multi_UBspline_1d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_1d_z_vgh (handle, *x, val, grad, hess);
+}
+
+//////////////////////////////
+// 2D single-precision real //
+//////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_s,FEVAL_MULTI_UBSPLINE_2D_S)
+  (multi_UBspline_2d_s **spline, double *x, double *y, float *val)
+{
+  multi_UBspline_2d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_s (handle, *x, *y, val);  /* coordinates are read through their pointers */
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_s_vg,FEVAL_MULTI_UBSPLINE_2D_S_VG)
+  (multi_UBspline_2d_s **spline, double *x, double *y,
+   float *val, float *grad)
+{
+  multi_UBspline_2d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_s_vg (handle, *x, *y, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_s_vgl,FEVAL_MULTI_UBSPLINE_2D_S_VGL)
+  (multi_UBspline_2d_s **spline, double *x, double *y,
+   float *val, float *grad, float* lapl)
+{
+  multi_UBspline_2d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_s_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_s_vgh,FEVAL_MULTI_UBSPLINE_2D_S_VGH)
+  (multi_UBspline_2d_s **spline, double *x, double *y,
+   float *val, float *grad, float *hess)
+{
+  multi_UBspline_2d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_s_vgh (handle, *x, *y, val, grad, hess);
+}
+
+//////////////////////////////
+// 2D double-precision real //
+//////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_d,FEVAL_MULTI_UBSPLINE_2D_D)
+  (multi_UBspline_2d_d **spline, double *x, double *y, double *val)
+{
+  multi_UBspline_2d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_d (handle, *x, *y, val);  /* coordinates are read through their pointers */
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_d_vg,FEVAL_MULTI_UBSPLINE_2D_D_VG)
+  (multi_UBspline_2d_d **spline, double *x, double *y,
+   double *val, double *grad)
+{
+  multi_UBspline_2d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_d_vg (handle, *x, *y, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_d_vgl,FEVAL_MULTI_UBSPLINE_2D_D_VGL)
+  (multi_UBspline_2d_d **spline, double *x, double *y,
+   double *val, double *grad, double *lapl)
+{
+  multi_UBspline_2d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_d_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+/* 2D double-precision value/gradient/Hessian wrapper: forwards to the _vgh evaluator (previously called _vgl by mistake). */
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_d_vgh,FEVAL_MULTI_UBSPLINE_2D_D_VGH)
+  (multi_UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *hess)
+{
+  eval_multi_UBspline_2d_d_vgh (*spline, *x, *y, val, grad, hess);  /* handle and coordinates are dereferenced; output buffers are forwarded */
+}
+
+/////////////////////////////////
+// 2D single-precision complex //
+/////////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_c,FEVAL_MULTI_UBSPLINE_2D_C)
+  (multi_UBspline_2d_c **spline, double *x, double *y, complex_float *val)
+{
+  multi_UBspline_2d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_c (handle, *x, *y, val);  /* coordinates are read through their pointers */
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_c_vg,FEVAL_MULTI_UBSPLINE_2D_C_VG)
+  (multi_UBspline_2d_c **spline, double *x, double *y,
+   complex_float *val, complex_float *grad)
+{
+  multi_UBspline_2d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_c_vg (handle, *x, *y, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_c_vgl,FEVAL_MULTI_UBSPLINE_2D_C_VGL)
+  (multi_UBspline_2d_c **spline, double *x, double *y,
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  multi_UBspline_2d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_c_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_c_vgh,FEVAL_MULTI_UBSPLINE_2D_C_VGH)
+  (multi_UBspline_2d_c **spline, double *x, double *y,
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  multi_UBspline_2d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_c_vgh (handle, *x, *y, val, grad, hess);
+}
+
+/////////////////////////////////
+// 2D double-precision complex //
+/////////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_z,FEVAL_MULTI_UBSPLINE_2D_Z)
+  (multi_UBspline_2d_z **spline, double *x, double *y, complex_double *val)
+{
+  multi_UBspline_2d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_z (handle, *x, *y, val);  /* coordinates are read through their pointers */
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_z_vg,FEVAL_MULTI_UBSPLINE_2D_Z_VG)
+  (multi_UBspline_2d_z **spline, double *x, double *y,
+   complex_double *val, complex_double *grad)
+{
+  multi_UBspline_2d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_z_vg (handle, *x, *y, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_z_vgl,FEVAL_MULTI_UBSPLINE_2D_Z_VGL)
+  (multi_UBspline_2d_z **spline, double *x, double *y,
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  multi_UBspline_2d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_z_vgl (handle, *x, *y, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_2d_z_vgh,FEVAL_MULTI_UBSPLINE_2D_Z_VGH)
+  (multi_UBspline_2d_z **spline, double *x, double *y,
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  multi_UBspline_2d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_2d_z_vgh (handle, *x, *y, val, grad, hess);
+}
+
+
+
+//////////////////////////////
+// 3D single-precision real //
+//////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_s,FEVAL_MULTI_UBSPLINE_3D_S)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val)
+{
+  multi_UBspline_3d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_s (handle, *x, *y, *z, val);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_s_vg,FEVAL_MULTI_UBSPLINE_3D_S_VG)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad)
+{
+  multi_UBspline_3d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_s_vg (handle, *x, *y, *z, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_s_vgl,FEVAL_MULTI_UBSPLINE_3D_S_VGL)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad, float* lapl)
+{
+  multi_UBspline_3d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_s_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_s_vgh,FEVAL_MULTI_UBSPLINE_3D_S_VGH)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad, float *hess)
+{
+  multi_UBspline_3d_s *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_s_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+//////////////////////////////
+// 3D double-precision real //
+//////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_d,FEVAL_MULTI_UBSPLINE_3D_D)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val)
+{
+  multi_UBspline_3d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_d (handle, *x, *y, *z, val);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_d_vg,FEVAL_MULTI_UBSPLINE_3D_D_VG)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad)
+{
+  multi_UBspline_3d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_d_vg (handle, *x, *y, *z, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_d_vgl,FEVAL_MULTI_UBSPLINE_3D_D_VGL)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad, double *lapl)
+{
+  multi_UBspline_3d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_d_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_d_vgh,FEVAL_MULTI_UBSPLINE_3D_D_VGH)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad, double *hess)
+{
+  multi_UBspline_3d_d *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_d_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+/////////////////////////////////
+// 3D single-precision complex //
+/////////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_c,FEVAL_MULTI_UBSPLINE_3D_C)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val)
+{
+  multi_UBspline_3d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_c (handle, *x, *y, *z, val);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_c_vg,FEVAL_MULTI_UBSPLINE_3D_C_VG)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad)
+{
+  multi_UBspline_3d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_c_vg (handle, *x, *y, *z, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_c_vgl,FEVAL_MULTI_UBSPLINE_3D_C_VGL)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  multi_UBspline_3d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_c_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_c_vgh,FEVAL_MULTI_UBSPLINE_3D_C_VGH)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  multi_UBspline_3d_c *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_c_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+/////////////////////////////////
+// 3D double-precision complex //
+/////////////////////////////////
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_z,FEVAL_MULTI_UBSPLINE_3D_Z)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val)
+{
+  multi_UBspline_3d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_z (handle, *x, *y, *z, val);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_z_vg,FEVAL_MULTI_UBSPLINE_3D_Z_VG)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad)
+{
+  multi_UBspline_3d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_z_vg (handle, *x, *y, *z, val, grad);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_z_vgl,FEVAL_MULTI_UBSPLINE_3D_Z_VGL)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  multi_UBspline_3d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_z_vgl (handle, *x, *y, *z, val, grad, lapl);
+}
+
+CFUNC void F77_FUNC_(feval_multi_ubspline_3d_z_vgh,FEVAL_MULTI_UBSPLINE_3D_Z_VGH)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  multi_UBspline_3d_z *const handle = *spline;  /* unwrap the by-reference handle */
+  eval_multi_UBspline_3d_z_vgh (handle, *x, *y, *z, val, grad, hess);
+}
+
+
--- a/src/einspline/fmulti_bspline.h
+++ b/src/einspline/fmulti_bspline.h
@ -0,0 +1,440 @@
+#ifndef FMULTI_BSPLINE_H
+#define FMULTI_BSPLINE_H
+
+#include "config.h"
+#include "bspline_base.h"
+#include "bspline_create.h"
+
+#ifdef __cplusplus
+#define CFUNC extern "C" /* Avoid name mangling in C++ */
+#else
+#define CFUNC
+#endif
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                       Creation routines                      ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+////////
+// 1D //
+////////
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_1d_s,FCREATE_MULTI_UBSPLINE_1D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *num_spline, multi_UBspline_1d_s **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_1d_d,FCREATE_MULTI_UBSPLINE_1D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *num_splines, multi_UBspline_1d_d **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_1d_c,FCREATE_MULTI_UBSPLINE_1D_C)
+  (double   *x0, double    *x1, int   *num_x, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *num_splines, multi_UBspline_1d_c **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_1d_z,FCREATE_MULTI_UBSPLINE_1D_Z)
+  (double   *x0, double     *x1, int   *num_x, 
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *num_splines, multi_UBspline_1d_z **spline);
+
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_1d_s,FSET_MULTI_UBSPLINE_1D_S)
+  (multi_UBspline_1d_s **spline, int *spline_num, float *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_1d_d,FSET_MULTI_UBSPLINE_1D_D)
+  (multi_UBspline_1d_d **spline, int *spline_num, double *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_1d_c,FSET_MULTI_UBSPLINE_1D_C)
+  (multi_UBspline_1d_c **spline, int *spline_num, complex_float *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_1d_z,FSET_MULTI_UBSPLINE_1D_Z)
+  (multi_UBspline_1d_z **spline, int *spline_num, complex_double *data);
+
+////////
+// 2D //
+////////
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_2d_s,FCREATE_MULTI_UBSPLINE_2D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int *num_splines, multi_UBspline_2d_s **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_2d_d,FCREATE_MULTI_UBSPLINE_2D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   double   *y0, double     *y1, int   *num_y, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int *num_splines, multi_UBspline_2d_d **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_2d_c,FCREATE_MULTI_UBSPLINE_2D_C)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   int *num_splines, multi_UBspline_2d_c **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_2d_z,FCREATE_MULTI_UBSPLINE_2D_Z)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   int *x0_code, complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   int *num_splines, multi_UBspline_2d_z **spline);
+
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_2d_s,FSET_MULTI_UBSPLINE_2D_S)
+  (multi_UBspline_2d_s **spline, int *spline_num, float *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_2d_d,FSET_MULTI_UBSPLINE_2D_D)
+  (multi_UBspline_2d_d **spline, int *spline_num, double *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_2d_c,FSET_MULTI_UBSPLINE_2D_C)
+  (multi_UBspline_2d_c **spline, int *spline_num, complex_float *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_2d_z,FSET_MULTI_UBSPLINE_2D_Z)
+  (multi_UBspline_2d_z **spline, int *spline_num, complex_double *data);
+
+////////
+// 3D //
+////////
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_3d_s,FCREATE_MULTI_UBSPLINE_3D_S)
+  (double   *x0, double    *x1, int   *num_x, 
+   double   *y0, double    *y1, int   *num_y, 
+   double   *z0, double    *z1, int   *num_z, 
+   int *x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int *y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int *z0_code, float *z0_val, int *z1_code, float *z1_val,
+   int *num_splines, multi_UBspline_3d_s **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_3d_d,FCREATE_MULTI_UBSPLINE_3D_D)
+  (double   *x0, double     *x1, int   *num_x, 
+   double   *y0, double     *y1, int   *num_y, 
+   double   *z0, double     *z1, int   *num_z, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int *z0_code, double *z0_val, int *z1_code, double *z1_val,
+   int *num_splines, multi_UBspline_3d_d **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_3d_c,FCREATE_MULTI_UBSPLINE_3D_C)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   double *z0, double *z1, int *num_z, 
+   int *x0_code, complex_float *x0_val, int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, int *y1_code, complex_float *y1_val,
+   int *z0_code, complex_float *z0_val, int *z1_code, complex_float *z1_val,
+   int *num_splines, multi_UBspline_3d_c **spline);
+CFUNC void 
+F77_FUNC_(fcreate_multi_ubspline_3d_z,FCREATE_MULTI_UBSPLINE_3D_Z)
+  (double *x0, double *x1, int *num_x, 
+   double *y0, double *y1, int *num_y, 
+   double *z0, double *z1, int *num_z, 
+   int *x0_code,  complex_double *x0_val, int *x1_code, complex_double *x1_val,
+   int *y0_code,  complex_double *y0_val, int *y1_code, complex_double *y1_val,
+   int *z0_code,  complex_double *z0_val, int *z1_code, complex_double *z1_val,
+   int *num_splines, multi_UBspline_3d_z **spline);
+
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_3d_s,FSET_MULTI_UBSPLINE_3D_S)
+  (multi_UBspline_3d_s **spline, int *spline_num, float *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_3d_d,FSET_MULTI_UBSPLINE_3D_D)
+  (multi_UBspline_3d_d **spline, int *spline_num, double *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_3d_c,FSET_MULTI_UBSPLINE_3D_C)
+  (multi_UBspline_3d_c **spline, int *spline_num, complex_float *data);
+CFUNC void 
+F77_FUNC_(fset_multi_ubspline_3d_z,FSET_MULTI_UBSPLINE_3D_Z)
+  (multi_UBspline_3d_z **spline, int *spline_num, complex_double *data);
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                      Destruction routine                     ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+CFUNC void
+F77_FUNC_(fdestroy_bspline,FDESTROY_BSPLINE)
+  (Bspline **spline);
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                      Evaluation routines                     ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+//////////////////////////////
+// 1D single-precision real //
+//////////////////////////////
+CFUNC void  /* value only; named feval_multi_* like every other multi_UBspline prototype in this header */
+F77_FUNC_(feval_multi_ubspline_1d_s,FEVAL_MULTI_UBSPLINE_1D_S)
+  (multi_UBspline_1d_s **spline, double *x, float *val);
+
+CFUNC void  /* value + gradient */
+F77_FUNC_(feval_multi_ubspline_1d_s_vg,FEVAL_MULTI_UBSPLINE_1D_S_VG)
+  (multi_UBspline_1d_s **spline, double *x, float *val, float *grad);
+
+CFUNC void  /* value + gradient + Laplacian */
+F77_FUNC_(feval_multi_ubspline_1d_s_vgl,FEVAL_MULTI_UBSPLINE_1D_S_VGL)
+  (multi_UBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *lapl);
+
+CFUNC void  /* value + gradient + Hessian */
+F77_FUNC_(feval_multi_ubspline_1d_s_vgh,FEVAL_MULTI_UBSPLINE_1D_S_VGH)
+  (multi_UBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 1D double-precision real //
+//////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_d,FEVAL_MULTI_UBSPLINE_1D_D)
+  (multi_UBspline_1d_d **spline, double *x, double *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_d_vg,FEVAL_MULTI_UBSPLINE_1D_D_VG)
+  (multi_UBspline_1d_d **spline, double *x, 
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_d_vgl,FEVAL_MULTI_UBSPLINE_1D_D_VGL)
+  (multi_UBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_d_vgh,FEVAL_MULTI_UBSPLINE_1D_D_VGH)
+  (multi_UBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 1D single-precision complex //
+/////////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_c,FEVAL_MULTI_UBSPLINE_1D_C)
+  (multi_UBspline_1d_c **spline, double *x, complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_c_vg,FEVAL_MULTI_UBSPLINE_1D_C_VG)
+  (multi_UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_c_vgl,FEVAL_MULTI_UBSPLINE_1D_C_VGL)
+  (multi_UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_c_vgh,FEVAL_MULTI_UBSPLINE_1D_C_VGH)
+  (multi_UBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 1D double-precision complex //
+/////////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_z,FEVAL_MULTI_UBSPLINE_1D_Z)
+  (multi_UBspline_1d_z **spline, double *x, complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_z_vg,FEVAL_MULTI_UBSPLINE_1D_Z_VG)
+  (multi_UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad);
+ 
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_z_vgl,FEVAL_MULTI_UBSPLINE_1D_Z_VGL)
+  (multi_UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_1d_z_vgh,FEVAL_MULTI_UBSPLINE_1D_Z_VGH)
+  (multi_UBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+//////////////////////////////
+// 2D single-precision real //
+//////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_s,FEVAL_MULTI_UBSPLINE_2D_S)
+  (multi_UBspline_2d_s **spline, double *x, double *y, float *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_s_vg,FEVAL_MULTI_UBSPLINE_2D_S_VG)
+  (multi_UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_s_vgl,FEVAL_MULTI_UBSPLINE_2D_S_VGL)
+  (multi_UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float* lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_s_vgh,FEVAL_MULTI_UBSPLINE_2D_S_VGH)
+  (multi_UBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 2D double-precision real //
+//////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_d,FEVAL_MULTI_UBSPLINE_2D_D)
+  (multi_UBspline_2d_d **spline, double *x, double *y, double *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_d_vg,FEVAL_MULTI_UBSPLINE_2D_D_VG)
+  (multi_UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_d_vgl,FEVAL_MULTI_UBSPLINE_2D_D_VGL)
+  (multi_UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_d_vgh,FEVAL_MULTI_UBSPLINE_2D_D_VGH)
+  (multi_UBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 2D single-precision complex //
+/////////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_c,FEVAL_MULTI_UBSPLINE_2D_C)
+  (multi_UBspline_2d_c **spline, double *x, double *y, complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_c_vg,FEVAL_MULTI_UBSPLINE_2D_C_VG)
+  (multi_UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_c_vgl,FEVAL_MULTI_UBSPLINE_2D_C_VGL)
+  (multi_UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_c_vgh,FEVAL_MULTI_UBSPLINE_2D_C_VGH)
+  (multi_UBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 2D double-precision complex //
+/////////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_z,FEVAL_MULTI_UBSPLINE_2D_Z)
+  (multi_UBspline_2d_z **spline, double *x, double *y, complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_z_vg,FEVAL_MULTI_UBSPLINE_2D_Z_VG)
+  (multi_UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_z_vgl,FEVAL_MULTI_UBSPLINE_2D_Z_VGL)
+  (multi_UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_2d_z_vgh,FEVAL_MULTI_UBSPLINE_2D_Z_VGH)
+  (multi_UBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+
+//////////////////////////////
+// 3D single-precision real //
+//////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_s,FEVAL_MULTI_UBSPLINE_3D_S)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_s_vg,FEVAL_MULTI_UBSPLINE_3D_S_VG)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_s_vgl,FEVAL_MULTI_UBSPLINE_3D_S_VGL)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad, float* lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_s_vgh,FEVAL_MULTI_UBSPLINE_3D_S_VGH)
+  (multi_UBspline_3d_s **spline, double *x, double *y, double *z, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 3D double-precision real //
+//////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_d,FEVAL_MULTI_UBSPLINE_3D_D)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_d_vg,FEVAL_MULTI_UBSPLINE_3D_D_VG)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_d_vgl,FEVAL_MULTI_UBSPLINE_3D_D_VGL)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,  
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_d_vgh,FEVAL_MULTI_UBSPLINE_3D_D_VGH)
+  (multi_UBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 3D single-precision complex //
+/////////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_c,FEVAL_MULTI_UBSPLINE_3D_C)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_c_vg,FEVAL_MULTI_UBSPLINE_3D_C_VG)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_c_vgl,FEVAL_MULTI_UBSPLINE_3D_C_VGL)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_c_vgh,FEVAL_MULTI_UBSPLINE_3D_C_VGH)
+  (multi_UBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 3D double-precision complex //
+/////////////////////////////////
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_z,FEVAL_MULTI_UBSPLINE_3D_Z)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_z_vg,FEVAL_MULTI_UBSPLINE_3D_Z_VG)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z, 
+   complex_double *val, complex_double *grad);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_z_vgl,FEVAL_MULTI_UBSPLINE_3D_Z_VGL)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_multi_ubspline_3d_z_vgh,FEVAL_MULTI_UBSPLINE_3D_Z_VGH)
+  (multi_UBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+
+#undef CFUNC
+#endif
--- a/src/einspline/fnubspline.c
+++ b/src/einspline/fnubspline.c
@ -0,0 +1,763 @@
+#include "fnubspline.h"
+
+#include "config.h"
+#include "nubspline_create.h"
+#include "nubspline.h"
+
+#ifdef __cplusplus
+#define CFUNC extern "C" /* Avoid name mangling in C++ */
+#else
+#define CFUNC
+#endif
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                    Grid Creation routines                    ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+// Wrapper exported under the F77_FUNC_-mangled name.  The point count
+// arrives by reference; it is dereferenced and passed with the points array
+// to create_general_grid, whose result is stored through the grid handle.
+CFUNC void 
+F77_FUNC_(fcreate_general_grid,FCREATE_GENERAL_GRID)
+  (double *points, int *num_points, NUgrid **grid)
+{
+  const int count = *num_points;
+
+  *grid = create_general_grid (points, count);
+}
+
+// Wrapper exported under the F77_FUNC_-mangled name.  start, end, ratio and
+// num_points all arrive by reference; their values are handed to
+// create_center_grid and the result is stored through the grid handle.
+CFUNC void 
+F77_FUNC_(fcreate_center_grid,FCREATE_CENTER_GRID)
+  (double *start, double *end, double *ratio,
+   int *num_points, NUgrid **grid)
+{
+  const double first = *start;
+  const double last  = *end;
+  const double r     = *ratio;
+  const int    count = *num_points;
+
+  *grid = create_center_grid (first, last, r, count);
+}
+
+// Wrapper exported under the F77_FUNC_-mangled name: hands the grid pointer
+// held in the caller's handle to destroy_grid.  The handle itself is not
+// modified here.
+CFUNC void
+F77_FUNC_(fdestroy_grid,FDESTROY_GRID)
+  (NUgrid **grid)
+{
+  NUgrid *target = *grid;
+
+  destroy_grid (target);
+}
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////            Nonuniform spline creation routines               ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+////////
+// 1D //
+////////
+// Creation wrapper (1D, single-precision real) exported under the
+// F77_FUNC_-mangled name.  All arguments arrive by reference: the integer
+// codes are cast to bc_code and packed with the boundary values into a
+// BCtype_s (l* fields from x0_*, r* fields from x1_*), which is passed with
+// the dereferenced grid and data to create_NUBspline_1d_s.  The result is
+// stored through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_s,FCREATE_NUBSPLINE_1D_S)
+  (NUgrid **x_grid, 
+   int* x0_code, float *x0_val, int *x1_code, float *x1_val,
+   float *data, NUBspline_1d_s **spline)
+{
+  BCtype_s bc_x;
+
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  *spline = create_NUBspline_1d_s (*x_grid, bc_x, data);
+}
+
+
+// Creation wrapper (1D, double-precision real) exported under the
+// F77_FUNC_-mangled name.  The by-reference codes and values are packed into
+// a BCtype_d (l* fields from x0_*, r* fields from x1_*) and passed with the
+// dereferenced grid and data to create_NUBspline_1d_d; the result is stored
+// through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_d,FCREATE_NUBSPLINE_1D_D)
+  (NUgrid **x_grid, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   double *data, NUBspline_1d_d **spline)
+{
+  BCtype_d bc_x;
+
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  *spline = create_NUBspline_1d_d (*x_grid, bc_x, data);
+}
+
+// Creation wrapper (1D, single-precision complex) exported under the
+// F77_FUNC_-mangled name.  Each complex boundary value is split with
+// crealf/cimagf into the _r/_i fields of a BCtype_c (l* from x0_*, r* from
+// x1_*), which is passed with the dereferenced grid and data to
+// create_NUBspline_1d_c; the result is stored through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_c,FCREATE_NUBSPLINE_1D_C)
+  (NUgrid **x_grid, 
+   int *x0_code, complex_float *x0_val, 
+   int *x1_code, complex_float *x1_val,
+   complex_float *data, NUBspline_1d_c **spline)
+{
+  const complex_float x_lo = *x0_val;
+  const complex_float x_hi = *x1_val;
+  BCtype_c bc_x;
+
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf (x_lo);
+  bc_x.lVal_i = cimagf (x_lo);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf (x_hi);
+  bc_x.rVal_i = cimagf (x_hi);
+
+  *spline = create_NUBspline_1d_c (*x_grid, bc_x, data);
+}
+
+// Creation wrapper (1D, double-precision complex) exported under the
+// F77_FUNC_-mangled name.  Each complex boundary value is split with
+// creal/cimag into the _r/_i fields of a BCtype_z (l* from x0_*, r* from
+// x1_*), which is passed with the dereferenced grid and data to
+// create_NUBspline_1d_z; the result is stored through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_z,FCREATE_NUBSPLINE_1D_Z)
+  (NUgrid **x_grid, 
+   int *x0_code, complex_double *x0_val, 
+   int *x1_code, complex_double *x1_val,
+   complex_double *data, NUBspline_1d_z **spline)
+{
+  const complex_double x_lo = *x0_val;
+  const complex_double x_hi = *x1_val;
+  BCtype_z bc_x;
+
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = creal (x_lo);
+  bc_x.lVal_i = cimag (x_lo);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = creal (x_hi);
+  bc_x.rVal_i = cimag (x_hi);
+
+  *spline = create_NUBspline_1d_z (*x_grid, bc_x, data);
+}
+
+////////
+// 2D //
+////////
+// Creation wrapper (2D, single-precision real) exported under the
+// F77_FUNC_-mangled name.  For each of x and y the by-reference codes (cast
+// to bc_code) and values are packed into a BCtype_s (l* fields from *0_*,
+// r* fields from *1_*); both are passed with the dereferenced grids and data
+// to create_NUBspline_2d_s and the result is stored through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_s,FCREATE_NUBSPLINE_2D_S)
+  (NUgrid **x_grid, NUgrid **y_grid, 
+   int* x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int* y0_code, float *y0_val, int *y1_code, float *y1_val,
+   float *data, NUBspline_2d_s **spline)
+{
+  BCtype_s bc_x;
+  BCtype_s bc_y;
+
+  // x direction
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y direction
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  *spline = create_NUBspline_2d_s (*x_grid, *y_grid, bc_x, bc_y, data);
+}
+
+
+// Creation wrapper (2D, double-precision real) exported under the
+// F77_FUNC_-mangled name.  For each of x and y the by-reference codes (cast
+// to bc_code) and values are packed into a BCtype_d (l* fields from *0_*,
+// r* fields from *1_*); both are passed with the dereferenced grids and data
+// to create_NUBspline_2d_d and the result is stored through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_d,FCREATE_NUBSPLINE_2D_D)
+  (NUgrid **x_grid, NUgrid **y_grid,
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   double *data, NUBspline_2d_d **spline)
+{
+  BCtype_d bc_x;
+  BCtype_d bc_y;
+
+  // x direction
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y direction
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  *spline = create_NUBspline_2d_d (*x_grid, *y_grid, bc_x, bc_y, data);
+}
+
+// Creation wrapper (2D, single-precision complex) exported under the
+// F77_FUNC_-mangled name.  For each of x and y the complex boundary values
+// are split with crealf/cimagf into the _r/_i fields of a BCtype_c (l* from
+// *0_*, r* from *1_*); both are passed with the dereferenced grids and data
+// to create_NUBspline_2d_c and the result is stored through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_c,FCREATE_NUBSPLINE_2D_C)
+  (NUgrid **x_grid, NUgrid **y_grid,
+   int *x0_code, complex_float *x0_val, 
+   int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, 
+   int *y1_code, complex_float *y1_val,
+   complex_float *data, NUBspline_2d_c **spline)
+{
+  const complex_float x_lo = *x0_val, x_hi = *x1_val;
+  const complex_float y_lo = *y0_val, y_hi = *y1_val;
+  BCtype_c bc_x;
+  BCtype_c bc_y;
+
+  // x direction
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf (x_lo);
+  bc_x.lVal_i = cimagf (x_lo);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf (x_hi);
+  bc_x.rVal_i = cimagf (x_hi);
+
+  // y direction
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = crealf (y_lo);
+  bc_y.lVal_i = cimagf (y_lo);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = crealf (y_hi);
+  bc_y.rVal_i = cimagf (y_hi);
+
+  *spline = create_NUBspline_2d_c (*x_grid, *y_grid, bc_x, bc_y, data);
+}
+
+// Creation wrapper (2D, double-precision complex) exported under the
+// F77_FUNC_-mangled name.  For each of x and y the complex boundary values
+// are split with creal/cimag into the _r/_i fields of a BCtype_z (l* from
+// *0_*, r* from *1_*); both are passed with the dereferenced grids and data
+// to create_NUBspline_2d_z and the result is stored through spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_z,FCREATE_NUBSPLINE_2D_Z)
+  (NUgrid **x_grid, NUgrid **y_grid,
+   int *x0_code, complex_double *x0_val, 
+   int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, 
+   int *y1_code, complex_double *y1_val,
+   complex_double *data, NUBspline_2d_z **spline)
+{
+  const complex_double x_lo = *x0_val, x_hi = *x1_val;
+  const complex_double y_lo = *y0_val, y_hi = *y1_val;
+  BCtype_z bc_x;
+  BCtype_z bc_y;
+
+  // x direction
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = creal (x_lo);
+  bc_x.lVal_i = cimag (x_lo);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = creal (x_hi);
+  bc_x.rVal_i = cimag (x_hi);
+
+  // y direction
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = creal (y_lo);
+  bc_y.lVal_i = cimag (y_lo);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = creal (y_hi);
+  bc_y.rVal_i = cimag (y_hi);
+
+  *spline = create_NUBspline_2d_z (*x_grid, *y_grid, bc_x, bc_y, data);
+}
+
+////////
+// 3D //
+////////
+// Creation wrapper (3D, single-precision real) exported under the
+// F77_FUNC_-mangled name.  For each of x, y and z the by-reference codes
+// (cast to bc_code) and values are packed into a BCtype_s (l* fields from
+// *0_*, r* fields from *1_*); all three are passed with the dereferenced
+// grids and data to create_NUBspline_3d_s and the result is stored through
+// spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_s,FCREATE_NUBSPLINE_3D_S)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int* x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int* y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int* z0_code, float *z0_val, int *z1_code, float *z1_val,
+   float *data, NUBspline_3d_s **spline)
+{
+  BCtype_s bc_x;
+  BCtype_s bc_y;
+  BCtype_s bc_z;
+
+  // x direction
+  bc_x.lCode = (bc_code) *x0_code;
+  bc_x.lVal  = *x0_val;
+  bc_x.rCode = (bc_code) *x1_code;
+  bc_x.rVal  = *x1_val;
+
+  // y direction
+  bc_y.lCode = (bc_code) *y0_code;
+  bc_y.lVal  = *y0_val;
+  bc_y.rCode = (bc_code) *y1_code;
+  bc_y.rVal  = *y1_val;
+
+  // z direction
+  bc_z.lCode = (bc_code) *z0_code;
+  bc_z.lVal  = *z0_val;
+  bc_z.rCode = (bc_code) *z1_code;
+  bc_z.rVal  = *z1_val;
+
+  *spline = create_NUBspline_3d_s (*x_grid, *y_grid, *z_grid,
+                                   bc_x, bc_y, bc_z, data);
+}
+
+
+// Creation wrapper (3D, double-precision real) exported under the
+// F77_FUNC_-mangled name.  For each of x, y and z the by-reference codes
+// (cast to bc_code) and values are packed into a BCtype_d (l* fields from
+// *0_*, r* fields from *1_*); all three are passed with the dereferenced
+// grids and data to create_NUBspline_3d_d and the result is stored through
+// spline.
+//
+// NOTE: z0_val/z1_val are declared float* in this signature and in the
+// fnubspline.h prototype (the parameter line is identical to the one in
+// fcreate_nubspline_3d_s), yet they feed a BCtype_d exactly like the double*
+// x and y values.  The parameter types are left untouched so the definition
+// keeps matching the header; the values are read at double width below.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_d,FCREATE_NUBSPLINE_3D_D)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int* z0_code, float *z0_val, int *z1_code, float *z1_val,
+   double *data, NUBspline_3d_d **spline)
+{
+  // Read the z boundary values as doubles, the same width as x and y, so a
+  // double-precision caller's values are not reinterpreted as floats.
+  const double z_lo = *(const double *) z0_val;
+  const double z_hi = *(const double *) z1_val;
+  BCtype_d xBC, yBC, zBC;
+
+  xBC.lCode = (bc_code) *x0_code;
+  xBC.rCode = (bc_code) *x1_code;
+  xBC.lVal  = *x0_val;
+  xBC.rVal  = *x1_val;
+  yBC.lCode = (bc_code) *y0_code;
+  yBC.rCode = (bc_code) *y1_code;
+  yBC.lVal  = *y0_val;
+  yBC.rVal  = *y1_val;
+  zBC.lCode = (bc_code) *z0_code;
+  zBC.rCode = (bc_code) *z1_code;
+  zBC.lVal  = z_lo;
+  zBC.rVal  = z_hi;
+
+  *spline = create_NUBspline_3d_d (*x_grid, *y_grid, *z_grid,
+                                   xBC, yBC, zBC, data);
+}
+
+// Creation wrapper (3D, single-precision complex) exported under the
+// F77_FUNC_-mangled name.  For each of x, y and z the complex boundary
+// values are split with crealf/cimagf into the _r/_i fields of a BCtype_c
+// (l* from *0_*, r* from *1_*); all three are passed with the dereferenced
+// grids and data to create_NUBspline_3d_c and the result is stored through
+// spline.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_c,FCREATE_NUBSPLINE_3D_C)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int *x0_code, complex_float *x0_val, 
+   int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, 
+   int *y1_code, complex_float *y1_val,
+   int *z0_code, complex_float *z0_val, 
+   int *z1_code, complex_float *z1_val,
+   complex_float *data, NUBspline_3d_c **spline)
+{
+  const complex_float x_lo = *x0_val, x_hi = *x1_val;
+  const complex_float y_lo = *y0_val, y_hi = *y1_val;
+  const complex_float z_lo = *z0_val, z_hi = *z1_val;
+  BCtype_c bc_x;
+  BCtype_c bc_y;
+  BCtype_c bc_z;
+
+  // x direction
+  bc_x.lCode  = (bc_code) *x0_code;
+  bc_x.lVal_r = crealf (x_lo);
+  bc_x.lVal_i = cimagf (x_lo);
+  bc_x.rCode  = (bc_code) *x1_code;
+  bc_x.rVal_r = crealf (x_hi);
+  bc_x.rVal_i = cimagf (x_hi);
+
+  // y direction
+  bc_y.lCode  = (bc_code) *y0_code;
+  bc_y.lVal_r = crealf (y_lo);
+  bc_y.lVal_i = cimagf (y_lo);
+  bc_y.rCode  = (bc_code) *y1_code;
+  bc_y.rVal_r = crealf (y_hi);
+  bc_y.rVal_i = cimagf (y_hi);
+
+  // z direction
+  bc_z.lCode  = (bc_code) *z0_code;
+  bc_z.lVal_r = crealf (z_lo);
+  bc_z.lVal_i = cimagf (z_lo);
+  bc_z.rCode  = (bc_code) *z1_code;
+  bc_z.rVal_r = crealf (z_hi);
+  bc_z.rVal_i = cimagf (z_hi);
+
+  *spline = create_NUBspline_3d_c (*x_grid, *y_grid, *z_grid,
+                                   bc_x, bc_y, bc_z, data);
+}
+
+// Creation wrapper (3D, double-precision complex) exported under the
+// F77_FUNC_-mangled name.  For each of x, y and z the complex boundary
+// values are split with creal/cimag into the _r/_i fields of a BCtype_z
+// (l* from *0_*, r* from *1_*); all three are passed with the dereferenced
+// grids and data to create_NUBspline_3d_z and the result is stored through
+// spline.
+//
+// NOTE: z0_val/z1_val are declared complex_float* in this signature and in
+// the fnubspline.h prototype (the parameter lines are identical to those in
+// fcreate_nubspline_3d_c), yet they feed a BCtype_z exactly like the
+// complex_double* x and y values.  The parameter types are left untouched so
+// the definition keeps matching the header; the values are read at
+// complex_double width below.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_z,FCREATE_NUBSPLINE_3D_Z)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int *x0_code, complex_double *x0_val, 
+   int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, 
+   int *y1_code, complex_double *y1_val,
+   int *z0_code, complex_float *z0_val, 
+   int *z1_code, complex_float *z1_val,
+   complex_double *data, NUBspline_3d_z **spline)
+{
+  // Read the z boundary values as complex_double, the same width as x and y,
+  // so a double-precision caller's values are not reinterpreted as
+  // single-precision complex numbers.
+  const complex_double z_lo = *(const complex_double *) z0_val;
+  const complex_double z_hi = *(const complex_double *) z1_val;
+  BCtype_z xBC, yBC, zBC;
+
+  xBC.lCode = (bc_code) *x0_code;
+  xBC.rCode = (bc_code) *x1_code;
+  xBC.lVal_r  = creal(*x0_val);
+  xBC.lVal_i  = cimag(*x0_val);
+  xBC.rVal_r  = creal(*x1_val);
+  xBC.rVal_i  = cimag(*x1_val);
+
+  yBC.lCode = (bc_code) *y0_code;
+  yBC.rCode = (bc_code) *y1_code;
+  yBC.lVal_r  = creal(*y0_val);
+  yBC.lVal_i  = cimag(*y0_val);
+  yBC.rVal_r  = creal(*y1_val);
+  yBC.rVal_i  = cimag(*y1_val);
+
+  zBC.lCode = (bc_code) *z0_code;
+  zBC.rCode = (bc_code) *z1_code;
+  zBC.lVal_r  = creal(z_lo);
+  zBC.lVal_i  = cimag(z_lo);
+  zBC.rVal_r  = creal(z_hi);
+  zBC.rVal_i  = cimag(z_hi);
+
+  *spline = create_NUBspline_3d_z (*x_grid, *y_grid, *z_grid,
+                                   xBC, yBC, zBC, data);
+}
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////           Nonuniform spline evaluation routines              ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+//////////////////////////////
+// 1D single-precision real //
+//////////////////////////////
+// 1D single-precision real, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_s, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s,FEVAL_NUBSPLINE_1D_S)
+  (NUBspline_1d_s **spline, double *x, float *val)
+{
+  NUBspline_1d_s *sp = *spline;
+
+  eval_NUBspline_1d_s (sp, *x, val);
+}
+
+// 1D single-precision real, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_s_vg, passing val and grad through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s_vg,FEVAL_NUBSPLINE_1D_S_VG)
+  (NUBspline_1d_s **spline, double *x, float *val, float *grad)
+{
+  NUBspline_1d_s *sp = *spline;
+
+  eval_NUBspline_1d_s_vg (sp, *x, val, grad);
+}
+
+// 1D single-precision real, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_s_vgl, passing val, grad and lapl through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s_vgl,FEVAL_NUBSPLINE_1D_S_VGL)
+  (NUBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *lapl)
+{
+  NUBspline_1d_s *sp = *spline;
+
+  eval_NUBspline_1d_s_vgl (sp, *x, val, grad, lapl);
+}
+
+// 1D single-precision real, value + gradient + Hessian.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_s_vgh, passing val, grad and hess through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s_vgh,FEVAL_NUBSPLINE_1D_S_VGH)
+  (NUBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *hess)
+{
+  NUBspline_1d_s *sp = *spline;
+
+  eval_NUBspline_1d_s_vgh (sp, *x, val, grad, hess);
+}
+
+//////////////////////////////
+// 1D double-precision real //
+//////////////////////////////
+// 1D double-precision real, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_d, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d,FEVAL_NUBSPLINE_1D_D)
+  (NUBspline_1d_d **spline, double *x, double *val)
+{
+  NUBspline_1d_d *sp = *spline;
+
+  eval_NUBspline_1d_d (sp, *x, val);
+}
+
+// 1D double-precision real, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_d_vg, passing val and grad through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d_vg,FEVAL_NUBSPLINE_1D_D_VG)
+  (NUBspline_1d_d **spline, double *x, 
+   double *val, double *grad)
+{
+  NUBspline_1d_d *sp = *spline;
+
+  eval_NUBspline_1d_d_vg (sp, *x, val, grad);
+}
+
+// 1D double-precision real, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_d_vgl, passing val, grad and lapl through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d_vgl,FEVAL_NUBSPLINE_1D_D_VGL)
+  (NUBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *lapl)
+{
+  NUBspline_1d_d *sp = *spline;
+
+  eval_NUBspline_1d_d_vgl (sp, *x, val, grad, lapl);
+}
+
+// 1D double-precision real, value + gradient + Hessian.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_d_vgh, passing val, grad and hess through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d_vgh,FEVAL_NUBSPLINE_1D_D_VGH)
+  (NUBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *hess)
+{
+  NUBspline_1d_d *sp = *spline;
+
+  eval_NUBspline_1d_d_vgh (sp, *x, val, grad, hess);
+}
+
+/////////////////////////////////
+// 1D single-precision complex //
+/////////////////////////////////
+// 1D single-precision complex, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_c, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c,FEVAL_NUBSPLINE_1D_C)
+  (NUBspline_1d_c **spline, double *x, complex_float *val)
+{
+  NUBspline_1d_c *sp = *spline;
+
+  eval_NUBspline_1d_c (sp, *x, val);
+}
+
+// 1D single-precision complex, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_c_vg, passing val and grad through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c_vg,FEVAL_NUBSPLINE_1D_C_VG)
+  (NUBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad)
+{
+  NUBspline_1d_c *sp = *spline;
+
+  eval_NUBspline_1d_c_vg (sp, *x, val, grad);
+}
+
+// 1D single-precision complex, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_c_vgl, passing val, grad and lapl through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c_vgl,FEVAL_NUBSPLINE_1D_C_VGL)
+  (NUBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  NUBspline_1d_c *sp = *spline;
+
+  eval_NUBspline_1d_c_vgl (sp, *x, val, grad, lapl);
+}
+
+// 1D single-precision complex, value + gradient + Hessian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_c_vgh, passing val, grad and hess through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c_vgh,FEVAL_NUBSPLINE_1D_C_VGH)
+  (NUBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  NUBspline_1d_c *sp = *spline;
+
+  eval_NUBspline_1d_c_vgh (sp, *x, val, grad, hess);
+}
+
+/////////////////////////////////
+// 1D double-precision complex //
+/////////////////////////////////
+// 1D double-precision complex, value only.  Dereferences the spline handle
+// and x and calls eval_NUBspline_1d_z, passing val through unchanged.
+// Exported as feval_nubspline_1d_z so the name follows the same
+// feval_nubspline_* pattern as every other evaluation wrapper in this file.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_z,FEVAL_NUBSPLINE_1D_Z)
+  (NUBspline_1d_z **spline, double *x, complex_double *val)
+{
+  eval_NUBspline_1d_z (*spline, *x, val);
+}
+
+// Same routine under the misspelled name ("nnubspline", doubled n) that
+// fnubspline.h declares; kept so code already linked against that symbol
+// still resolves.
+CFUNC void
+F77_FUNC_(feval_nnubspline_1d_z,FEVAL_NNUBSPLINE_1D_Z)
+  (NUBspline_1d_z **spline, double *x, complex_double *val)
+{
+  eval_NUBspline_1d_z (*spline, *x, val);
+}
+
+// 1D double-precision complex, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_z_vg, passing val and grad through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_z_vg,FEVAL_NUBSPLINE_1D_Z_VG)
+  (NUBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad)
+{
+  NUBspline_1d_z *sp = *spline;
+
+  eval_NUBspline_1d_z_vg (sp, *x, val, grad);
+}
+ 
+// 1D double-precision complex, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_z_vgl, passing val, grad and lapl through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_z_vgl,FEVAL_NUBSPLINE_1D_Z_VGL)
+  (NUBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  NUBspline_1d_z *sp = *spline;
+
+  eval_NUBspline_1d_z_vgl (sp, *x, val, grad, lapl);
+}
+
+// 1D double-precision complex, value + gradient + Hessian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and x and calls
+// eval_NUBspline_1d_z_vgh, passing val, grad and hess through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_z_vgh,FEVAL_NUBSPLINE_1D_Z_VGH)
+  (NUBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  NUBspline_1d_z *sp = *spline;
+
+  eval_NUBspline_1d_z_vgh (sp, *x, val, grad, hess);
+}
+
+//////////////////////////////
+// 2D single-precision real //
+//////////////////////////////
+// 2D single-precision real, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_s, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s,FEVAL_NUBSPLINE_2D_S)
+  (NUBspline_2d_s **spline, double *x, double *y, float *val)
+{
+  NUBspline_2d_s *sp = *spline;
+
+  eval_NUBspline_2d_s (sp, *x, *y, val);
+}
+
+// 2D single-precision real, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_s_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s_vg,FEVAL_NUBSPLINE_2D_S_VG)
+  (NUBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad)
+{
+  NUBspline_2d_s *sp = *spline;
+
+  eval_NUBspline_2d_s_vg (sp, *x, *y, val, grad);
+}
+
+// 2D single-precision real, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_s_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s_vgl,FEVAL_NUBSPLINE_2D_S_VGL)
+  (NUBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float* lapl)
+{
+  NUBspline_2d_s *sp = *spline;
+
+  eval_NUBspline_2d_s_vgl (sp, *x, *y, val, grad, lapl);
+}
+
+// 2D single-precision real, value + gradient + Hessian.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_s_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s_vgh,FEVAL_NUBSPLINE_2D_S_VGH)
+  (NUBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float *hess)
+{
+  NUBspline_2d_s *sp = *spline;
+
+  eval_NUBspline_2d_s_vgh (sp, *x, *y, val, grad, hess);
+}
+
+//////////////////////////////
+// 2D double-precision real //
+//////////////////////////////
+// 2D double-precision real, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_d, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d,FEVAL_NUBSPLINE_2D_D)
+  (NUBspline_2d_d **spline, double *x, double *y, double *val)
+{
+  NUBspline_2d_d *sp = *spline;
+
+  eval_NUBspline_2d_d (sp, *x, *y, val);
+}
+
+// 2D double-precision real, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_d_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d_vg,FEVAL_NUBSPLINE_2D_D_VG)
+  (NUBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad)
+{
+  NUBspline_2d_d *sp = *spline;
+
+  eval_NUBspline_2d_d_vg (sp, *x, *y, val, grad);
+}
+
+// 2D double-precision real, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_d_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d_vgl,FEVAL_NUBSPLINE_2D_D_VGL)
+  (NUBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *lapl)
+{
+  NUBspline_2d_d *sp = *spline;
+
+  eval_NUBspline_2d_d_vgl (sp, *x, *y, val, grad, lapl);
+}
+
+// 2D double-precision real, value + gradient + Hessian.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_d_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d_vgh,FEVAL_NUBSPLINE_2D_D_VGH)
+  (NUBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *hess)
+{
+  // Forward to the _vgh evaluator, as the _s, _c and _z Hessian wrappers do.
+  // The _vgl evaluator is the one that receives a Laplacian pointer, so it
+  // must not be handed the hess argument.
+  eval_NUBspline_2d_d_vgh (*spline, *x, *y, val, grad, hess);
+}
+
+/////////////////////////////////
+// 2D single-precision complex //
+/////////////////////////////////
+// 2D single-precision complex, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_c, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c,FEVAL_NUBSPLINE_2D_C)
+  (NUBspline_2d_c **spline, double *x, double *y, complex_float *val)
+{
+  NUBspline_2d_c *sp = *spline;
+
+  eval_NUBspline_2d_c (sp, *x, *y, val);
+}
+
+// 2D single-precision complex, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_c_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c_vg,FEVAL_NUBSPLINE_2D_C_VG)
+  (NUBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad)
+{
+  NUBspline_2d_c *sp = *spline;
+
+  eval_NUBspline_2d_c_vg (sp, *x, *y, val, grad);
+}
+
+// 2D single-precision complex, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_c_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c_vgl,FEVAL_NUBSPLINE_2D_C_VGL)
+  (NUBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  NUBspline_2d_c *sp = *spline;
+
+  eval_NUBspline_2d_c_vgl (sp, *x, *y, val, grad, lapl);
+}
+
+// 2D single-precision complex, value + gradient + Hessian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_c_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c_vgh,FEVAL_NUBSPLINE_2D_C_VGH)
+  (NUBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  NUBspline_2d_c *sp = *spline;
+
+  eval_NUBspline_2d_c_vgh (sp, *x, *y, val, grad, hess);
+}
+
+/////////////////////////////////
+// 2D double-precision complex //
+/////////////////////////////////
+// 2D double-precision complex, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_z, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z,FEVAL_NUBSPLINE_2D_Z)
+  (NUBspline_2d_z **spline, double *x, double *y, complex_double *val)
+{
+  NUBspline_2d_z *sp = *spline;
+
+  eval_NUBspline_2d_z (sp, *x, *y, val);
+}
+
+// 2D double-precision complex, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_z_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z_vg,FEVAL_NUBSPLINE_2D_Z_VG)
+  (NUBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad)
+{
+  NUBspline_2d_z *sp = *spline;
+
+  eval_NUBspline_2d_z_vg (sp, *x, *y, val, grad);
+}
+
+// 2D double-precision complex, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_z_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z_vgl,FEVAL_NUBSPLINE_2D_Z_VGL)
+  (NUBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  NUBspline_2d_z *sp = *spline;
+
+  eval_NUBspline_2d_z_vgl (sp, *x, *y, val, grad, lapl);
+}
+
+// 2D double-precision complex, value + gradient + Hessian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y
+// coordinates and calls eval_NUBspline_2d_z_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z_vgh,FEVAL_NUBSPLINE_2D_Z_VGH)
+  (NUBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  NUBspline_2d_z *sp = *spline;
+
+  eval_NUBspline_2d_z_vgh (sp, *x, *y, val, grad, hess);
+}
+
+
+
+//////////////////////////////
+// 3D single-precision real //
+//////////////////////////////
+// 3D single-precision real, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_s, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s,FEVAL_NUBSPLINE_3D_S)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val)
+{
+  NUBspline_3d_s *sp = *spline;
+
+  eval_NUBspline_3d_s (sp, *x, *y, *z, val);
+}
+
+// 3D single-precision real, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_s_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s_vg,FEVAL_NUBSPLINE_3D_S_VG)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad)
+{
+  NUBspline_3d_s *sp = *spline;
+
+  eval_NUBspline_3d_s_vg (sp, *x, *y, *z, val, grad);
+}
+
+// 3D single-precision real, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_s_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s_vgl,FEVAL_NUBSPLINE_3D_S_VGL)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad, float* lapl)
+{
+  NUBspline_3d_s *sp = *spline;
+
+  eval_NUBspline_3d_s_vgl (sp, *x, *y, *z, val, grad, lapl);
+}
+
+// 3D single-precision real, value + gradient + Hessian.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_s_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s_vgh,FEVAL_NUBSPLINE_3D_S_VGH)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z, 
+   float *val, float *grad, float *hess)
+{
+  NUBspline_3d_s *sp = *spline;
+
+  eval_NUBspline_3d_s_vgh (sp, *x, *y, *z, val, grad, hess);
+}
+
+//////////////////////////////
+// 3D double-precision real //
+//////////////////////////////
+// 3D double-precision real, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_d, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d,FEVAL_NUBSPLINE_3D_D)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val)
+{
+  NUBspline_3d_d *sp = *spline;
+
+  eval_NUBspline_3d_d (sp, *x, *y, *z, val);
+}
+
+// 3D double-precision real, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_d_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d_vg,FEVAL_NUBSPLINE_3D_D_VG)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad)
+{
+  NUBspline_3d_d *sp = *spline;
+
+  eval_NUBspline_3d_d_vg (sp, *x, *y, *z, val, grad);
+}
+
+// 3D double-precision real, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_d_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d_vgl,FEVAL_NUBSPLINE_3D_D_VGL)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,  
+   double *val, double *grad, double *lapl)
+{
+  NUBspline_3d_d *sp = *spline;
+
+  eval_NUBspline_3d_d_vgl (sp, *x, *y, *z, val, grad, lapl);
+}
+
+// 3D double-precision real, value + gradient + Hessian.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_d_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d_vgh,FEVAL_NUBSPLINE_3D_D_VGH)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad, double *hess)
+{
+  // Forward to the _vgh evaluator, as the _s, _c and _z Hessian wrappers do.
+  // The _vgl evaluator is the one that receives a Laplacian pointer, so it
+  // must not be handed the hess argument.
+  eval_NUBspline_3d_d_vgh (*spline, *x, *y, *z, val, grad, hess);
+}
+
+/////////////////////////////////
+// 3D single-precision complex //
+/////////////////////////////////
+// 3D single-precision complex, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_c, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c,FEVAL_NUBSPLINE_3D_C)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val)
+{
+  NUBspline_3d_c *sp = *spline;
+
+  eval_NUBspline_3d_c (sp, *x, *y, *z, val);
+}
+
+// 3D single-precision complex, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_c_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c_vg,FEVAL_NUBSPLINE_3D_C_VG)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z, 
+   complex_float *val, complex_float *grad)
+{
+  NUBspline_3d_c *sp = *spline;
+
+  eval_NUBspline_3d_c_vg (sp, *x, *y, *z, val, grad);
+}
+
+// 3D single-precision complex, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_c_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c_vgl,FEVAL_NUBSPLINE_3D_C_VGL)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *lapl)
+{
+  NUBspline_3d_c *sp = *spline;
+
+  eval_NUBspline_3d_c_vgl (sp, *x, *y, *z, val, grad, lapl);
+}
+
+// 3D single-precision complex, value + gradient + Hessian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_c_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c_vgh,FEVAL_NUBSPLINE_3D_C_VGH)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *hess)
+{
+  NUBspline_3d_c *sp = *spline;
+
+  eval_NUBspline_3d_c_vgh (sp, *x, *y, *z, val, grad, hess);
+}
+
+/////////////////////////////////
+// 3D double-precision complex //
+/////////////////////////////////
+// 3D double-precision complex, value only.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_z, passing val through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z,FEVAL_NUBSPLINE_3D_Z)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val)
+{
+  NUBspline_3d_z *sp = *spline;
+
+  eval_NUBspline_3d_z (sp, *x, *y, *z, val);
+}
+
+// 3D double-precision complex, value + gradient.  Exported under the
+// F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_z_vg, passing val and grad through
+// unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z_vg,FEVAL_NUBSPLINE_3D_Z_VG)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z, 
+   complex_double *val, complex_double *grad)
+{
+  NUBspline_3d_z *sp = *spline;
+
+  eval_NUBspline_3d_z_vg (sp, *x, *y, *z, val, grad);
+}
+
+// 3D double-precision complex, value + gradient + Laplacian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_z_vgl, passing val, grad and lapl
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z_vgl,FEVAL_NUBSPLINE_3D_Z_VGL)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *lapl)
+{
+  NUBspline_3d_z *sp = *spline;
+
+  eval_NUBspline_3d_z_vgl (sp, *x, *y, *z, val, grad, lapl);
+}
+
+// 3D double-precision complex, value + gradient + Hessian.  Exported under
+// the F77_FUNC_-mangled name: dereferences the spline handle and the x/y/z
+// coordinates and calls eval_NUBspline_3d_z_vgh, passing val, grad and hess
+// through unchanged.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z_vgh,FEVAL_NUBSPLINE_3D_Z_VGH)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *hess)
+{
+  NUBspline_3d_z *sp = *spline;
+
+  eval_NUBspline_3d_z_vgh (sp, *x, *y, *z, val, grad, hess);
+}
+
--- a/src/einspline/fnubspline.h
+++ b/src/einspline/fnubspline.h
@ -0,0 +1,418 @@
+#ifndef F_NUBSPLINE_H
+#define F_NUBSPLINE_H
+
+#include "config.h"
+#include "nugrid.h"
+#include "nubspline_structs.h"
+
+#ifdef __cplusplus
+#define CFUNC extern "C" /* Avoid name mangling in C++ */
+#else
+#define CFUNC
+#endif
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////                    Grid Creation routines                    ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+CFUNC void 
+F77_FUNC_(fcreate_general_grid,FCREATE_GENERAL_GRID)
+  (double *points, int *num_points, NUgrid **grid);
+
+CFUNC void 
+F77_FUNC_(fcreate_center_grid,FCREATE_CENTER_GRID)
+  (double *start, double *end, double *ratio,
+   int *num_points, NUgrid **grid);
+
+CFUNC void
+F77_FUNC_(fdestroy_grid,FDESTROY_GRID)
+  (NUgrid **grid);
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////            Nonuniform spline creation routines               ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+////////
+// 1D //
+////////
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_s,FCREATE_NUBSPLINE_1D_S)
+  (NUgrid **x_grid, 
+   int* x0_code, float *x0_val, int *x1_code, float *x1_val,
+   float *data, NUBspline_1d_s **spline);
+
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_d,FCREATE_NUBSPLINE_1D_D)
+  (NUgrid **x_grid, 
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   double *data, NUBspline_1d_d **spline);
+
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_c,FCREATE_NUBSPLINE_1D_C)
+  (NUgrid **x_grid, 
+   int *x0_code, complex_float *x0_val, 
+   int *x1_code, complex_float *x1_val,
+   complex_float *data, NUBspline_1d_c **spline);
+
+CFUNC void
+F77_FUNC_(fcreate_nubspline_1d_z,FCREATE_NUBSPLINE_1D_Z)
+  (NUgrid **x_grid, 
+   int *x0_code, complex_double *x0_val, 
+   int *x1_code, complex_double *x1_val,
+   complex_double *data, NUBspline_1d_z **spline);
+
+////////
+// 2D //
+////////
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_s,FCREATE_NUBSPLINE_2D_S)
+  (NUgrid **x_grid, NUgrid **y_grid, 
+   int* x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int* y0_code, float *y0_val, int *y1_code, float *y1_val,
+   float *data, NUBspline_2d_s **spline);
+
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_d,FCREATE_NUBSPLINE_2D_D)
+  (NUgrid **x_grid, NUgrid **y_grid,
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   double *data, NUBspline_2d_d **spline);
+
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_c,FCREATE_NUBSPLINE_2D_C)
+  (NUgrid **x_grid, NUgrid **y_grid,
+   int *x0_code, complex_float *x0_val, 
+   int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, 
+   int *y1_code, complex_float *y1_val,
+   complex_float *data, NUBspline_2d_c **spline);
+
+CFUNC void
+F77_FUNC_(fcreate_nubspline_2d_z,FCREATE_NUBSPLINE_2D_Z)
+  (NUgrid **x_grid, NUgrid **y_grid,
+   int *x0_code, complex_double *x0_val, 
+   int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, 
+   int *y1_code, complex_double *y1_val,
+   complex_double *data, NUBspline_2d_z **spline);
+
+////////
+// 3D //
+////////
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_s,FCREATE_NUBSPLINE_3D_S)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int* x0_code, float *x0_val, int *x1_code, float *x1_val,
+   int* y0_code, float *y0_val, int *y1_code, float *y1_val,
+   int* z0_code, float *z0_val, int *z1_code, float *z1_val,
+   float *data, NUBspline_3d_s **spline);
+
+// Prototype of the 3D double-precision creation wrapper defined in
+// fnubspline.c.
+// NOTE(review): z0_val/z1_val are typed float* while the x and y boundary
+// values are double*; the z parameter line is identical to the one in the
+// fcreate_nubspline_3d_s prototype above and looks like a copy-paste slip.
+// Confirm what callers pass before changing it, and keep the definition in
+// fnubspline.c in step.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_d,FCREATE_NUBSPLINE_3D_D)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int *x0_code, double *x0_val, int *x1_code, double *x1_val,
+   int *y0_code, double *y0_val, int *y1_code, double *y1_val,
+   int* z0_code, float *z0_val, int *z1_code, float *z1_val,
+   double *data, NUBspline_3d_d **spline);
+
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_c,FCREATE_NUBSPLINE_3D_C)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int *x0_code, complex_float *x0_val, 
+   int *x1_code, complex_float *x1_val,
+   int *y0_code, complex_float *y0_val, 
+   int *y1_code, complex_float *y1_val,
+   int *z0_code, complex_float *z0_val, 
+   int *z1_code, complex_float *z1_val,
+   complex_float *data, NUBspline_3d_c **spline);
+
+// Prototype of the 3D double-precision complex creation wrapper defined in
+// fnubspline.c.
+// NOTE(review): z0_val/z1_val are typed complex_float* while the x and y
+// boundary values are complex_double*; the z parameter lines are identical to
+// those in the fcreate_nubspline_3d_c prototype above and look like a
+// copy-paste slip.  Confirm what callers pass before changing it, and keep
+// the definition in fnubspline.c in step.
+CFUNC void
+F77_FUNC_(fcreate_nubspline_3d_z,FCREATE_NUBSPLINE_3D_Z)
+  (NUgrid **x_grid, NUgrid **y_grid, NUgrid **z_grid,
+   int *x0_code, complex_double *x0_val, 
+   int *x1_code, complex_double *x1_val,
+   int *y0_code, complex_double *y0_val, 
+   int *y1_code, complex_double *y1_val,
+   int *z0_code, complex_float *z0_val, 
+   int *z1_code, complex_float *z1_val,
+   complex_double *data, NUBspline_3d_z **spline);
+
+
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+////           Nonuniform spline evaluation routines              ////
+//////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////
+
+//////////////////////////////
+// 1D single-precision real //
+//////////////////////////////
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s,FEVAL_NUBSPLINE_1D_S)
+  (NUBspline_1d_s **spline, double *x, float *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s_vg,FEVAL_NUBSPLINE_1D_S_VG)
+  (NUBspline_1d_s **spline, double *x, float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s_vgl,FEVAL_NUBSPLINE_1D_S_VGL)
+  (NUBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_s_vgh,FEVAL_NUBSPLINE_1D_S_VGH)
+  (NUBspline_1d_s **spline, double *x, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 1D double-precision real //
+//////////////////////////////
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d,FEVAL_NUBSPLINE_1D_D)
+  (NUBspline_1d_d **spline, double *x, double *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d_vg,FEVAL_NUBSPLINE_1D_D_VG)
+  (NUBspline_1d_d **spline, double *x, 
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d_vgl,FEVAL_NUBSPLINE_1D_D_VGL)
+  (NUBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_d_vgh,FEVAL_NUBSPLINE_1D_D_VGH)
+  (NUBspline_1d_d **spline, double *x, 
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 1D single-precision complex //
+/////////////////////////////////
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c,FEVAL_NUBSPLINE_1D_C)
+  (NUBspline_1d_c **spline, double *x, complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c_vg,FEVAL_NUBSPLINE_1D_C_VG)
+  (NUBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c_vgl,FEVAL_NUBSPLINE_1D_C_VGL)
+  (NUBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_c_vgh,FEVAL_NUBSPLINE_1D_C_VGH)
+  (NUBspline_1d_c **spline, double *x, 
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 1D double-precision complex //
+/////////////////////////////////
+// Prototype of the value-only 1D double-precision complex evaluation wrapper.
+// NOTE(review): the exported name is spelled "nnubspline" (doubled n), unlike
+// every sibling feval_nubspline_* prototype in this header; fnubspline.c
+// defines a routine under this same spelling.
+CFUNC void
+F77_FUNC_(feval_nnubspline_1d_z,FEVAL_NNUBSPLINE_1D_Z)
+  (NUBspline_1d_z **spline, double *x, complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_z_vg,FEVAL_NUBSPLINE_1D_Z_VG)
+  (NUBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad);
+ 
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_z_vgl,FEVAL_NUBSPLINE_1D_Z_VGL)
+  (NUBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_1d_z_vgh,FEVAL_NUBSPLINE_1D_Z_VGH)
+  (NUBspline_1d_z **spline, double *x, 
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+//////////////////////////////
+// 2D single-precision real //
+//////////////////////////////
+// Evaluators for 2D single-precision real NUBsplines, declared through
+// F77_FUNC_.  The evaluation point (x, y) is passed as pointers to double;
+// outputs are written through float pointers.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s,FEVAL_NUBSPLINE_2D_S)
+  (NUBspline_2d_s **spline, double *x, double *y, float *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s_vg,FEVAL_NUBSPLINE_2D_S_VG)
+  (NUBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s_vgl,FEVAL_NUBSPLINE_2D_S_VGL)
+  (NUBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float* lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_s_vgh,FEVAL_NUBSPLINE_2D_S_VGH)
+  (NUBspline_2d_s **spline, double *x, double *y, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 2D double-precision real //
+//////////////////////////////
+// Evaluators for 2D double-precision real NUBsplines, declared through
+// F77_FUNC_.  All inputs and outputs are pointers to double.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d,FEVAL_NUBSPLINE_2D_D)
+  (NUBspline_2d_d **spline, double *x, double *y, double *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d_vg,FEVAL_NUBSPLINE_2D_D_VG)
+  (NUBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d_vgl,FEVAL_NUBSPLINE_2D_D_VGL)
+  (NUBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_d_vgh,FEVAL_NUBSPLINE_2D_D_VGH)
+  (NUBspline_2d_d **spline, double *x, double *y, 
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 2D single-precision complex //
+/////////////////////////////////
+// Evaluators for 2D single-precision complex NUBsplines, declared through
+// F77_FUNC_.  The point (x, y) is passed as pointers to double; outputs are
+// written through complex_float pointers.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c,FEVAL_NUBSPLINE_2D_C)
+  (NUBspline_2d_c **spline, double *x, double *y, complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c_vg,FEVAL_NUBSPLINE_2D_C_VG)
+  (NUBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c_vgl,FEVAL_NUBSPLINE_2D_C_VGL)
+  (NUBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_c_vgh,FEVAL_NUBSPLINE_2D_C_VGH)
+  (NUBspline_2d_c **spline, double *x, double *y, 
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 2D double-precision complex //
+/////////////////////////////////
+// Evaluators for 2D double-precision complex NUBsplines, declared through
+// F77_FUNC_.  The point (x, y) is passed as pointers to double; outputs are
+// written through complex_double pointers.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z,FEVAL_NUBSPLINE_2D_Z)
+  (NUBspline_2d_z **spline, double *x, double *y, complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z_vg,FEVAL_NUBSPLINE_2D_Z_VG)
+  (NUBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z_vgl,FEVAL_NUBSPLINE_2D_Z_VGL)
+  (NUBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_2d_z_vgh,FEVAL_NUBSPLINE_2D_Z_VGH)
+  (NUBspline_2d_z **spline, double *x, double *y, 
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+//////////////////////////////
+// 3D single-precision real //
+//////////////////////////////
+// Evaluators for 3D single-precision real NUBsplines, declared through
+// F77_FUNC_.  The point (x, y, z) is passed as pointers to double; outputs
+// are written through float pointers.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s,FEVAL_NUBSPLINE_3D_S)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s_vg,FEVAL_NUBSPLINE_3D_S_VG)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s_vgl,FEVAL_NUBSPLINE_3D_S_VGL)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z,
+   float *val, float *grad, float* lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_s_vgh,FEVAL_NUBSPLINE_3D_S_VGH)
+  (NUBspline_3d_s **spline, double *x, double *y, double *z, 
+   float *val, float *grad, float *hess);
+
+//////////////////////////////
+// 3D double-precision real //
+//////////////////////////////
+// Evaluators for 3D double-precision real NUBsplines, declared through
+// F77_FUNC_.  All inputs and outputs are pointers to double.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d,FEVAL_NUBSPLINE_3D_D)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d_vg,FEVAL_NUBSPLINE_3D_D_VG)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d_vgl,FEVAL_NUBSPLINE_3D_D_VGL)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,  
+   double *val, double *grad, double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_d_vgh,FEVAL_NUBSPLINE_3D_D_VGH)
+  (NUBspline_3d_d **spline, double *x, double *y, double *z,
+   double *val, double *grad, double *hess);
+
+/////////////////////////////////
+// 3D single-precision complex //
+/////////////////////////////////
+// Evaluators for 3D single-precision complex NUBsplines, declared through
+// F77_FUNC_.  The point (x, y, z) is passed as pointers to double; outputs
+// are written through complex_float pointers.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c,FEVAL_NUBSPLINE_3D_C)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c_vg,FEVAL_NUBSPLINE_3D_C_VG)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z, 
+   complex_float *val, complex_float *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c_vgl,FEVAL_NUBSPLINE_3D_C_VGL)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_c_vgh,FEVAL_NUBSPLINE_3D_C_VGH)
+  (NUBspline_3d_c **spline, double *x, double *y, double *z,
+   complex_float *val, complex_float *grad, complex_float *hess);
+
+/////////////////////////////////
+// 3D double-precision complex //
+/////////////////////////////////
+// Evaluators for 3D double-precision complex NUBsplines, declared through
+// F77_FUNC_.  The point (x, y, z) is passed as pointers to double; outputs
+// are written through complex_double pointers.
+// Suffixes: (none) = val only, _vg = val + grad, _vgl = val + grad + lapl,
+// _vgh = val + grad + hess.
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z,FEVAL_NUBSPLINE_3D_Z)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z_vg,FEVAL_NUBSPLINE_3D_Z_VG)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z, 
+   complex_double *val, complex_double *grad);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z_vgl,FEVAL_NUBSPLINE_3D_Z_VGL)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *lapl);
+
+CFUNC void
+F77_FUNC_(feval_nubspline_3d_z_vgh,FEVAL_NUBSPLINE_3D_Z_VGH)
+  (NUBspline_3d_z **spline, double *x, double *y, double *z,
+   complex_double *val, complex_double *grad, complex_double *hess);
+
+#endif
--- a/src/einspline/multi_bspline.h
+++ b/src/einspline/multi_bspline.h
@ -0,0 +1,40 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef MULTI_BSPLINE_H
+#define MULTI_BSPLINE_H
+
+#include "bspline_base.h"
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////           Bspline structure definitions            ////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+#include "multi_bspline_structs.h"
+
+// Currently, some of the single-precision routines use SSE2 instructions
+#include "multi_bspline_eval_s.h"
+#include "multi_bspline_eval_c.h"
+#include "multi_bspline_eval_d.h"
+#include "multi_bspline_eval_z.h"
+
+#include "bspline_create.h"
+#include "multi_bspline_create.h"
+#endif
--- a/src/einspline/multi_bspline_create.c
+++ b/src/einspline/multi_bspline_create.c
--- a/src/einspline/multi_bspline_create.h
+++ b/src/einspline/multi_bspline_create.h
@ -0,0 +1,177 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef MULTI_BSPLINE_CREATE_H
+#define MULTI_BSPLINE_CREATE_H
+
+#include "bspline_base.h"
+#include "multi_bspline_structs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////              Spline creation functions             ////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+
+/////////////////////////////////////
+// Uniform, single precision, real //
+/////////////////////////////////////
+// Each create_* routine takes one Ugrid and one BCtype_s per dimension plus
+// num_splines, and returns a pointer to the new multi-spline object.
+// NOTE(review): how the returned object is allocated and released is
+// defined in the implementation file, not visible here.
+
+// Create 1D uniform single-precision, real Bspline
+multi_UBspline_1d_s *
+create_multi_UBspline_1d_s (Ugrid x_grid, BCtype_s xBC, int num_splines);
+
+// Create 2D uniform single-precision, real Bspline
+multi_UBspline_2d_s *
+create_multi_UBspline_2d_s (Ugrid x_grid,   Ugrid y_grid,
+			    BCtype_s   xBC, BCtype_s   yBC,
+			    int num_splines);
+
+// Create 3D uniform single-precision, real Bspline
+multi_UBspline_3d_s *
+create_multi_UBspline_3d_s (Ugrid x_grid,   Ugrid y_grid,   Ugrid z_grid,
+			    BCtype_s  xBC,  BCtype_s   yBC, BCtype_s   zBC,
+			    int num_splines);
+  
+// Set the data for the splines, and compute spline coefficients
+// spline_num selects which spline in the set receives `data`
+// (presumably 0-based and < num_splines -- confirm in the implementation).
+void
+set_multi_UBspline_1d_s (multi_UBspline_1d_s *spline, 
+			 int spline_num, float *data);
+
+void
+set_multi_UBspline_2d_s (multi_UBspline_2d_s *spline, 
+			 int spline_num, float *data);
+
+void
+set_multi_UBspline_3d_s (multi_UBspline_3d_s *spline, 
+			 int spline_num, float *data);
+
+
+/////////////////////////////////////
+// Uniform, double precision, real //
+/////////////////////////////////////
+// Each create_* routine takes one Ugrid and one BCtype_d per dimension plus
+// num_splines, and returns a pointer to the new multi-spline object.
+// NOTE(review): how the returned object is allocated and released is
+// defined in the implementation file, not visible here.
+
+// Create 1D uniform double-precision, real Bspline
+multi_UBspline_1d_d *
+create_multi_UBspline_1d_d (Ugrid x_grid, BCtype_d xBC, int num_splines);
+
+// Create 2D uniform double-precision, real Bspline
+multi_UBspline_2d_d *
+create_multi_UBspline_2d_d (Ugrid x_grid,   Ugrid y_grid,
+			    BCtype_d   xBC, BCtype_d   yBC,
+			    int num_splines);
+
+// Create 3D uniform double-precision, real Bspline
+multi_UBspline_3d_d *
+create_multi_UBspline_3d_d (Ugrid x_grid,   Ugrid   y_grid,   Ugrid z_grid,
+			    BCtype_d  xBC,  BCtype_d   yBC, BCtype_d   zBC,
+			    int num_splines);
+
+// Set the data for the splines, and compute spline coefficients
+// spline_num selects which spline in the set receives `data`
+// (presumably 0-based and < num_splines -- confirm in the implementation).
+void
+set_multi_UBspline_1d_d (multi_UBspline_1d_d *spline, 
+			 int spline_num, double *data);
+// Variant taking an explicit boundary condition for this call; presumably
+// it is used instead of the one given at creation -- confirm in the
+// implementation.
+void
+set_multi_UBspline_1d_d_BC (multi_UBspline_1d_d *spline, 
+			    int spline_num, double *data, BCtype_d xBC);
+
+void
+set_multi_UBspline_2d_d (multi_UBspline_2d_d *spline, 
+			 int spline_num, double *data);
+
+void
+set_multi_UBspline_3d_d (multi_UBspline_3d_d *spline, 
+			 int spline_num, double *data);
+
+///////////////////////////////////////
+// Uniform, single precision, complex//
+///////////////////////////////////////
+// Each create_* routine takes one Ugrid and one BCtype_c per dimension plus
+// num_splines, and returns a pointer to the new multi-spline object.
+// NOTE(review): how the returned object is allocated and released is
+// defined in the implementation file, not visible here.
+
+// Create 1D uniform single-precision, complex Bspline
+multi_UBspline_1d_c *
+create_multi_UBspline_1d_c (Ugrid x_grid, BCtype_c xBC, int num_splines);
+
+// Create 2D uniform single-precision, complex Bspline
+multi_UBspline_2d_c *
+create_multi_UBspline_2d_c (Ugrid   x_grid, Ugrid   y_grid,
+			    BCtype_c   xBC, BCtype_c   yBC,
+			    int num_splines);
+  
+// Create 3D uniform single-precision, complex Bspline
+multi_UBspline_3d_c *
+create_multi_UBspline_3d_c (Ugrid  x_grid, Ugrid y_grid, Ugrid z_grid,
+			    BCtype_c  xBC, BCtype_c yBC, BCtype_c zBC,
+			    int num_splines);
+
+// Set the data for the splines, and compute spline coefficients
+// spline_num selects which spline in the set receives `data`
+// (presumably 0-based and < num_splines -- confirm in the implementation).
+void
+set_multi_UBspline_1d_c (multi_UBspline_1d_c *spline, int spline_num, 
+			 complex_float *data);
+
+void
+set_multi_UBspline_2d_c (multi_UBspline_2d_c *spline, int spline_num, 
+			 complex_float *data);
+
+void
+set_multi_UBspline_3d_c (multi_UBspline_3d_c *spline, int spline_num, 
+			 complex_float *data);
+
+///////////////////////////////////////
+// Uniform, double precision, complex//
+///////////////////////////////////////
+// Each create_* routine takes one Ugrid and one BCtype_z per dimension plus
+// num_splines, and returns a pointer to the new multi-spline object.
+// NOTE(review): how the returned object is allocated and released is
+// defined in the implementation file, not visible here.
+
+// Create 1D uniform double-precision, complex Bspline
+multi_UBspline_1d_z *
+create_multi_UBspline_1d_z (Ugrid x_grid, BCtype_z xBC, int num_splines);
+
+// Create 2D uniform double-precision, complex Bspline
+multi_UBspline_2d_z *
+create_multi_UBspline_2d_z (Ugrid x_grid, Ugrid y_grid,
+			    BCtype_z   xBC, BCtype_z   yBC,
+			    int num_splines);
+
+// Create 3D uniform double-precision, complex Bspline
+multi_UBspline_3d_z *
+create_multi_UBspline_3d_z (Ugrid  x_grid, Ugrid   y_grid, Ugrid z_grid,
+			    BCtype_z  xBC, BCtype_z   yBC, BCtype_z zBC,
+			    int num_splines);
+
+// Set the data for the splines, and compute spline coefficients
+// spline_num selects which spline in the set receives `data`
+// (presumably 0-based and < num_splines -- confirm in the implementation).
+void
+set_multi_UBspline_1d_z (multi_UBspline_1d_z *spline, int spline_num, 
+			 complex_double *data);
+// Variant taking an explicit boundary condition for this call; presumably
+// it is used instead of the one given at creation -- confirm in the
+// implementation.
+void
+set_multi_UBspline_1d_z_BC (multi_UBspline_1d_z *spline, int spline_num, 
+			    complex_double *data, BCtype_z xBC);
+
+
+void
+set_multi_UBspline_2d_z (multi_UBspline_2d_z *spline, int spline_num, 
+			 complex_double *data);
+
+void
+set_multi_UBspline_3d_z (multi_UBspline_3d_z *spline, int spline_num, 
+			 complex_double *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/src/einspline/multi_bspline_create_cuda.cu
+++ b/src/einspline/multi_bspline_create_cuda.cu
@ -0,0 +1,735 @@
+#include <stdio.h>
+
+#include "multi_bspline.h"
+#include "multi_bspline_structs_cuda.h"
+
+// Device-side coefficient tables shared by this translation unit.
+// Acuda lives in constant memory; every create_*_cuda routine in this file
+// refills it from a host-side 48-float table via cudaMemcpyToSymbol.
+// Bcuda is the double-precision counterpart in device memory.
+// NOTE(review): the declarations precede the *_impl.h includes, presumably
+// because those headers reference Acuda/Bcuda -- keep this order.
+__device__ double Bcuda[48];
+__constant__ float  Acuda[48];
+
+#include "multi_bspline_cuda_s_impl.h"
+#include "multi_bspline_cuda_c_impl.h"
+#include "multi_bspline_cuda_d_impl.h"
+#include "multi_bspline_cuda_z_impl.h"
+
+// The number of splines is rounded up to a multiple of this value when the
+// create_*_cuda routines compute the device-side stride.
+#define COALLESCED_SIZE 16
+
+// Build a GPU-resident copy of a 1D single-precision real multi-spline.
+//
+// Uploads the 48-entry coefficient table to the Acuda constant symbol,
+// allocates a host-side descriptor, and copies the spline coefficients to
+// device memory with the per-point stride rounded up to a multiple of
+// COALLESCED_SIZE.  Padding entries are zero-filled so no uninitialized
+// host memory is uploaded.
+//
+// spline:  host spline to mirror; must be non-NULL.
+// Returns: malloc'ed descriptor whose `coefs` member points to device
+//          memory.  Aborts the process on any allocation or CUDA failure.
+extern "C" multi_UBspline_1d_s_cuda*
+create_multi_UBspline_1d_s_cuda (multi_UBspline_1d_s* spline)
+{
+  // Three 4x4 tables: the first is the cubic B-spline basis matrix; the
+  // second and third are presumably its first and second derivatives.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaError_t err =
+    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Error copying A matrix to GPU constant memory:  Error = %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  multi_UBspline_1d_s_cuda *cuda_spline =
+    (multi_UBspline_1d_s_cuda*) malloc (sizeof (multi_UBspline_1d_s_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int N = spline->num_splines;
+
+  // Round the stride up to the next multiple of COALLESCED_SIZE.
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  cuda_spline->stride = N;
+  cuda_spline->gridInv = spline->x_grid.delta_inv;
+  cuda_spline->dim = spline->x_grid.num;
+
+  // Widen before multiplying so the byte count is not computed in int.
+  size_t size = (size_t)Nx*(size_t)N*sizeof(float);
+
+  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  float *spline_buff = (float*)malloc(size);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Repack from the host stride to the padded device stride.
+  for (int ix=0; ix<Nx; ix++) {
+    float *dst = spline_buff + (size_t)ix*(size_t)N;
+    for (int isp=0; isp<spline->num_splines; isp++)
+      dst[isp] = spline->coefs[ix*spline->x_stride + isp];
+    // Zero the padding lanes instead of uploading garbage.
+    for (int isp=spline->num_splines; isp<N; isp++)
+      dst[isp] = 0.0f;
+  }
+
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+// Build a GPU-resident single-precision copy of a 1D double-precision real
+// multi-spline (coefficients are narrowed to float on the host).
+//
+// Uploads the 48-entry coefficient table to the Acuda constant symbol,
+// allocates a host-side descriptor, and copies the converted coefficients
+// to device memory with the per-point stride rounded up to a multiple of
+// COALLESCED_SIZE.  Padding entries are zero-filled so no uninitialized
+// host memory is uploaded.
+//
+// spline:  host spline to mirror; must be non-NULL.
+// Returns: malloc'ed descriptor whose `coefs` member points to device
+//          memory.  Aborts the process on any allocation or CUDA failure.
+extern "C" multi_UBspline_1d_s_cuda*
+create_multi_UBspline_1d_s_cuda_conv (multi_UBspline_1d_d* spline)
+{
+  // Three 4x4 tables: the first is the cubic B-spline basis matrix; the
+  // second and third are presumably its first and second derivatives.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaError_t err =
+    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Error copying A matrix to GPU constant memory:  Error = %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  multi_UBspline_1d_s_cuda *cuda_spline =
+    (multi_UBspline_1d_s_cuda*) malloc (sizeof (multi_UBspline_1d_s_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int N = spline->num_splines;
+
+  // Round the stride up to the next multiple of COALLESCED_SIZE.
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  cuda_spline->stride = N;
+  cuda_spline->gridInv = spline->x_grid.delta_inv;
+  cuda_spline->dim = spline->x_grid.num;
+
+  // Widen before multiplying so the byte count is not computed in int.
+  size_t size = (size_t)Nx*(size_t)N*sizeof(float);
+
+  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  float *spline_buff = (float*)malloc(size);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Repack to the padded device stride, narrowing double -> float.
+  for (int ix=0; ix<Nx; ix++) {
+    float *dst = spline_buff + (size_t)ix*(size_t)N;
+    for (int isp=0; isp<spline->num_splines; isp++)
+      dst[isp] = (float)spline->coefs[ix*spline->x_stride + isp];
+    // Zero the padding lanes instead of uploading garbage.
+    for (int isp=spline->num_splines; isp<N; isp++)
+      dst[isp] = 0.0f;
+  }
+
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+// Build a GPU-resident copy of a 1D single-precision complex multi-spline.
+//
+// Uploads the 48-entry coefficient table to the Acuda constant symbol,
+// allocates a host-side descriptor, and copies the spline coefficients to
+// device memory with the per-point stride rounded up to a multiple of
+// COALLESCED_SIZE.  Padding entries are zero-filled so no uninitialized
+// host memory is uploaded.
+//
+// spline:  host spline to mirror; must be non-NULL.
+// Returns: malloc'ed descriptor whose `coefs` member points to device
+//          memory.  Aborts the process on any allocation or CUDA failure.
+extern "C" multi_UBspline_1d_c_cuda*
+create_multi_UBspline_1d_c_cuda (multi_UBspline_1d_c* spline)
+{
+  // Three 4x4 tables: the first is the cubic B-spline basis matrix; the
+  // second and third are presumably its first and second derivatives.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaError_t err =
+    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Error copying A matrix to GPU constant memory:  Error = %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  multi_UBspline_1d_c_cuda *cuda_spline =
+    (multi_UBspline_1d_c_cuda*) malloc (sizeof (multi_UBspline_1d_c_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int N = spline->num_splines;
+
+  // Round the stride up to the next multiple of COALLESCED_SIZE.
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  cuda_spline->stride = N;
+  cuda_spline->gridInv = spline->x_grid.delta_inv;
+  cuda_spline->dim = spline->x_grid.num;
+
+  // Widen before multiplying so the byte count is not computed in int.
+  size_t size = (size_t)Nx*(size_t)N*sizeof(complex_float);
+
+  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  complex_float *spline_buff = (complex_float*)malloc(size);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Repack from the host stride to the padded device stride.
+  for (int ix=0; ix<Nx; ix++) {
+    complex_float *dst = spline_buff + (size_t)ix*(size_t)N;
+    for (int isp=0; isp<spline->num_splines; isp++)
+      dst[isp] = spline->coefs[ix*spline->x_stride + isp];
+    // Zero the padding lanes instead of uploading garbage.
+    for (int isp=spline->num_splines; isp<N; isp++)
+      dst[isp] = 0.0f;
+  }
+
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+// Build a GPU-resident single-precision copy of a 1D double-precision
+// complex multi-spline (coefficients are narrowed on the host).
+//
+// Uploads the 48-entry coefficient table to the Acuda constant symbol,
+// allocates a host-side descriptor, and copies the converted coefficients
+// to device memory with the per-point stride rounded up to a multiple of
+// COALLESCED_SIZE.  Padding entries are zero-filled so no uninitialized
+// host memory is uploaded.
+//
+// spline:  host spline to mirror; must be non-NULL.
+// Returns: malloc'ed descriptor whose `coefs` member points to device
+//          memory.  Aborts the process on any allocation or CUDA failure.
+extern "C" multi_UBspline_1d_c_cuda*
+create_multi_UBspline_1d_c_cuda_conv (multi_UBspline_1d_z* spline)
+{
+  // Three 4x4 tables: the first is the cubic B-spline basis matrix; the
+  // second and third are presumably its first and second derivatives.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Error copying A matrix to GPU constant memory:  Error = %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  multi_UBspline_1d_c_cuda *cuda_spline =
+    (multi_UBspline_1d_c_cuda*) malloc (sizeof (multi_UBspline_1d_c_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int N = spline->num_splines;
+
+  // Round the stride up to the next multiple of COALLESCED_SIZE.
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  cuda_spline->stride = N;
+  cuda_spline->gridInv = spline->x_grid.delta_inv;
+  cuda_spline->dim = spline->x_grid.num;
+
+  // Widen before multiplying so the byte count is not computed in int.
+  size_t size = (size_t)Nx*(size_t)N*sizeof(complex_float);
+
+  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  complex_float *spline_buff = (complex_float*)malloc(size);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Repack to the padded device stride; the assignment narrows each
+  // double-precision complex coefficient to complex_float.
+  for (int ix=0; ix<Nx; ix++) {
+    complex_float *dst = spline_buff + (size_t)ix*(size_t)N;
+    for (int isp=0; isp<spline->num_splines; isp++)
+      dst[isp] = spline->coefs[ix*spline->x_stride + isp];
+    // Zero the padding lanes instead of uploading garbage.
+    for (int isp=spline->num_splines; isp<N; isp++)
+      dst[isp] = 0.0f;
+  }
+
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+
+// Build a GPU-resident copy of a 3D single-precision complex multi-spline.
+//
+// Uploads the 48-entry coefficient table to the Acuda constant symbol,
+// allocates a host-side descriptor, and copies the spline coefficients to
+// device memory with the innermost (spline) dimension padded up to a
+// multiple of COALLESCED_SIZE; padding entries are zero-filled.
+//
+// spline:  host spline to mirror; must be non-NULL.
+// Returns: malloc'ed descriptor whose `coefs` member points to device
+//          memory.  Aborts the process on any allocation or CUDA failure.
+extern "C" multi_UBspline_3d_c_cuda*
+create_multi_UBspline_3d_c_cuda (multi_UBspline_3d_c* spline)
+{
+  // Three 4x4 tables: the first is the cubic B-spline basis matrix; the
+  // second and third are presumably its first and second derivatives.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaError_t err =
+    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Error copying A matrix to GPU constant memory:  Error = %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  multi_UBspline_3d_c_cuda *cuda_spline =
+    (multi_UBspline_3d_c_cuda*) malloc (sizeof (multi_UBspline_3d_c_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Round the spline count up to the next multiple of COALLESCED_SIZE.
+  int N = spline->num_splines;
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Host-side packing strides in complex elements, kept in size_t so large
+  // grids do not overflow int arithmetic.
+  const size_t sz = (size_t)N;
+  const size_t sy = (size_t)Nz*sz;
+  const size_t sx = (size_t)Ny*sy;
+
+  size_t size = (size_t)Nx*sx*sizeof(std::complex<float>);
+
+  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  std::complex<float> *spline_buff = (std::complex<float>*)malloc(size);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++) {
+        std::complex<float> *dst = spline_buff + ix*sx + iy*sy + iz*sz;
+        for (int isp=0; isp<spline->num_splines; isp++)
+          dst[isp] = spline->coefs[ix*spline->x_stride +
+                                   iy*spline->y_stride +
+                                   iz*spline->z_stride + isp];
+        // Zero the padding lanes.
+        for (int isp=spline->num_splines; isp < N; isp++)
+          dst[isp] = 0.0;
+      }
+
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+  free(spline_buff);
+
+  // The published strides are twice the complex-element strides used for
+  // packing -- presumably because device code addresses coefs as floats
+  // (two per complex value); confirm against the kernels.
+  cuda_spline->stride.x = 2*Ny*Nz*N;
+  cuda_spline->stride.y = 2*Nz*N;
+  cuda_spline->stride.z = 2*N;
+
+  return cuda_spline;
+}
+
+
+// Build a GPU-resident single-precision copy of a 3D double-precision
+// complex multi-spline (coefficients are narrowed on the host).
+//
+// Uploads the 48-entry coefficient table to the Acuda constant symbol,
+// allocates a host-side descriptor, and copies the converted coefficients
+// to device memory with the innermost (spline) dimension padded up to a
+// multiple of COALLESCED_SIZE; padding entries are zero-filled.
+//
+// spline:  host spline to mirror; must be non-NULL.
+// Returns: malloc'ed descriptor whose `coefs` member points to device
+//          memory.  Aborts the process on any allocation or CUDA failure.
+extern "C" multi_UBspline_3d_c_cuda*
+create_multi_UBspline_3d_c_cuda_conv (multi_UBspline_3d_z* spline)
+{
+  // Three 4x4 tables: the first is the cubic B-spline basis matrix; the
+  // second and third are presumably its first and second derivatives.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaError_t err =
+    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Error copying A matrix to GPU constant memory:  Error = %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  multi_UBspline_3d_c_cuda *cuda_spline =
+    (multi_UBspline_3d_c_cuda*) malloc (sizeof (multi_UBspline_3d_c_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Round the spline count up to the next multiple of COALLESCED_SIZE.
+  int N = spline->num_splines;
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Host-side packing strides in complex elements, kept in size_t so large
+  // grids do not overflow int arithmetic.
+  const size_t sz = (size_t)N;
+  const size_t sy = (size_t)Nz*sz;
+  const size_t sx = (size_t)Ny*sy;
+
+  size_t size = (size_t)Nx*sx*sizeof(std::complex<float>);
+
+  cudaMalloc((void**)&(cuda_spline->coefs), size);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // size_t has no portable printf length modifier in older toolchains;
+    // cast to unsigned long and print with %lu.
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  std::complex<float> *spline_buff = (std::complex<float>*)malloc(size);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++) {
+        std::complex<float> *dst = spline_buff + ix*sx + iy*sy + iz*sz;
+        for (int isp=0; isp<spline->num_splines; isp++) {
+          std::complex<double> z = spline->coefs[ix*spline->x_stride +
+                                                 iy*spline->y_stride +
+                                                 iz*spline->z_stride + isp];
+          dst[isp] = std::complex<float>(z.real(), z.imag());
+        }
+        // Zero the padding lanes.
+        for (int isp=spline->num_splines; isp < N; isp++)
+          dst[isp] = 0.0;
+      }
+
+  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  cudaThreadSynchronize();
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+  free(spline_buff);
+
+  // The published strides are twice the complex-element strides used for
+  // packing -- presumably because device code addresses coefs as floats
+  // (two per complex value); confirm against the kernels.
+  cuda_spline->stride.x = 2*Ny*Nz*N;
+  cuda_spline->stride.y = 2*Nz*N;
+  cuda_spline->stride.z = 2*N;
+
+  return cuda_spline;
+}
+
+
+
+
+// Build a GPU-resident copy of a 3D single-precision real multi-spline.
+//
+// Uploads the 48-entry coefficient table to the Acuda constant symbol,
+// allocates a host-side descriptor, and copies the spline coefficients to
+// device memory with the innermost (spline) dimension padded up to a
+// multiple of COALLESCED_SIZE.  Padding entries are zero-filled so no
+// uninitialized host memory is uploaded.
+//
+// spline:  host spline to mirror; must be non-NULL.
+// Returns: malloc'ed descriptor whose `coefs` member points to device
+//          memory.  Aborts the process on any allocation or CUDA failure.
+extern "C" multi_UBspline_3d_s_cuda*
+create_multi_UBspline_3d_s_cuda (multi_UBspline_3d_s* spline)
+{
+  // Three 4x4 tables: the first is the cubic B-spline basis matrix; the
+  // second and third are presumably its first and second derivatives.
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaError_t err =
+    cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Error copying A matrix to GPU constant memory:  Error = %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  multi_UBspline_3d_s_cuda *cuda_spline =
+    (multi_UBspline_3d_s_cuda*) malloc (sizeof (multi_UBspline_3d_s_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Round the spline count up to the next multiple of COALLESCED_SIZE.
+  int N = spline->num_splines;
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  cuda_spline->stride.x = Ny*Nz*N;
+  cuda_spline->stride.y = Nz*N;
+  cuda_spline->stride.z = N;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Host-side packing strides in elements, kept in size_t so large grids
+  // do not overflow int arithmetic.
+  const size_t sz = (size_t)N;
+  const size_t sy = (size_t)Nz*sz;
+  const size_t sx = (size_t)Ny*sy;
+
+  size_t size = (size_t)Nx*sx*sizeof(float);
+
+  err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  float *spline_buff = (float*)malloc(size);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++) {
+        float *dst = spline_buff + ix*sx + iy*sy + iz*sz;
+        for (int isp=0; isp<spline->num_splines; isp++)
+          dst[isp] = spline->coefs[ix*spline->x_stride +
+                                   iy*spline->y_stride +
+                                   iz*spline->z_stride + isp];
+        // Zero the padding lanes instead of uploading garbage.
+        for (int isp=spline->num_splines; isp<N; isp++)
+          dst[isp] = 0.0f;
+      }
+
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+// Creates a single-precision GPU mirror of a double-precision 3D multi-spline.
+//
+// The 48-entry table A_h is uploaded to the Acuda symbol, a host-side
+// descriptor is allocated with malloc, and the coefficients are repacked
+// (narrowed from double to float) and copied into device memory.  The
+// per-point spline count is rounded up to a multiple of COALLESCED_SIZE;
+// the padding slots are zero-filled.
+//
+// spline:  host spline whose grid and coefficients are mirrored.
+// Returns the malloc'ed descriptor; its coefs member is a device pointer.
+// Aborts the process if a host/device allocation or the upload fails.
+extern "C" multi_UBspline_3d_s_cuda*
+create_multi_UBspline_3d_s_cuda_conv (multi_UBspline_3d_d* spline)
+{
+  fprintf (stderr, "In create_multi_UBspline_3d_s_cuda_conv.\n");
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  multi_UBspline_3d_s_cuda *cuda_spline =
+    (multi_UBspline_3d_s_cuda*) malloc (sizeof (multi_UBspline_3d_s_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  // Three extra coefficients per dimension beyond the grid point count.
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Round the spline count up to a multiple of COALLESCED_SIZE.
+  int N = spline->num_splines;
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  cuda_spline->stride.x = Ny*Nz*N;
+  cuda_spline->stride.y = Nz*N;
+  cuda_spline->stride.z = N;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen before multiplying so the byte count cannot overflow int.
+  size_t size = (size_t)Nx*Ny*Nz*N*sizeof(float);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  // calloc so that the padding slots are uploaded as zeros rather than
+  // uninitialised host memory.
+  float *spline_buff = (float*)calloc(size, 1);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Repack from the host strides to the padded device strides; the
+  // assignment narrows each coefficient from double to float.
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+        for (int isp=0; isp<spline->num_splines; isp++) {
+          size_t dst = (size_t)ix*cuda_spline->stride.x +
+                       (size_t)iy*cuda_spline->stride.y +
+                       (size_t)iz*cuda_spline->stride.z + isp;
+          spline_buff[dst] =
+            spline->coefs[ix*spline->x_stride +
+                          iy*spline->y_stride +
+                          iz*spline->z_stride + isp];
+        }
+  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  cudaThreadSynchronize();
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+
+// Creates a double-precision GPU mirror of a double-precision 3D multi-spline.
+//
+// The 48-entry table B_h is uploaded to the Bcuda symbol, a host-side
+// descriptor is allocated with malloc, and the coefficients are repacked and
+// copied into device memory.  The per-point spline count is rounded up to a
+// multiple of COALLESCED_SIZE; the padding slots are zero-filled.
+//
+// spline:  host spline whose grid and coefficients are mirrored.
+// Returns the malloc'ed descriptor; its coefs member is a device pointer.
+// Aborts the process if a host/device allocation or the upload fails.
+extern "C" multi_UBspline_3d_d_cuda*
+create_multi_UBspline_3d_d_cuda (multi_UBspline_3d_d* spline)
+{
+  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                          0.0,     -0.5,      1.0,    -0.5,
+                          0.0,      1.5,     -2.0,     0.0,
+                          0.0,     -1.5,      1.0,     0.5,
+                          0.0,      0.5,      0.0,     0.0,
+                          0.0,      0.0,     -1.0,     1.0,
+                          0.0,      0.0,      3.0,    -2.0,
+                          0.0,      0.0,     -3.0,     1.0,
+                          0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);
+
+  multi_UBspline_3d_d_cuda *cuda_spline =
+    (multi_UBspline_3d_d_cuda*) malloc (sizeof (multi_UBspline_3d_d_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  // Three extra coefficients per dimension beyond the grid point count.
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Round the spline count up to a multiple of COALLESCED_SIZE.
+  int N = spline->num_splines;
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  cuda_spline->stride.x = Ny*Nz*N;
+  cuda_spline->stride.y = Nz*N;
+  cuda_spline->stride.z = N;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen before multiplying so the byte count cannot overflow int.
+  size_t size = (size_t)Nx*Ny*Nz*N*sizeof(double);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  // calloc so that the padding slots are uploaded as zeros rather than
+  // uninitialised host memory.
+  double *spline_buff = (double*)calloc(size, 1);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Repack from the host strides to the padded device strides.
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+        for (int isp=0; isp<spline->num_splines; isp++) {
+          size_t dst = (size_t)ix*cuda_spline->stride.x +
+                       (size_t)iy*cuda_spline->stride.y +
+                       (size_t)iz*cuda_spline->stride.z + isp;
+          spline_buff[dst] =
+            spline->coefs[ix*spline->x_stride +
+                          iy*spline->y_stride +
+                          iz*spline->z_stride + isp];
+        }
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+
+
+
+// Creates a double-precision complex GPU mirror of a complex 3D multi-spline.
+//
+// The 48-entry table B_h is uploaded to the Bcuda symbol, a host-side
+// descriptor is allocated with malloc, and the complex coefficients are
+// repacked and copied into device memory.  The per-point spline count is
+// rounded up to a multiple of COALLESCED_SIZE; the padding slots are
+// zero-filled.
+//
+// While packing, the strides count complex elements.  Before returning they
+// are doubled, so the descriptor stores them in units of double (presumably
+// because device code addresses the data as interleaved doubles -- confirm
+// against the evaluation kernels).
+//
+// spline:  host spline whose grid and coefficients are mirrored.
+// Returns the malloc'ed descriptor; its coefs member is a device pointer.
+// Aborts the process if a host/device allocation or the upload fails.
+extern "C" multi_UBspline_3d_z_cuda*
+create_multi_UBspline_3d_z_cuda (multi_UBspline_3d_z* spline)
+{
+  double B_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                      3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                     -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                      1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                          0.0,     -0.5,      1.0,    -0.5,
+                          0.0,      1.5,     -2.0,     0.0,
+                          0.0,     -1.5,      1.0,     0.5,
+                          0.0,      0.5,      0.0,     0.0,
+                          0.0,      0.0,     -1.0,     1.0,
+                          0.0,      0.0,      3.0,    -2.0,
+                          0.0,      0.0,     -3.0,     1.0,
+                          0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Bcuda, B_h, 48*sizeof(double), 0, cudaMemcpyHostToDevice);
+
+  multi_UBspline_3d_z_cuda *cuda_spline =
+    (multi_UBspline_3d_z_cuda*) malloc (sizeof (multi_UBspline_3d_z_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  // Three extra coefficients per dimension beyond the grid point count.
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Round the spline count up to a multiple of COALLESCED_SIZE.
+  int N = spline->num_splines;
+  if ((N%COALLESCED_SIZE) != 0)
+    N += COALLESCED_SIZE - (N%COALLESCED_SIZE);
+  // Strides in complex elements, used for the packing loop below.
+  cuda_spline->stride.x = Ny*Nz*N;
+  cuda_spline->stride.y = Nz*N;
+  cuda_spline->stride.z = N;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  cuda_spline->dim.x = spline->x_grid.num;
+  cuda_spline->dim.y = spline->y_grid.num;
+  cuda_spline->dim.z = spline->z_grid.num;
+
+  // Widen before multiplying so the byte count cannot overflow int.
+  size_t size = (size_t)Nx*Ny*Nz*N*sizeof(std::complex<double>);
+
+  cudaError_t err = cudaMalloc((void**)&(cuda_spline->coefs), size);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to allocate %lu memory for GPU spline coefficients.  Error %s\n",
+             (unsigned long)size, cudaGetErrorString(err));
+    abort();
+  }
+
+  // calloc so that the padding slots are uploaded as zeros rather than
+  // uninitialised host memory.
+  std::complex<double> *spline_buff = (std::complex<double>*)calloc(size, 1);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Repack from the host strides to the padded device strides.
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+        for (int isp=0; isp<spline->num_splines; isp++) {
+          size_t dst = (size_t)ix*cuda_spline->stride.x +
+                       (size_t)iy*cuda_spline->stride.y +
+                       (size_t)iz*cuda_spline->stride.z + isp;
+          spline_buff[dst] =
+            spline->coefs[ix*spline->x_stride +
+                          iy*spline->y_stride +
+                          iz*spline->z_stride + isp];
+        }
+  err = cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf (stderr, "Failed to copy spline to GPU memory.  Error:  %s\n",
+             cudaGetErrorString(err));
+    abort();
+  }
+
+  // From here on the descriptor stores strides in units of double.
+  cuda_spline->stride.x = 2*Ny*Nz*N;
+  cuda_spline->stride.y = 2*Nz*N;
+  cuda_spline->stride.z = 2*N;
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
--- a/src/einspline/multi_bspline_create_cuda.h
+++ b/src/einspline/multi_bspline_create_cuda.h
@ -0,0 +1,80 @@
+#ifndef MULTI_BSPLINE_CREATE_CUDA_H
+#define MULTI_BSPLINE_CREATE_CUDA_H
+
+#include "multi_bspline_structs_cuda.h"
+
+// Factory functions that build a GPU-side multi-spline from a host-side one.
+// All of them have C linkage.  Naming: the suffix before "_cuda" is the
+// precision of the returned GPU spline (s, c, d, z); the "_conv" variants
+// take the double-precision host type (d or z) and return the
+// single-precision GPU type (s or c).
+extern "C" {
+
+/* ------------------------------- 1D ------------------------------- */
+multi_UBspline_1d_s_cuda*
+create_multi_UBspline_1d_s_cuda (multi_UBspline_1d_s* spline);
+multi_UBspline_1d_s_cuda*
+create_multi_UBspline_1d_s_cuda_conv (multi_UBspline_1d_d* spline);
+
+multi_UBspline_1d_c_cuda*
+create_multi_UBspline_1d_c_cuda (multi_UBspline_1d_c* spline);
+multi_UBspline_1d_c_cuda*
+create_multi_UBspline_1d_c_cuda_conv (multi_UBspline_1d_z* spline);
+
+multi_UBspline_1d_d_cuda*
+create_multi_UBspline_1d_d_cuda (multi_UBspline_1d_d* spline);
+multi_UBspline_1d_z_cuda*
+create_multi_UBspline_1d_z_cuda (multi_UBspline_1d_z* spline);
+
+/* ------------------------------- 2D ------------------------------- */
+multi_UBspline_2d_s_cuda*
+create_multi_UBspline_2d_s_cuda (multi_UBspline_2d_s* spline);
+multi_UBspline_2d_s_cuda*
+create_multi_UBspline_2d_s_cuda_conv (multi_UBspline_2d_d* spline);
+
+multi_UBspline_2d_c_cuda*
+create_multi_UBspline_2d_c_cuda (multi_UBspline_2d_c* spline);
+multi_UBspline_2d_c_cuda*
+create_multi_UBspline_2d_c_cuda_conv (multi_UBspline_2d_z* spline);
+
+multi_UBspline_2d_d_cuda*
+create_multi_UBspline_2d_d_cuda (multi_UBspline_2d_d* spline);
+multi_UBspline_2d_z_cuda*
+create_multi_UBspline_2d_z_cuda (multi_UBspline_2d_z* spline);
+
+/* ------------------------------- 3D ------------------------------- */
+multi_UBspline_3d_s_cuda*
+create_multi_UBspline_3d_s_cuda (multi_UBspline_3d_s* spline);
+multi_UBspline_3d_s_cuda*
+create_multi_UBspline_3d_s_cuda_conv (multi_UBspline_3d_d* spline);
+
+multi_UBspline_3d_c_cuda*
+create_multi_UBspline_3d_c_cuda (multi_UBspline_3d_c* spline);
+multi_UBspline_3d_c_cuda*
+create_multi_UBspline_3d_c_cuda_conv (multi_UBspline_3d_z* spline);
+
+multi_UBspline_3d_d_cuda*
+create_multi_UBspline_3d_d_cuda (multi_UBspline_3d_d* spline);
+multi_UBspline_3d_z_cuda*
+create_multi_UBspline_3d_z_cuda (multi_UBspline_3d_z* spline);
+
+} // extern "C"
+
+#endif
--- a/src/einspline/multi_bspline_cuda_c.cu
+++ b/src/einspline/multi_bspline_cuda_c.cu
@ -0,0 +1,640 @@
+#define BLOCK_SIZE 64
+
+#include "multi_bspline.h"
+#include "multi_bspline_create_cuda.h"
+
+//__constant__ float A[48];
+
+// typedef struct
+// {
+//   float *coefs_real, *coefs_imag;
+//   uint3 stride;
+//   float3 gridInv;
+//   int num_splines;
+// } multi_UBspline_3d_c_cuda;
+
+#ifndef NO_CUDA_MAIN
+// Creates the GPU mirror of a single-precision complex 3D multi-spline, with
+// the real and imaginary parts stored in two separate device arrays
+// (coefs_real and coefs_imag).
+//
+// The per-point spline count is rounded up to a multiple of BLOCK_SIZE; the
+// padding slots are zero-filled.
+//
+// spline:  host spline whose coefficients are mirrored.
+// Returns a malloc'ed descriptor holding the two device pointers, the padded
+// strides and the spline count.  Aborts the process if a host allocation
+// fails.
+extern "C" multi_UBspline_3d_c_cuda*
+create_multi_UBspline_3d_c_cuda (multi_UBspline_3d_c* spline)
+{
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  // The kernels in this file read the table through the Acuda symbol, so that
+  // is the symbol that has to be filled (the old "A" declaration is commented
+  // out at the top of the file).
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  // Allocate the struct itself, not just room for a pointer to it.
+  multi_UBspline_3d_c_cuda *cuda_spline =
+    (multi_UBspline_3d_c_cuda*) malloc (sizeof (multi_UBspline_3d_c_cuda));
+  if (!cuda_spline) {
+    fprintf (stderr, "Failed to allocate memory for GPU spline descriptor.\n");
+    abort();
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  // Three extra coefficients per dimension beyond the grid point count.
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Round the spline count up to a multiple of BLOCK_SIZE.
+  int N = spline->num_splines;
+  if ((N%BLOCK_SIZE) != 0)
+    N += BLOCK_SIZE - (N%BLOCK_SIZE);
+  cuda_spline->stride.x = Ny*Nz*N;
+  cuda_spline->stride.y = Nz*N;
+  cuda_spline->stride.z = N;
+
+  // Bytes per component array: one float for each of Nx*Ny*Nz*N slots.
+  // Widened before multiplying so the product cannot overflow int.
+  size_t size = (size_t)Nx*Ny*Nz*N*sizeof(float);
+
+  cudaMalloc((void**)&(cuda_spline->coefs_real), size);
+  cudaMalloc((void**)&(cuda_spline->coefs_imag), size);
+
+  // calloc so that the padding slots are uploaded as zeros; they are never
+  // written by the packing loop, so they stay zero for both passes.
+  float *spline_buff = (float*)calloc(size, 1);
+  if (!spline_buff) {
+    fprintf (stderr, "Failed to allocate memory for temporary spline buffer.\n");
+    abort();
+  }
+
+  // Pass 0 packs and uploads the real parts, pass 1 the imaginary parts.
+  for (int part=0; part<2; part++) {
+    for (int ix=0; ix<Nx; ix++)
+      for (int iy=0; iy<Ny; iy++)
+        for (int iz=0; iz<Nz; iz++)
+          for (int isp=0; isp<spline->num_splines; isp++) {
+            size_t dst = (size_t)ix*cuda_spline->stride.x +
+                         (size_t)iy*cuda_spline->stride.y +
+                         (size_t)iz*cuda_spline->stride.z + isp;
+            if (part == 0)
+              spline_buff[dst] =
+                spline->coefs[ix*spline->x_stride +
+                              iy*spline->y_stride +
+                              iz*spline->z_stride + isp].real();
+            else
+              spline_buff[dst] =
+                spline->coefs[ix*spline->x_stride +
+                              iy*spline->y_stride +
+                              iz*spline->z_stride + isp].imag();
+          }
+    cudaMemcpy(part == 0 ? cuda_spline->coefs_real : cuda_spline->coefs_imag,
+               spline_buff, size, cudaMemcpyHostToDevice);
+  }
+
+  free(spline_buff);
+
+  return cuda_spline;
+}
+#endif
+
+
+// Evaluates a batch of complex splines at a batch of positions.
+//
+// Launch layout, as read from the index arithmetic below: blockDim.x must be
+// BLOCK_SIZE (64, because thread ids are decoded into a 4x4x4 stencil index),
+// blockIdx.x selects a group of BLOCK_SIZE splines and blockIdx.y selects the
+// position.
+//
+// pos:        positions, four floats per position (x, y, z, one unused).
+// drInv:      per-axis factor converting a position into grid units.
+// coefs_real, coefs_imag:  separate real/imaginary coefficient arrays.
+// vals:       one output pointer per position; each spline's result is
+//             written as an interleaved (real, imag) pair of floats.
+// strides:    element strides of the coefficient arrays along x, y, z.
+//
+// NOTE(review): the grid index is not clamped, so positions outside the
+// grid read outside the coefficient arrays.  No grid dimension is passed in,
+// so this cannot be fixed here without changing the signature.
+__global__ static void
+eval_multi_multi_UBspline_3d_c_cuda (float *pos, float3 drInv, 
+                                     const float *coefs_real, const float *coefs_imag,
+                                     float *vals[], uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*BLOCK_SIZE+thr;   // spline handled by this thread
+
+  __shared__ float *myval;
+  __shared__ float abc[64];
+  __shared__ float3 r;
+
+  // One thread fetches the position and output pointer for the whole block.
+  if (thr == 0) {
+    r.x = pos[4*ir+0];
+    r.y = pos[4*ir+1];
+    r.z = pos[4*ir+2];
+    myval = vals[ir];
+  }
+  __syncthreads();
+
+  // Split each coordinate into an integer cell index and a fraction t.
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // Four 1D basis weights per axis, from the first four rows of Acuda.
+  __shared__ float a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Each of the 64 threads builds one entry of the 4x4x4 weight product.
+  int wi = (thr>>4)&3;
+  int wj = (thr>>2)&3;
+  int wk = (thr & 3);
+
+  abc[thr] = a[wi]*b[wj]*c[wk];
+  __syncthreads();
+
+  // Accumulate the 64-point stencil for this thread's spline.
+  float val_real = 0.0;
+  float val_imag = 0.0;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      const float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      const float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      for (int k=0; k<4; k++) {
+        val_real += abc[16*i+4*j+k] * base_real[off+k*strides.z];
+        val_imag += abc[16*i+4*j+k] * base_imag[off+k*strides.z];
+      }
+    }
+  }
+
+  // Stage the (real, imag) pairs in shared memory so that consecutive
+  // threads write consecutive output addresses.
+  __shared__ float buff[2*BLOCK_SIZE];
+  buff[2*thr+0] = val_real;
+  buff[2*thr+1] = val_imag;
+  __syncthreads();
+  // This block produces 2*BLOCK_SIZE floats, so its output window starts at
+  // 2*block*BLOCK_SIZE.  Starting at block*BLOCK_SIZE would overlap the
+  // window of the neighbouring block.
+  myval[(2*block+0)*BLOCK_SIZE+thr] = buff[thr];
+  myval[(2*block+1)*BLOCK_SIZE+thr] = buff[thr+BLOCK_SIZE];
+}
+
+
+
+// Evaluates value, gradient and Hessian of a batch of complex splines at a
+// batch of positions.
+//
+// Launch layout, as read from the index arithmetic below: blockDim.x must be
+// BLOCK_SIZE (64, because thread ids are decoded into a 4x4x4 stencil index),
+// blockIdx.x selects a group of BLOCK_SIZE splines and blockIdx.y selects the
+// position.
+//
+// pos:        positions, four floats per position (x, y, z, one unused).
+// drInv:      per-axis factor converting a position into grid units; also
+//             used to scale the derivatives.
+// coefs_real, coefs_imag:  separate real/imaginary coefficient arrays.
+// vals, grads, hess:  one output pointer per position.  Per spline the
+//             kernel writes 2 floats of value, 6 floats of gradient
+//             (x, y, z as complex pairs) and 12 floats of Hessian
+//             (xx, xy, xz, yy, yz, zz as complex pairs).
+// strides:    element strides of the coefficient arrays along x, y, z.
+//
+// NOTE(review): the grid index is not clamped, so positions outside the
+// grid read outside the coefficient arrays.  No grid dimension is passed in,
+// so this cannot be fixed here without changing the signature.
+__global__ static void
+eval_multi_multi_UBspline_3d_c_vgh_cuda (float *pos, float3 drInv, 
+                                         const float *coefs_real, const float *coefs_imag,
+                                         float *vals[], float *grads[], float *hess[],
+                                         uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*BLOCK_SIZE+thr;   // spline handled by this thread
+
+  __shared__ float *myval, *mygrad, *myhess;
+  __shared__ float3 r;
+  // One thread fetches the position and output pointers for the whole block.
+  if (thr == 0) {
+    r.x = pos[4*ir+0];
+    r.y = pos[4*ir+1];
+    r.z = pos[4*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+
+  // Split each coordinate into an integer cell index and a fraction t.
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Each of the 64 threads builds the ten weight products (value, three
+  // first derivatives, six second derivatives) of one stencil point.
+  __shared__ float abc[640];
+  int wi = (thr>>4)&3;
+  int wj = (thr>>2)&3;
+  int wk = (thr & 3);
+  int w  = 10*(16*wi+4*wj+wk);
+
+  abc[w+0] = a[wi+0]*b[wj+0]*c[wk+0]; // val
+  abc[w+1] = a[wi+4]*b[wj+0]*c[wk+0]; // d/dx
+  abc[w+2] = a[wi+0]*b[wj+4]*c[wk+0]; // d/dy
+  abc[w+3] = a[wi+0]*b[wj+0]*c[wk+4]; // d/dz
+  abc[w+4] = a[wi+8]*b[wj+0]*c[wk+0]; // d2/dx2
+  abc[w+5] = a[wi+4]*b[wj+4]*c[wk+0]; // d2/dxdy
+  abc[w+6] = a[wi+4]*b[wj+0]*c[wk+4]; // d2/dxdz
+  abc[w+7] = a[wi+0]*b[wj+8]*c[wk+0]; // d2/dy2
+  abc[w+8] = a[wi+0]*b[wj+4]*c[wk+4]; // d2/dydz
+  abc[w+9] = a[wi+0]*b[wj+0]*c[wk+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v_r = 0.0;
+  float v_i = 0.0;
+  float g0_r=0.0, g0_i=0.0, g1_r=0.0, g1_i=0.0, g2_r=0.0, g2_i=0.0, 
+    h00_r=0.0, h00_i=0.0, h01_r=0.0, h01_i=0.0, h02_r=0.0, h02_i=0.0, 
+    h11_r=0.0, h11_i=0.0, h12_r=0.0, h12_i=0.0, h22_r=0.0, h22_i=0.0;
+
+  // Accumulate the 64-point stencil; n walks abc in the same (i, j, k)
+  // order in which it was filled above.
+  int n = 0;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      const float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      const float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      for (int k=0; k<4; k++) {
+        float cr = base_real[off+k*strides.z];
+        float ci = base_imag[off+k*strides.z];
+        v_r   += abc[n+0] * cr;  v_i   += abc[n+0] * ci;
+        g0_r  += abc[n+1] * cr;  g0_i  += abc[n+1] * ci;
+        g1_r  += abc[n+2] * cr;  g1_i  += abc[n+2] * ci;
+        g2_r  += abc[n+3] * cr;  g2_i  += abc[n+3] * ci;
+        h00_r += abc[n+4] * cr;  h00_i += abc[n+4] * ci;
+        h01_r += abc[n+5] * cr;  h01_i += abc[n+5] * ci;
+        h02_r += abc[n+6] * cr;  h02_i += abc[n+6] * ci;
+        h11_r += abc[n+7] * cr;  h11_i += abc[n+7] * ci;
+        h12_r += abc[n+8] * cr;  h12_i += abc[n+8] * ci;
+        h22_r += abc[n+9] * cr;  h22_i += abc[n+9] * ci; 
+        n += 10;
+      }
+    }
+  }
+
+  // Convert derivatives from grid units to position units.
+  g0_r *= drInv.x; g0_i *= drInv.x;
+  g1_r *= drInv.y; g1_i *= drInv.y;
+  g2_r *= drInv.z; g2_i *= drInv.z;
+
+  h00_r *= drInv.x * drInv.x;  h00_i *= drInv.x * drInv.x;
+  h01_r *= drInv.x * drInv.y;  h01_i *= drInv.x * drInv.y;
+  h02_r *= drInv.x * drInv.z;  h02_i *= drInv.x * drInv.z;
+  h11_r *= drInv.y * drInv.y;  h11_i *= drInv.y * drInv.y;
+  h12_r *= drInv.y * drInv.z;  h12_i *= drInv.y * drInv.z;
+  h22_r *= drInv.z * drInv.z;  h22_i *= drInv.z * drInv.z;
+
+  // Results are staged in shared memory so that consecutive threads write
+  // consecutive output addresses.  buff is reused for every output, so each
+  // reuse is separated from the preceding reads by a barrier.
+  // Note, we can reuse abc, by replacing buff with abc.
+  __shared__ float buff[6*BLOCK_SIZE];
+  const int half = BLOCK_SIZE/2;
+
+  // Values: 2*BLOCK_SIZE floats per block, so the block's output window
+  // starts at 2*block*BLOCK_SIZE.
+  buff[2*thr+0] = v_r;  buff[2*thr+1] = v_i;
+  __syncthreads();
+  myval[(2*block+0)*BLOCK_SIZE+thr] = buff[thr];
+  myval[(2*block+1)*BLOCK_SIZE+thr] = buff[thr+BLOCK_SIZE];
+  __syncthreads();   // all value reads done before buff is overwritten
+
+  // Gradients: 6*BLOCK_SIZE floats per block.
+  buff[6*thr+0] = g0_r;  buff[6*thr+1] = g0_i;
+  buff[6*thr+2] = g1_r;  buff[6*thr+3] = g1_i;
+  buff[6*thr+4] = g2_r;  buff[6*thr+5] = g2_i;
+  __syncthreads();
+  for (int i=0; i<6; i++) 
+    mygrad[(6*block+i)*BLOCK_SIZE+thr] = buff[i*BLOCK_SIZE+thr]; 
+  __syncthreads();
+
+  // Hessians: 12*BLOCK_SIZE floats per block, twice the size of buff, so
+  // they go out in two halves.  Only half of the threads stage data each
+  // time, but all threads take part in the copy-out: the staged half fills
+  // all 6*BLOCK_SIZE entries of buff.
+  if (thr < half) {
+    buff[12*thr+0]  = h00_r;    buff[12*thr+1]  = h00_i;
+    buff[12*thr+2]  = h01_r;    buff[12*thr+3]  = h01_i;
+    buff[12*thr+4]  = h02_r;    buff[12*thr+5]  = h02_i;
+    buff[12*thr+6]  = h11_r;    buff[12*thr+7]  = h11_i;
+    buff[12*thr+8]  = h12_r;    buff[12*thr+9]  = h12_i;
+    buff[12*thr+10] = h22_r;    buff[12*thr+11] = h22_i;
+  }
+  __syncthreads();
+  for (int i=0; i<6; i++) 
+    myhess[(12*block+i)*BLOCK_SIZE+thr] = buff[i*BLOCK_SIZE+thr];
+  __syncthreads();
+
+  int th2 = thr-half;
+  if (thr >= half) {
+    buff[12*th2+0]  = h00_r;    buff[12*th2+1]  = h00_i;
+    buff[12*th2+2]  = h01_r;    buff[12*th2+3]  = h01_i;
+    buff[12*th2+4]  = h02_r;    buff[12*th2+5]  = h02_i;
+    buff[12*th2+6]  = h11_r;    buff[12*th2+7]  = h11_i;
+    buff[12*th2+8]  = h12_r;    buff[12*th2+9]  = h12_i;
+    buff[12*th2+10] = h22_r;    buff[12*th2+11] = h22_i;
+  }
+  __syncthreads();
+  for (int i=0; i<6; i++) 
+    myhess[(12*block+6+i)*BLOCK_SIZE+thr] = buff[i*BLOCK_SIZE+thr];
+}
+
+				    
+
+#ifndef NO_CUDA_MAIN
+// Stand-alone throughput test for the two complex 3D evaluation kernels.
+//
+// Selects the CUDA device whose ordinal is encoded in the pointer argument,
+// fills a 16x16x16 grid of 128 splines with random coefficients (the same
+// data is used for the real and imaginary parts), launches each kernel
+// 10000 times for 200 positions and prints the measured evaluation rate to
+// stderr.
+//
+// thread:  device ordinal, cast to a pointer.
+// Returns NULL.
+static void *
+test_multi_cuda(void *thread)
+{
+  cudaSetDevice((int)(size_t)thread);
+  fprintf (stderr, "In thread %p\n", thread);
+
+  const int numWalkers = 200;
+  // Host arrays holding device pointers, one per walker.
+  float *vals[numWalkers], *grads[numWalkers], *hess[numWalkers];
+  float *coefs;
+  float *coefs_real_d, *coefs_imag_d;
+  float **vals_d, **grads_d, **hess_d;
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  // Copy A to the device.  Acuda is a device symbol, so it has to be written
+  // with cudaMemcpyToSymbol rather than a plain cudaMemcpy.
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  float *r_d, *r_h;
+  int xs, ys, zs, N;
+  int Nx, Ny, Nz;
+
+  N = 128;
+  Nx = Ny = Nz = 16;
+  xs = Ny*Nz*N;
+  ys = Nz*N;
+  zs = N;
+
+  float3 drInv;
+  drInv.x = 1.0/float(Nx);
+  drInv.y = 1.0/float(Ny);
+  drInv.z = 1.0/float(Nz);
+
+  // Setup Bspline coefficients
+  int size = Nx*Ny*Nz*N*sizeof(float);
+  if (posix_memalign((void**)&coefs, 16, size) != 0) {
+    fprintf (stderr, "Failed to allocate host memory for coefs.\n");
+    return NULL;
+  }
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+        for (int n=0; n<N; n++)
+          coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
+
+  fprintf (stderr, "Filled in coefs.\n");
+
+  fprintf (stderr, "size = %d\n", size);
+
+  // Setup CUDA coefficients
+  fprintf (stderr, "Before first CUDA mallocs.\n");
+  cudaMalloc((void**)&coefs_real_d, 2*size);
+  cudaMalloc((void**)&coefs_imag_d, 2*size);
+  fprintf (stderr, "Before Memcpy.\n");
+  cudaMemcpy(coefs_real_d, coefs, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(coefs_imag_d, coefs, size, cudaMemcpyHostToDevice);
+  fprintf (stderr, "After Memcpy.\n");  
+
+  // Setup device value storage.  Per walker and spline: 2 floats of value,
+  // 6 of gradient and 12 of Hessian, i.e. 20 = 2*10 floats.
+  int numVals = 2*N*numWalkers*10;
+  float *valBlock_d, *valBlock_h;
+  cudaMalloc((void**)&(valBlock_d),     numVals*sizeof(float));
+  cudaMallocHost((void**)&(valBlock_h), numVals*sizeof(float));
+  cudaMalloc((void**)&(vals_d), 2*numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(grads_d), 2*numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(hess_d), 2*numWalkers*sizeof(float*));
+  fprintf (stderr, "valBlock_d = %p\n", valBlock_d);
+  // Carve the single device block into value, gradient and Hessian regions.
+  for (int i=0; i<numWalkers; i++) {
+    vals[i]  = valBlock_d + 2*i*N;
+    grads[i] = valBlock_d + 2*N*numWalkers + 6*i*N;
+    hess[i]  = valBlock_d + 8*N*numWalkers + 12*i*N;
+  }
+  cudaMemcpy(vals_d,  vals,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy(grads_d, grads, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy(hess_d,  hess,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+
+  fprintf (stderr, "Finished cuda allocations.\n");
+
+  // Setup walker positions (four floats per walker; the fourth is unused).
+  cudaMalloc((void**)&(r_d),     4*numWalkers*sizeof(float));
+  cudaMallocHost((void**)&(r_h), 4*numWalkers*sizeof(float));
+
+  for (int ir=0; ir<numWalkers; ir++) {
+    r_h[4*ir+0] = 0.5*drand48();
+    r_h[4*ir+1] = 0.5*drand48();
+    r_h[4*ir+2] = 0.5*drand48();
+    r_h[4*ir+3] = 0.0;
+  }
+
+  uint3 strides;
+  strides.x = xs;
+  strides.y = ys;
+  strides.z = zs;
+
+  dim3 dimBlock(BLOCK_SIZE);
+  dim3 dimGrid(N/BLOCK_SIZE,numWalkers);
+
+  clock_t start, end;
+  double secPerEval;
+
+  // Value-only kernel.
+  start = clock();
+  for (int i=0; i<10000; i++) {
+    if ((i%1000) == 0) 
+      fprintf (stderr, "i = %d\n", i);
+    cudaMemcpy(r_d, r_h, 4*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
+    eval_multi_multi_UBspline_3d_c_cuda<<<dimGrid,dimBlock>>> 
+       (r_d, drInv, coefs_real_d, coefs_imag_d, 
+        vals_d, strides);
+  }
+  // Kernel launches are asynchronous: wait for the last one before
+  // stopping the clock.
+  cudaThreadSynchronize();
+  end = clock();
+  secPerEval = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
+  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/secPerEval);
+
+  // Value + gradient + Hessian kernel.
+  start = clock();
+  for (int i=0; i<10000; i++) {
+    if ((i%1000) == 0) 
+      fprintf (stderr, "i = %d\n", i);
+    cudaMemcpy(r_d, r_h, 4*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
+    eval_multi_multi_UBspline_3d_c_vgh_cuda<<<dimGrid,dimBlock>>> 
+       (r_d, drInv, coefs_real_d, coefs_imag_d, 
+        vals_d, grads_d, hess_d, strides);
+  }
+  cudaThreadSynchronize();
+  end = clock();
+  secPerEval = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
+  fprintf (stderr, "VGH evals per second = %1.8e\n", 1.0/secPerEval);
+
+  // Release everything that was allocated above.
+  cudaFree (valBlock_d);
+  cudaFreeHost (valBlock_h);
+  cudaFree (vals_d);
+  cudaFree (grads_d);
+  cudaFree (hess_d);
+  cudaFree (coefs_real_d);
+  cudaFree (coefs_imag_d);
+  cudaFree (r_d);
+  cudaFreeHost (r_h);
+  free (coefs);
+
+  return NULL;
+}
+#endif
+
+#ifndef NO_CUDA_MAIN
+
+// Entry point of the stand-alone test build: lists the properties of every
+// CUDA device on stderr, then runs test_multi_cuda on device 0.
+int
+main()
+{
+  int deviceCount;
+  cudaGetDeviceCount(&deviceCount);
+  fprintf (stderr, "Detected %d CUDA devices.\n", deviceCount);
+
+  for (int device = 0; device < deviceCount; ++device) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, device);
+    fprintf (stderr, "Device %d:\n", device);
+    // The three memory sizes are size_t, so they are cast and printed as
+    // unsigned long; the two counts are plain ints.
+    fprintf (stderr, "  Global memory:     %10lu\n",
+             (unsigned long)deviceProp.totalGlobalMem);
+    fprintf (stderr, "  MultiProcessors:   %10d\n",
+             deviceProp.multiProcessorCount);
+    fprintf (stderr, "  Registers:         %10d\n", 
+             deviceProp.regsPerBlock);
+    fprintf (stderr, "  Constant memory:   %10lu\n", 
+             (unsigned long)deviceProp.totalConstMem);
+    fprintf (stderr, "  Shared memory:     %10lu\n", 
+             (unsigned long)deviceProp.sharedMemPerBlock);
+  }
+
+  // The device ordinal is passed encoded in the pointer argument.
+  test_multi_cuda((void*)0);
+
+  return 0;
+}
+
+#endif
--- a/src/einspline/multi_bspline_cuda_c_impl.h
+++ b/src/einspline/multi_bspline_cuda_c_impl.h
@ -0,0 +1,924 @@
+#ifndef MULTI_BSPLINE_CUDA_C_IMPL_H
+#define MULTI_BSPLINE_CUDA_C_IMPL_H
+
+#include "multi_bspline.h"
+#include "multi_bspline_create_cuda.h"
+
+
+// 1D multi-spline value kernel.
+// Each thread block handles one entry of pos (selected by blockIdx.x)
+// and writes 2*N floats to vals[blockIdx.x]; the threads of the block
+// share the work in steps of SPLINE_BLOCK_SIZE.
+//   pos    : one coordinate per block
+//   drInv  : factor applied to the coordinate to get the grid position
+//   coefs  : coefficient array, addressed as coefs[cell*stride + n]
+//   vals   : per-block output pointers
+//   dim    : upper clamp for the cell index (index <= dim-1)
+//   stride : distance in floats between consecutive cells
+//   N      : 2*N floats are produced per block
+__global__ static void
+eval_multi_multi_UBspline_1d_c_kernel
+(float *pos, float drInv, const float *coefs, float **vals, 
+ uint dim, uint stride, int N)
+{
+  const int lane  = threadIdx.x;
+  const int point = blockIdx.x;
+
+  // Thread 0 fetches the per-block inputs once; the barrier publishes them.
+  __shared__ float *out;
+  __shared__ float x;
+  if (lane == 0) {
+    x   = pos[point];
+    out = vals[point];
+  }
+  __syncthreads();
+
+  // Split the scaled coordinate into a clamped cell index and the
+  // offset t from the (unclamped) floor.
+  const float  scaled = x * drInv;
+  const float  cell   = floor(scaled);
+  const int    index  = min(max(0,(int)cell), dim-1);
+  const float  t      = scaled - cell;
+  const float4 tp     = make_float4(t*t*t, t*t, t, 1.0);
+
+  // The first four lanes each evaluate one basis weight from a row of Acuda.
+  __shared__ float a[4];
+  if (lane < 4) 
+    a[lane] = Acuda[4*lane+0]*tp.x + Acuda[4*lane+1]*tp.y + Acuda[4*lane+2]*tp.z + Acuda[4*lane+3]*tp.w;
+  __syncthreads();
+
+  // Walk the 2*N outputs in steps of SPLINE_BLOCK_SIZE starting at this
+  // lane; this covers both the full chunks and the trailing partial one.
+  const float *base = coefs + index*stride;
+  const int stride2 = 2*stride;
+  const int stride3 = 3*stride;
+  for (int e = lane; e < 2*N; e += SPLINE_BLOCK_SIZE) {
+    const float *c = base + e;
+    out[e] = (a[0] * c[0] +
+	      a[1] * c[stride] +
+	      a[2] * c[stride2] +
+	      a[3] * c[stride3]);
+  }
+}
+
+// Host launcher for eval_multi_multi_UBspline_1d_c_kernel.
+// Starts `num` thread blocks of SPLINE_BLOCK_SIZE threads, passing the
+// spline's gridInv, coefs, dim, stride and num_splines to the kernel.
+// Blocks until the kernel has finished; on any CUDA error the message
+// is printed to stderr and the process is aborted.
+extern "C" void
+eval_multi_multi_UBspline_1d_c_cuda (const multi_UBspline_1d_c_cuda *spline,
+				     float *pos_d, float *vals_d[], int num)
+{
+  const dim3 threads(SPLINE_BLOCK_SIZE);
+  const dim3 blocks(num);
+
+  eval_multi_multi_UBspline_1d_c_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, (float*)spline->coefs, vals_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  // Wait for completion, then pick up whatever error was recorded.
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_c_cuda:\n  %s\n",
+	   cudaGetErrorString(status));
+  abort();
+}
+
+
+
+
+
+// 1D multi-spline kernel producing value, first derivative and second
+// derivative.  Each thread block handles one entry of pos (selected by
+// blockIdx.x) and writes 2*N floats to each of vals[ir], grads[ir] and
+// lapl[ir].
+//   pos    : one coordinate per block
+//   drInv  : factor applied to the coordinate to get the grid position
+//   coefs  : coefficient array, addressed as coefs[cell*stride + n]
+//   vals / grads / lapl : per-block output pointers
+//   dim    : upper clamp for the cell index (index <= dim-1)
+//   stride : distance in floats between consecutive cells
+//   N      : 2*N floats are produced per output array
+// NOTE(review): the derivative outputs are not multiplied by drInv or
+// drInv*drInv here, whereas the 3D vgh/vgl kernels in this file do scale
+// theirs -- confirm that callers apply the scaling.
+__global__ static void
+eval_multi_multi_UBspline_1d_c_vgl_kernel
+(float *pos, float drInv, const float *coefs, float **vals, 
+float **grads, float **lapl, 
+ uint dim, uint stride, int N)
+{
+  int tid   = threadIdx.x;
+  int ir    = blockIdx.x;
+
+  // Thread 0 fetches the per-block inputs once; the barrier publishes them.
+  __shared__ float *ourval, *ourgrad, *ourlapl;
+  __shared__ float r;
+  if (tid == 0) {
+    r = pos[ir];
+    ourval = vals[ir];
+    ourgrad = grads[ir];
+    ourlapl = lapl[ir];
+  }
+  __syncthreads();
+  
+  int index;
+  float t;
+  float s, sf;
+  float4 tp;
+
+  // Clamped cell index and offset t from the (unclamped) floor.
+  s = r * drInv;
+  sf = floor(s);
+  index = min(max(0,(int)sf), dim-1);
+  t = s - sf;
+  tp = make_float4(t*t*t, t*t, t, 1.0);
+
+  // Twelve weights from twelve rows of Acuda: a[0..3] are used for the
+  // value, a[4..7] for the gradient and a[8..11] for the Laplacian.
+  __shared__ float a[12];
+  if (tid < 12) 
+    a[tid] = Acuda[4*tid+0]*tp.x + Acuda[4*tid+1]*tp.y + Acuda[4*tid+2]*tp.z + Acuda[4*tid+3]*tp.w;
+  __syncthreads();
+
+  int numBlocks = 2*N / SPLINE_BLOCK_SIZE;
+  const float *c = coefs + index*stride + tid;
+  float *myval  = ourval + tid;
+  float *mygrad = ourgrad + tid;
+  float *mylapl = ourlapl + tid;
+  int stride2 = 2*stride;
+  int stride3 = 3*stride;
+  // Each thread stages its four coefficients in its own row; only
+  // columns 0..3 of the five are used.
+  __shared__ float coef[SPLINE_BLOCK_SIZE][5];
+  for (int block=0; block < numBlocks; block++) {
+    coef[tid][0] = c[0];
+    coef[tid][1] = c[stride];
+    coef[tid][2] = c[stride2];
+    coef[tid][3] = c[stride3];
+    *myval = (a[0] * coef[tid][0]   + a[1] * coef[tid][1] +
+	      a[2] * coef[tid][2]   + a[3] * coef[tid][3]);
+    *mygrad = (a[4] * coef[tid][0]  + a[5] * coef[tid][1] +
+	       a[6] * coef[tid][2]  + a[7] * coef[tid][3]);
+    *mylapl = (a[8] * coef[tid][0]  + a[9] * coef[tid][1] +
+	       a[10] * coef[tid][2] + a[11]* coef[tid][3]);
+    myval  += SPLINE_BLOCK_SIZE;    
+    mygrad += SPLINE_BLOCK_SIZE;    
+    mylapl += SPLINE_BLOCK_SIZE;    
+    c += SPLINE_BLOCK_SIZE;
+  }
+      
+  // Trailing partial chunk.  All three outputs must be produced here as
+  // well; writing only the value would leave the last 2*N % SPLINE_BLOCK_SIZE
+  // gradient and Laplacian entries untouched.
+  int remainder = 2*N - numBlocks*SPLINE_BLOCK_SIZE;
+  if (tid < remainder) {
+    float c0 = c[0];
+    float c1 = c[stride];
+    float c2 = c[stride2];
+    float c3 = c[stride3];
+    *myval  = (a[0] * c0 +
+	       a[1] * c1 +
+	       a[2] * c2 +
+	       a[3] * c3);
+    *mygrad = (a[4] * c0  + a[5] * c1 +
+	       a[6] * c2  + a[7] * c3);
+    *mylapl = (a[8] * c0  + a[9] * c1 +
+	       a[10] * c2 + a[11]* c3);
+  }
+}
+
+// Host launcher for eval_multi_multi_UBspline_1d_c_vgl_kernel.
+// Starts `num` thread blocks of SPLINE_BLOCK_SIZE threads, passing the
+// spline's gridInv, coefs, dim, stride and num_splines together with the
+// value, gradient and Laplacian output pointer arrays.
+// Blocks until the kernel has finished; on any CUDA error the message
+// is printed to stderr and the process is aborted.
+extern "C" void
+eval_multi_multi_UBspline_1d_c_vgl_cuda (const multi_UBspline_1d_c_cuda *spline,
+					 float *pos_d, float *vals_d[], 
+					 float *grads_d[], float *lapl_d[], int num)
+{
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(num);
+
+  eval_multi_multi_UBspline_1d_c_vgl_kernel<<<dimGrid,dimBlock>>>
+    (pos_d, spline->gridInv, (float*)spline->coefs, vals_d, grads_d, lapl_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // Name this function (not the value-only launcher) so a failure can
+    // be traced to the right call site.
+    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_c_vgl_cuda:\n  %s\n",
+	     cudaGetErrorString(err));
+    abort();
+  }
+}
+
+
+
+// 3D multi-spline value kernel.
+// blockIdx.y selects the position (three floats at pos[3*ir..3*ir+2]);
+// blockIdx.x selects a chunk of SPLINE_BLOCK_SIZE output floats, so each
+// thread produces the single output float vals[ir][off] when off < 2*N.
+//   drInv   : per-axis factor applied to the position
+//   coefs   : coefficient array addressed with strides.x/.y/.z plus off
+//   dim     : per-axis upper clamp for the cell index (index <= dim-1)
+//   N       : 2*N floats are produced per position
 __global__ static void
 eval_multi_multi_UBspline_3d_c_kernel 
 (float *pos, float3 drInv, const float *coefs, float *vals[], 
  uint3 dim, uint3 strides, int N)
 {
   int block = blockIdx.x;
   int thr   = threadIdx.x;
   int ir    = blockIdx.y;
+  // Index of the output float this thread is responsible for.
   int off   = block*SPLINE_BLOCK_SIZE+thr;
 
   __shared__ float *myval;
+  // Products a[i]*b[j]*c[k] for the 4x4x4 stencil, stored at 16*i+4*j+k.
   __shared__ float abc[64];
 
+  // Thread 0 fetches the position and output pointer once for the block.
   __shared__ float3 r;
   if (thr == 0) {
     r.x = pos[3*ir+0];
     r.y = pos[3*ir+1];
     r.z = pos[3*ir+2];
     myval = vals[ir];
   }
   __syncthreads();
   
   int3 index;
   float3 t;
   float s, sf;
   float4 tp[3];
 
+  // Per axis: scale, take the floor, clamp the cell index to [0, dim-1],
+  // and keep the offset from the (unclamped) floor in t.
   s = r.x * drInv.x;
   sf = floor(s);
   index.x = min(max(0,(int)sf), dim.x-1);
   //index.x = (int)sf;
   t.x = s - sf;
 
   s = r.y * drInv.y;
   sf = floor(s);
   index.y = min(max(0,(int)sf), dim.y-1);
   //index.y = (int)sf;
   t.y = s - sf;
 
   s = r.z * drInv.z;
   sf = floor(s);
   index.z = min(max(0,(int)sf), dim.z-1);
   //index.z = (int)sf;
   t.z = s - sf;
   
+  // Powers (t^3, t^2, t, 1) for each axis.
   tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
   tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
   tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
 
+  // Threads 0..3 each compute one weight per axis from a row of Acuda.
   __shared__ float a[4], b[4], c[4];
   if (thr < 4) {
     a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
     b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
     c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
   }
   __syncthreads();
 
+  // Decode the thread index into stencil indices (i,j,k), each in 0..3.
   int i = (thr>>4)&3;
   int j = (thr>>2)&3;
   int k = (thr & 3);
   
+  // Filling all 64 entries needs at least 64 threads in the block.
   if (thr < 64)
     abc[thr] = a[i]*b[j]*c[k];
   __syncthreads();
 
+  // Accumulate the 64-term weighted sum for this thread's output float;
+  // threads past the end of the output do nothing.
   if (off < 2*N) {
     float val = 0.0;
     for (int i=0; i<4; i++) {
       for (int j=0; j<4; j++) {
 	const float *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
 	for (int k=0; k<4; k++) 
 	  val += abc[16*i+4*j+k] * base[off+k*strides.z];
       }
     }
     myval[off] = val;
   }
 }
+
+
+
+// 3D multi-spline kernel producing value, gradient and Hessian.
+// blockIdx.y selects the position (three floats at pos[3*ir..3*ir+2]);
+// blockIdx.x selects a chunk of SPLINE_BLOCK_SIZE output floats.  For
+// each output index off < 2*N the kernel writes
+//   vals[ir][off]             : value
+//   grads[ir][3*off + 0..2]   : d/dx, d/dy, d/dz (scaled by drInv)
+//   hess[ir][6*off + 0..5]    : xx, xy, xz, yy, yz, zz (scaled by drInv products)
+// The gradient and Hessian are staged through shared memory so that the
+// global writes are indexed by thread within each SPLINE_BLOCK_SIZE run.
+// The staging uses abc[0 .. 6*blockDim.x-1], so it needs
+// 6*blockDim.x <= 640.
+__global__ static void
+eval_multi_multi_UBspline_3d_c_vgh_kernel 
+(float *pos, float3 drInv,  const float *coefs, 
+ float *vals[], float *grads[], float *hess[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  // Index of the output float this thread is responsible for.
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and output pointers once for the block.
+  __shared__ float *myval, *mygrad, *myhess;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: scale, take the floor, clamp the cell index to [0, dim-1],
+  // and keep the offset from the (unclamped) floor in t.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten tables of 64 stencil weights each (value, 3 first derivatives,
+  // 6 second derivatives); entry 16*i+4*j+k within each table.  Every
+  // thread writes the entry selected by its index modulo 64.
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  // n walks the weight tables in the same (i,j,k) order as the loops;
+  // the k loop is unrolled by hand into c0..c3.
+  int n = 0;
+  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < 2*N) {
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	const float *base = b0 + i*strides.x + j*strides.y;
+	float c0  = base[0*strides.z];
+	float c1  = base[1*strides.z];
+	float c2  = base[2*strides.z];
+	float c3  = base[3*strides.z];
+	v   += abc[n+  0]*c0 + abc[n+  1]*c1  + abc[n+  2]*c2  + abc[n+  3]*c3;
+	g0  += abc[n+ 64]*c0 + abc[n+ 65]*c1  + abc[n+ 66]*c2  + abc[n+ 67]*c3;
+	g1  += abc[n+128]*c0 + abc[n+129]*c1  + abc[n+130]*c2  + abc[n+131]*c3;
+	g2  += abc[n+192]*c0 + abc[n+193]*c1  + abc[n+194]*c2  + abc[n+195]*c3;
+	h00 += abc[n+256]*c0 + abc[n+257]*c1  + abc[n+258]*c2  + abc[n+259]*c3;
+	h01 += abc[n+320]*c0 + abc[n+321]*c1  + abc[n+322]*c2  + abc[n+323]*c3;
+	h02 += abc[n+384]*c0 + abc[n+385]*c1  + abc[n+386]*c2  + abc[n+387]*c3;
+	h11 += abc[n+448]*c0 + abc[n+449]*c1  + abc[n+450]*c2  + abc[n+451]*c3;
+	h12 += abc[n+512]*c0 + abc[n+513]*c1  + abc[n+514]*c2  + abc[n+515]*c3;
+	h22 += abc[n+576]*c0 + abc[n+577]*c1  + abc[n+578]*c2  + abc[n+579]*c3;
+	n += 4;
+      }
+    }
+    // Convert derivatives with respect to t into derivatives with
+    // respect to the position by applying the per-axis drInv factors.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+  
+    myval[off] = v;
+  }
+
+  // abc is reused below as a staging buffer.  Every thread must have
+  // finished reading the weight tables before any thread overwrites
+  // them; without this barrier a faster warp can clobber weights that a
+  // slower warp is still accumulating with.
+  __syncthreads();
+
+  // Stage gradients as 3 floats per thread, then write them out so that
+  // consecutive threads write consecutive addresses.
+  abc[3*thr+0] = g0; 
+  abc[3*thr+1] = g1; 
+  abc[3*thr+2] = g2; 
+  __syncthreads();
+  for (int i=0; i<3; i++) {
+    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 6*N)
+      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr]; 
+  }
+  __syncthreads();
+
+  // Write Hessians: 6 floats per thread, staged the same way.
+  abc[6*thr+0]  = h00;
+  abc[6*thr+1]  = h01;
+  abc[6*thr+2]  = h02;
+  abc[6*thr+3]  = h11;
+  abc[6*thr+4]  = h12;
+  abc[6*thr+5]  = h22;
+  __syncthreads();
+  for (int i=0; i<6; i++) {
+    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 12*N)
+      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
+  }
+}
+
+
+// Host launcher for eval_multi_multi_UBspline_3d_c_kernel.
+// The grid is (chunks, num): `chunks` is 2*num_splines divided by
+// SPLINE_BLOCK_SIZE, rounded up, and `num` is the number of positions.
+// The complex output pointers are handed to the kernel as float pointers.
+// Blocks until the kernel has finished; on any CUDA error the message
+// is printed to stderr and the process is aborted.
+extern "C" void
+eval_multi_multi_UBspline_3d_c_cuda (const multi_UBspline_3d_c_cuda *spline,
+				     float *pos_d, complex_float *vals_d[], int num)
+{
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // One extra block in x for a trailing partial chunk.
+  const bool partial = (2*spline->num_splines % SPLINE_BLOCK_SIZE);
+  if (partial)
+    blocks.x += 1;
+
+  eval_multi_multi_UBspline_3d_c_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, (float*)spline->coefs, (float**)vals_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  // Wait for completion, then pick up whatever error was recorded.
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_c_cuda:\n  %s\n",
+	   cudaGetErrorString(status));
+  abort();
+}
+
+// Host launcher for eval_multi_multi_UBspline_3d_c_vgh_kernel.
+// The grid is (chunks, num): `chunks` is 2*num_splines divided by
+// SPLINE_BLOCK_SIZE, rounded up, and `num` is the number of positions.
+// The complex value/gradient/Hessian pointers are handed to the kernel
+// as float pointers.
+// Blocks until the kernel has finished; on any CUDA error the message
+// is printed to stderr and the process is aborted.
+extern "C" void
+eval_multi_multi_UBspline_3d_c_vgh_cuda (const multi_UBspline_3d_c_cuda *spline,
+					 float *pos_d, complex_float *vals_d[], complex_float *grads_d[],
+					 complex_float *hess_d[], int num)
+{
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // One extra block in x for a trailing partial chunk.
+  const bool partial = ((2*spline->num_splines) % SPLINE_BLOCK_SIZE);
+  if (partial)
+    blocks.x += 1;
+
+  eval_multi_multi_UBspline_3d_c_vgh_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, (float*)spline->coefs,
+     (float**)vals_d, (float**)grads_d, (float**)hess_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  // Wait for completion, then pick up whatever error was recorded.
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_c_vgh_cuda:\n  %s\n",
+	   cudaGetErrorString(status));
+  abort();
+}
+
+
+// 3D multi-spline kernel producing value, transformed gradient and
+// Laplacian.
+// blockIdx.y selects the position (three floats at pos[3*ir..3*ir+2]);
+// blockIdx.x selects a chunk of SPLINE_BLOCK_SIZE output floats.  For
+// each output index off < 2*N the kernel writes
+//   vals[ir][off]                       : value
+//   grad_lapl[ir][off + 0*row_stride]   : first gradient component
+//   grad_lapl[ir][off + 2*row_stride]   : second gradient component
+//   grad_lapl[ir][off + 4*row_stride]   : third gradient component
+//   grad_lapl[ir][off + 6*row_stride]   : Laplacian
+// Linv supplies nine floats that are loaded into the 3x3 matrix G, which
+// is applied to the raw gradient and (through GGt) to the Hessian.
 __global__ static void
 eval_multi_multi_UBspline_3d_c_vgl_kernel 
 (float *pos, float3 drInv,  const float *coefs,  float Linv[],
  float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
  int N, int row_stride)
 {
   int block = blockIdx.x;
   int thr   = threadIdx.x;
   int ir    = blockIdx.y;
+  // Index of the output float this thread is responsible for.
   int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
 
+  // Thread 0 fetches the position and output pointers once for the block.
   __shared__ float *myval, *mygrad_lapl;
   __shared__ float3 r;
   if (thr == 0) {
     r.x = pos[3*ir+0];
     r.y = pos[3*ir+1];
     r.z = pos[3*ir+2];
     myval  = vals[ir];
     mygrad_lapl = grad_lapl[ir];
   }
   __syncthreads();
   
   int3 index;
   float3 t;
   float s, sf;
   float4 tp[3];
 
+  // Per axis: scale, take the floor, clamp the cell index to [0, dim-1],
+  // and keep the offset from the (unclamped) floor in t.
   s = r.x * drInv.x;
   sf = floor(s);
   index.x = min(max(0,(int)sf), dim.x-1);
   t.x = s - sf;
 
   s = r.y * drInv.y;
   sf = floor(s);
   index.y = min(max(0,(int)sf), dim.y-1);
   t.y = s - sf;
 
   s = r.z * drInv.z;
   sf = floor(s);
   index.z = min(max(0,(int)sf), dim.z-1);
   t.z = s - sf;
   
+  // Powers (t^3, t^2, t, 1) for each axis.
   tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
   tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
   tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
 
   // First 4 of a are value, second 4 are derivative, last four are
   // second derivative.
   __shared__ float a[12], b[12], c[12];
   if (thr < 12) {
     a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
     b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
     c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
   }
   __syncthreads();
 
+  // Ten tables of 64 stencil weights each (value, 3 first derivatives,
+  // 6 second derivatives); entry 16*i+4*j+k within each table.  Every
+  // thread writes the entry selected by its index modulo 64.
   __shared__ float abc[640];
   int i = (thr>>4)&3;
   int j = (thr>>2)&3;
   int k = (thr & 3);
 
   abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
   abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
   abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
   abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
   abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
   abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
   abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
   abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
   abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
   abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
 
   __syncthreads();
 
   float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
     h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
 
+  // n walks the weight tables in the same (i,j,k) order as the loops;
+  // the k loop is unrolled by hand into c0..c3.
   int n = 0;
   const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
   if (off < 2*N) {
     for (int i=0; i<4; i++) {
       for (int j=0; j<4; j++) {
 	const float *base = b0 + i*strides.x + j*strides.y;
 	float c0  = base[0*strides.z];
 	float c1  = base[1*strides.z];
 	float c2  = base[2*strides.z];
 	float c3  = base[3*strides.z];
 	v   += abc[n+  0]*c0 + abc[n+  1]*c1  + abc[n+  2]*c2  + abc[n+  3]*c3;
 	g0  += abc[n+ 64]*c0 + abc[n+ 65]*c1  + abc[n+ 66]*c2  + abc[n+ 67]*c3;
 	g1  += abc[n+128]*c0 + abc[n+129]*c1  + abc[n+130]*c2  + abc[n+131]*c3;
 	g2  += abc[n+192]*c0 + abc[n+193]*c1  + abc[n+194]*c2  + abc[n+195]*c3;
 	h00 += abc[n+256]*c0 + abc[n+257]*c1  + abc[n+258]*c2  + abc[n+259]*c3;
 	h01 += abc[n+320]*c0 + abc[n+321]*c1  + abc[n+322]*c2  + abc[n+323]*c3;
 	h02 += abc[n+384]*c0 + abc[n+385]*c1  + abc[n+386]*c2  + abc[n+387]*c3;
 	h11 += abc[n+448]*c0 + abc[n+449]*c1  + abc[n+450]*c2  + abc[n+451]*c3;
 	h12 += abc[n+512]*c0 + abc[n+513]*c1  + abc[n+514]*c2  + abc[n+515]*c3;
 	h22 += abc[n+576]*c0 + abc[n+577]*c1  + abc[n+578]*c2  + abc[n+579]*c3;
 	n += 4;
+	// Rolled form of the ten statements above, kept for reference:
 	// for (int k=0; k<4; k++) {
 	//   float c  = base[k*strides.z];
 	//   v   += abc[n+  0] * c;
 	//   g0  += abc[n+ 64] * c;
 	//   g1  += abc[n+128] * c;
 	//   g2  += abc[n+192] * c;
 	//   h00 += abc[n+256] * c;
 	//   h01 += abc[n+320] * c;
 	//   h02 += abc[n+384] * c;
 	//   h11 += abc[n+448] * c;
 	//   h12 += abc[n+512] * c;
 	//   h22 += abc[n+576] * c;
 	//   n += 1;
 	// }
       }
     }
+    // Apply the per-axis drInv factors to the first and second derivatives.
     g0 *= drInv.x; 
     g1 *= drInv.y; 
     g2 *= drInv.z; 
     
     h00 *= drInv.x * drInv.x;  
     h01 *= drInv.x * drInv.y;  
     h02 *= drInv.x * drInv.z;  
     h11 *= drInv.y * drInv.y;  
     h12 *= drInv.y * drInv.z;  
     h22 *= drInv.z * drInv.z;  
   
     
     //  __shared__ float buff[6*SPLINE_BLOCK_SIZE];
     // Note, we can reuse abc, by replacing buff with abc.
     myval[off] = v;
   }
 
+  // Threads 0..8 load Linv into G, element threadIdx.x going to
+  // G[threadIdx.x/3][threadIdx.x%3].
   __shared__ float G[3][3], GGt[3][3];
   int i0 = threadIdx.x/3;
   int i1 = threadIdx.x - 3*i0;
   if (threadIdx.x < 9) 
     G[i0][i1] = Linv[threadIdx.x];
   __syncthreads();
+  // GGt[i0][i1] is the sum over k of G[k][i0]*G[k][i1].
   if (threadIdx.x < 9)   
     GGt[i0][i1] = (G[0][i0]*G[0][i1] + 
 		   G[1][i0]*G[1][i1] + 
 		   G[2][i0]*G[2][i1]);
 
   __syncthreads();
   if (off < 2*N) {
     // Store gradients back to global memory
+    // (rows of G applied to (g0,g1,g2); results are 2*row_stride apart)
     mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
     mygrad_lapl[off+2*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
     mygrad_lapl[off+4*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
     
     // Store laplacians back to global memory
     // Hessian = H00 H01 H02 H11 H12 H22
     // Matrix = [0 1 2]
     //          [1 3 4]
     //          [2 4 5]
     // laplacian = Trace(GGt*Hessian)
     mygrad_lapl[off+6*row_stride] =
       (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
        GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
        GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
   }
 }
+
+
+// Host launcher for eval_multi_multi_UBspline_3d_c_vgl_kernel.
+// The grid is (chunks, num): `chunks` is 2*num_splines divided by
+// SPLINE_BLOCK_SIZE, rounded up, and `num` is the number of positions.
+// Linv_d and row_stride are forwarded to the kernel unchanged.
+// Blocks until the kernel has finished; on any CUDA error the message
+// is printed to stderr and the process is aborted.
+extern "C" void
+eval_multi_multi_UBspline_3d_c_vgl_cuda 
+(const multi_UBspline_3d_c_cuda *spline, float *pos_d, float *Linv_d, 
+ float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
+{
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // One extra block in x for a trailing partial chunk.
+  const bool partial = ((2*spline->num_splines) % SPLINE_BLOCK_SIZE);
+  if (partial)
+    blocks.x += 1;
+
+  eval_multi_multi_UBspline_3d_c_vgl_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, (float*)spline->coefs, Linv_d,
+     (float**)vals_d, (float**)grad_lapl_d,
+     spline->dim, spline->stride, spline->num_splines, row_stride);
+
+  // Wait for completion, then pick up whatever error was recorded.
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_c_vgl_cuda:\n  %s\n",
+	   cudaGetErrorString(status));
+  abort();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+/*
+
+
+__global__ static void
+eval_multi_multi_UBspline_3d_c_cuda (float *pos, float3 drInv, 
+				     float *coefs_real, float *coefs_imag,
+				     float *vals[], uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ float *myval;
+  __shared__ float abc[64];
+
+  // __shared__ float pos_s[SPLINE_BLOCK_SIZE];
+  // int ir1 = (ir >> 4)*64;
+  // int ir2 = (ir & 15)*4;
+  // pos_s[thr] = pos[ir1+thr];
+  // __syncthreads();
+  // float3 r;
+  // r.x = pos_s[ir2+0];
+  // r.y = pos_s[ir2+1];
+  // r.z = pos_s[ir2+2];
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[4*ir+0];
+    r.y = pos[4*ir+1];
+    r.z = pos[4*ir+2];
+    myval = vals[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+  
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  __shared__ float a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+  
+  abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+
+  float val_real = 0.0;
+  float val_imag = 0.0;
+  val_real = val_imag = 0.0;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      for (int k=0; k<4; k++) {
+  	val_real += abc[16*i+4*j+k] * base_real[off+k*strides.z];
+  	val_imag += abc[16*i+4*j+k] * base_imag[off+k*strides.z];
+      }
+    }
+  }
+  __shared__ float buff[2*SPLINE_BLOCK_SIZE];
+  buff[2*thr+0] = val_real;
+  buff[2*thr+1] = val_imag;
+  __syncthreads();
+  myval[off] = buff[thr];
+  myval[off+SPLINE_BLOCK_SIZE] = buff[thr+SPLINE_BLOCK_SIZE];
+}
+
+
+
+__global__ static void
+eval_multi_multi_UBspline_3d_c_vgh_cuda (float *pos, float3 drInv, 
+					 float *coefs_real, float *coefs_imag,
+					 float *vals[], float *grads[], 
+					 float *hess[], uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ float *myval, *mygrad, *myhess;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[4*ir+0];
+    r.y = pos[4*ir+1];
+    r.z = pos[4*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+  
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[10*(16*i+4*j+k)+0] = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[10*(16*i+4*j+k)+1] = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[10*(16*i+4*j+k)+2] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[10*(16*i+4*j+k)+3] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[10*(16*i+4*j+k)+4] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[10*(16*i+4*j+k)+5] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[10*(16*i+4*j+k)+6] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[10*(16*i+4*j+k)+7] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[10*(16*i+4*j+k)+8] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[10*(16*i+4*j+k)+9] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v_r = 0.0;
+  float v_i = 0.0;
+  float g0_r=0.0, g0_i=0.0, g1_r=0.0, g1_i=0.0, g2_r=0.0, g2_i=0.0, 
+    h00_r=0.0, h00_i=0.0, h01_r=0.0, h01_i=0.0, h02_r=0.0, h02_i=0.0, 
+    h11_r=0.0, h11_i=0.0, h12_r=0.0, h12_i=0.0, h22_r=0.0, h22_i=0.0;
+  int n = 0;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      float *base_real = coefs_real + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      float *base_imag = coefs_imag + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+
+      for (int k=0; k<4; k++) {
+	float cr = base_real[off+k*strides.z];
+	float ci = base_imag[off+k*strides.z];
+	v_r   += abc[n+0] * cr;  v_i   += abc[n+0] * ci;
+	g0_r  += abc[n+1] * cr;  g0_i  += abc[n+1] * ci;
+	g1_r  += abc[n+2] * cr;  g1_i  += abc[n+2] * ci;
+	g2_r  += abc[n+3] * cr;  g2_i  += abc[n+3] * ci;
+	h00_r += abc[n+4] * cr;  h00_i += abc[n+4] * ci;
+	h01_r += abc[n+5] * cr;  h01_i += abc[n+5] * ci;
+	h02_r += abc[n+6] * cr;  h02_i += abc[n+6] * ci;
+	h11_r += abc[n+7] * cr;  h11_i += abc[n+7] * ci;
+	h12_r += abc[n+8] * cr;  h12_i += abc[n+8] * ci;
+	h22_r += abc[n+9] * cr;  h22_i += abc[n+9] * ci; 
+	n += 10;
+      }
+    }
+  }
+  g0_r *= drInv.x; g0_i *= drInv.x;
+  g1_r *= drInv.y; g1_i *= drInv.y;
+  g2_r *= drInv.z; g2_i *= drInv.z;
+
+  h00_r *= drInv.x * drInv.x;  h00_i *= drInv.x * drInv.x;
+  h01_r *= drInv.x * drInv.y;  h01_i *= drInv.x * drInv.y;
+  h02_r *= drInv.x * drInv.z;  h02_i *= drInv.x * drInv.z;
+  h11_r *= drInv.y * drInv.y;  h11_i *= drInv.y * drInv.y;
+  h12_r *= drInv.y * drInv.z;  h12_i *= drInv.y * drInv.z;
+  h22_r *= drInv.z * drInv.z;  h22_i *= drInv.z * drInv.z;
+
+  
+  __shared__ float buff[6*SPLINE_BLOCK_SIZE];
+  // Note, we can reuse abc, by replacing buff with abc.
+  
+  buff[2*thr+0] = v_r;  buff[2*thr+1] = v_i;
+  __syncthreads();
+  myval[off] = buff[thr];    
+  myval[off+SPLINE_BLOCK_SIZE] = buff[thr+SPLINE_BLOCK_SIZE];
+
+  buff[6*thr+0] = g0_r;  buff[6*thr+1] = g0_i;
+  buff[6*thr+2] = g1_r;  buff[6*thr+3] = g1_i;
+  buff[6*thr+4] = g2_r;  buff[6*thr+5] = g2_i;
+  __syncthreads();
+  for (int i=0; i<6; i++) 
+    mygrad[(6*block+i)*SPLINE_BLOCK_SIZE+thr] = buff[i*SPLINE_BLOCK_SIZE+thr]; 
+  __syncthreads();
+
+  // Write first half of Hessians
+  if (thr < 32) {
+    buff[12*thr+0]  = h00_r;    buff[12*thr+1]  = h00_i;
+    buff[12*thr+2]  = h01_r;    buff[12*thr+3]  = h01_i;
+    buff[12*thr+4]  = h02_r;    buff[12*thr+5]  = h02_i;
+    buff[12*thr+6]  = h11_r;    buff[12*thr+7]  = h11_i;
+    buff[12*thr+8]  = h12_r;    buff[12*thr+9]  = h12_i;
+    buff[12*thr+10] = h22_r;    buff[12*thr+11] = h22_i;
+  }
+  __syncthreads();
+  if (thr < 32) 
+    for (int i=0; i<6; i++) 
+      myhess[(12*block+i)*SPLINE_BLOCK_SIZE+thr] = buff[i*SPLINE_BLOCK_SIZE+thr];
+
+  __syncthreads();
+  int th2 = thr-32;
+  if (thr >= 32) {
+    buff[12*th2+0]  = h00_r;    buff[12*th2+1]  = h00_i;
+    buff[12*th2+2]  = h01_r;    buff[12*th2+3]  = h01_i;
+    buff[12*th2+4]  = h02_r;    buff[12*th2+5]  = h02_i;
+    buff[12*th2+6]  = h11_r;    buff[12*th2+7]  = h11_i;
+    buff[12*th2+8]  = h12_r;    buff[12*th2+9]  = h12_i;
+    buff[12*th2+10] = h22_r;    buff[12*th2+11] = h22_i;
+  }
+  __syncthreads();
+  if (thr >= 32) {
+    for (int i=0; i<6; i++) 
+      myhess[(12*block+i+6)*SPLINE_BLOCK_SIZE+th2] = buff[i*SPLINE_BLOCK_SIZE+th2];
+  }
+}
+*/
+#endif
--- a/src/einspline/multi_bspline_cuda_d_impl.h
+++ b/src/einspline/multi_bspline_cuda_d_impl.h
@ -0,0 +1,453 @@
+#ifndef MULTI_BSPLINE_CUDA_D_IMPL_H
+#define MULTI_BSPLINE_CUDA_D_IMPL_H
+
+#include "multi_bspline.h"
+#include "multi_bspline_create_cuda.h"
+
+// Double-precision value-only evaluation of a multi-spline at a batch of
+// positions.  As read from the indexing below: blockIdx.y selects the
+// position (pos[3*ir .. 3*ir+2]) and the output array vals[ir], while
+// blockIdx.x and threadIdx.x together select the spline index `off`.
+//   pos     - packed x,y,z triples, one triple per blockIdx.y
+//   drInv   - per-axis factor that scales a position into grid units
+//   coefs   - coefficient array, addressed through `strides`
+//   vals    - vals[ir][off] receives the value of spline `off`
+//   dim     - dim-1 is the upper clamp of the integer grid index per axis
+//   strides - element strides applied to the x, y and z grid indices
+//   N       - number of splines; threads with off >= N store nothing
+// Bcuda and SPLINE_BLOCK_SIZE are defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_d_kernel 
+(double *pos, double3 drInv, const double *coefs, double *vals[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ double *myval;
+  __shared__ double abc[64];
+
+  // Thread 0 fetches the position and output pointer once for the block.
+  __shared__ double3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval = vals[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  double3 t;
+  double s, sf;
+  double4 tp[3];
+
+  // Per axis: scale to grid units, clamp the integer cell index to
+  // [0, dim-1], and keep the fractional part t = s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers of t per axis: (t^3, t^2, t, 1).
+  tp[0].x =t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
+  tp[1].x =t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
+  tp[2].x =t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;
+
+  // Threads 0..3 each form one 1D weight per axis: row `thr` of Bcuda
+  // dotted with the power vector.
+  __shared__ double a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
+    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
+    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Decompose the low six bits of thr into (i,j,k), each in 0..3.
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+  
+  // The first 64 threads fill the 4x4x4 product table
+  // abc[16*i+4*j+k] = a[i]*b[j]*c[k].
+  if (thr < 64)
+    abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+  // Each in-range thread accumulates the 64-term weighted sum for its spline.
+  if (off < N) {
+    double val = 0.0;
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	const double *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+	for (int k=0; k<4; k++) 
+	  val += abc[16*i+4*j+k] * base[off+k*strides.z];
+      }
+    }
+    myval[off] = val;
+  }
+}
+
+
+
+// Double-precision evaluation of value, gradient and Hessian of a
+// multi-spline at a batch of positions.  blockIdx.y selects the position and
+// the output arrays; blockIdx.x and threadIdx.x select the spline `off`.
+//   pos     - packed x,y,z triples, one triple per blockIdx.y
+//   drInv   - per-axis scale into grid units; also applied to derivatives
+//   coefs   - coefficient array, addressed through `strides`
+//   vals    - vals[ir][off] receives the value
+//   grads   - grads[ir] receives 3 gradient components per spline,
+//             interleaved (element 3*off+c), written only below index 3*N
+//   hess    - hess[ir] receives 6 components per spline in the order
+//             h00,h01,h02,h11,h12,h22 (element 6*off+c), below index 6*N
+//   dim     - dim-1 is the upper clamp of the integer grid index per axis
+//   strides - element strides applied to the x, y and z grid indices
+//   N       - number of splines
+// Bcuda and SPLINE_BLOCK_SIZE are defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_d_vgh_kernel 
+(double *pos, double3 drInv,  const double *coefs, 
+ double *vals[], double *grads[], double *hess[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and the three output pointers once.
+  __shared__ double *myval, *mygrad, *myhess;
+  __shared__ double3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  double3 t;
+  double s, sf;
+  double4 tp[3];
+
+  // Per axis: scale to grid units, clamp the cell index to [0, dim-1],
+  // and keep the fractional part t = s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers of t per axis: (t^3, t^2, t, 1).
+  tp[0].x =t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
+  tp[1].x =t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
+  tp[2].x =t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ double a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
+    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
+    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // abc holds ten 64-entry tables (one per output quantity), each indexed
+  // by 16*i+4*j+k.  Every thread writes the entry selected by the low six
+  // bits of its thread index.
+  __shared__ double abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  // Accumulators stay zero for threads with off >= N.
+  double v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  // n walks the 64 (i,j,k) entries in the same order as the loops below.
+  int n = 0;
+  const double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < N) {
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	const double *base = b0 + i*strides.x + j*strides.y;
+	for (int k=0; k<4; k++) {
+	  // This local `c` (one coefficient) shadows the shared array c[].
+	  double c  = base[k*strides.z];
+	  v   += abc[n+0] * c;
+	  g0  += abc[n+64] * c;
+	  g1  += abc[n+128] * c;
+	  g2  += abc[n+192] * c;
+	  h00 += abc[n+256] * c;
+	  h01 += abc[n+320] * c;
+	  h02 += abc[n+384] * c;
+	  h11 += abc[n+448] * c;
+	  h12 += abc[n+512] * c;
+	  h22 += abc[n+576] * c;
+	  n += 1;
+	}
+      }
+    }
+    // Scale derivatives taken in grid units by the per-axis factors.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+  
+    
+    //  __shared__ double buff[6*SPLINE_BLOCK_SIZE];
+    // Note, we can reuse abc, by replacing buff with abc.
+    myval[off] = v;
+  }
+  // Stage the gradients interleaved in shared memory (abc is reused as a
+  // scratch buffer), then store them so that thread `thr` writes element
+  // i*SPLINE_BLOCK_SIZE+thr of the block's slice.
+  abc[3*thr+0] = g0; 
+  abc[3*thr+1] = g1; 
+  abc[3*thr+2] = g2; 
+  __syncthreads();
+  for (int i=0; i<3; i++) {
+    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 3*N)
+      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr]; 
+  }
+  // All gradient reads from abc must finish before it is overwritten.
+  __syncthreads();
+
+  // Write Hessians
+  abc[6*thr+0]  = h00;
+  abc[6*thr+1]  = h01;
+  abc[6*thr+2]  = h02;
+  abc[6*thr+3]  = h11;
+  abc[6*thr+4]  = h12;
+  abc[6*thr+5]  = h22;
+  __syncthreads();
+  for (int i=0; i<6; i++) {
+    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 6*N)
+      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
+  }
+}
+
+
+// Host-side launcher for the double-precision value kernel.
+//   spline - device spline descriptor (gridInv, coefs, dim, stride,
+//            num_splines are read here)
+//   pos_d  - positions, passed through to the kernel
+//   vals_d - per-position output pointers, passed through to the kernel
+//   num    - number of positions; becomes the y extent of the grid
+// Waits for the kernel to finish and aborts the process on any CUDA error.
+extern "C" void
+eval_multi_multi_UBspline_3d_d_cuda (const multi_UBspline_3d_d_cuda *spline,
+                                     double *pos_d, double *vals_d[], int num)
+{
+  // One block per SPLINE_BLOCK_SIZE splines, rounded up so that a partial
+  // trailing block is still launched.
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE +
+              ((spline->num_splines % SPLINE_BLOCK_SIZE) != 0 ? 1 : 0),
+              num);
+
+  eval_multi_multi_UBspline_3d_d_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_d_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+// Host-side launcher for the double-precision value/gradient/Hessian kernel.
+//   spline  - device spline descriptor (gridInv, coefs, dim, stride,
+//             num_splines are read here)
+//   pos_d   - positions, passed through to the kernel
+//   vals_d, grads_d, hess_d - per-position output pointers
+//   num     - number of positions; becomes the y extent of the grid
+// Waits for the kernel to finish and aborts the process on any CUDA error.
+extern "C" void
+eval_multi_multi_UBspline_3d_d_vgh_cuda (const multi_UBspline_3d_d_cuda *spline,
+                                         double *pos_d, double *vals_d[], double *grads_d[],
+                                         double *hess_d[], int num)
+{
+  // Round the block count up so a partial trailing block is launched too.
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE +
+              ((spline->num_splines % SPLINE_BLOCK_SIZE) != 0 ? 1 : 0),
+              num);
+
+  eval_multi_multi_UBspline_3d_d_vgh_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, hess_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_d_vgh_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+
+// Double-precision evaluation of value, transformed gradient and Laplacian
+// of a multi-spline at a batch of positions.  blockIdx.y selects the
+// position and output arrays; blockIdx.x and threadIdx.x select the spline
+// index `off`.
+//   pos        - packed x,y,z triples, one triple per blockIdx.y
+//   drInv      - per-axis scale into grid units; also applied to derivatives
+//   coefs      - coefficient array, addressed through `strides`
+//   Linv       - 9 values loaded row-major into the 3x3 matrix G
+//   vals       - vals[ir][off] receives the value
+//   grad_lapl  - grad_lapl[ir][off + c*row_stride] receives the three
+//                components of G*g for c = 0..2 and the Laplacian for c = 3
+//   dim        - dim-1 is the upper clamp of the integer grid index per axis
+//   strides    - element strides applied to the x, y and z grid indices
+//   N          - number of splines; threads with off >= N store nothing
+//   row_stride - distance between consecutive rows of grad_lapl[ir]
+// Bcuda and SPLINE_BLOCK_SIZE are defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_d_vgl_kernel 
+(double *pos, double3 drInv,  double *coefs,  double Linv[],
+ double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
+ int N, int row_stride)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and the output pointers once per block.
+  __shared__ double *myval, *mygrad_lapl;
+  __shared__ double3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad_lapl = grad_lapl[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  double3 t;
+  double s, sf;
+  double4 tp[3];
+
+  // Per axis: scale to grid units, clamp the cell index to [0, dim-1],
+  // and keep the fractional part t = s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers of t per axis: (t^3, t^2, t, 1).
+  tp[0].x =t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
+  tp[1].x =t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
+  tp[2].x =t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;
+
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ double a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
+    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
+    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // abc holds ten 64-entry tables (one per output quantity), each indexed
+  // by 16*i+4*j+k.  Every thread writes the entry selected by the low six
+  // bits of its thread index.
+  __shared__ double abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  // Accumulators stay zero for threads with off >= N.
+  double v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  // n walks the 64 (i,j,k) entries in the same order as the loops below.
+  int n = 0;
+  double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < N) {
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	double *base = b0 + i*strides.x + j*strides.y;
+	for (int k=0; k<4; k++) {
+	  // This local `c` (one coefficient) shadows the shared array c[].
+	  double c  = base[k*strides.z];
+	  v   += abc[n+  0] * c;
+	  g0  += abc[n+ 64] * c;
+	  g1  += abc[n+128] * c;
+	  g2  += abc[n+192] * c;
+	  h00 += abc[n+256] * c;
+	  h01 += abc[n+320] * c;
+	  h02 += abc[n+384] * c;
+	  h11 += abc[n+448] * c;
+	  h12 += abc[n+512] * c;
+	  h22 += abc[n+576] * c;
+	  n += 1;
+	}
+      }
+    }
+    // Scale derivatives taken in grid units by the per-axis factors.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+  
+    
+    //  __shared__ double buff[6*SPLINE_BLOCK_SIZE];
+    // Note, we can reuse abc, by replacing buff with abc.
+    myval[off] = v;
+  }
+
+  // Threads 0..8 load G row-major from Linv, then form
+  // GGt[i0][i1] = sum_k G[k][i0]*G[k][i1].
+  // NOTE(review): as written this sum is the (i0,i1) entry of transpose(G)*G
+  // despite the name GGt -- confirm the intended convention with the caller.
+  __shared__ double G[3][3], GGt[3][3];
+  int i0 = threadIdx.x/3;
+  int i1 = threadIdx.x - 3*i0;
+  if (threadIdx.x < 9) 
+    G[i0][i1] = Linv[threadIdx.x];
+  __syncthreads();
+  if (threadIdx.x < 9)   
+    GGt[i0][i1] = (G[0][i0]*G[0][i1] + 
+		   G[1][i0]*G[1][i1] + 
+		   G[2][i0]*G[2][i1]);
+  __syncthreads();
+
+  if (off < N) {
+    // Store gradients back to global memory
+    mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
+    mygrad_lapl[off+1*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
+    mygrad_lapl[off+2*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
+    
+    // Store laplacians back to global memory
+    // Hessian = H00 H01 H02 H11 H12 H22
+    // Matrix = [0 1 2]
+    //          [1 3 4]
+    //          [2 4 5]
+    // laplacian = Trace(GGt*Hessian)
+    mygrad_lapl[off+3*row_stride] = 
+      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
+       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
+       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
+  }
+}
+
+
+// Host-side launcher for the double-precision value/gradient/Laplacian
+// kernel.
+//   spline      - device spline descriptor (gridInv, coefs, dim, stride,
+//                 num_splines are read here)
+//   pos_d       - positions, passed through to the kernel
+//   Linv_d      - 3x3 matrix data, passed through to the kernel
+//   vals_d, grad_lapl_d - per-position output pointers
+//   num         - number of positions; becomes the y extent of the grid
+//   row_stride  - row pitch of each grad_lapl_d[] array, passed through
+// Waits for the kernel to finish and aborts the process on any CUDA error.
+extern "C" void
+eval_multi_multi_UBspline_3d_d_vgl_cuda 
+(const multi_UBspline_3d_d_cuda *spline, double *pos_d, double *Linv_d, 
+ double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
+{
+  // Round the block count up so a partial trailing block is launched too.
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE +
+              ((spline->num_splines % SPLINE_BLOCK_SIZE) != 0 ? 1 : 0),
+              num);
+
+  eval_multi_multi_UBspline_3d_d_vgl_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
+     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_d_vgl_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+
+
+#endif
--- a/src/einspline/multi_bspline_cuda_s.cu
+++ b/src/einspline/multi_bspline_cuda_s.cu
@ -0,0 +1,594 @@
+#include "multi_bspline.h"
+#include "multi_bspline_create_cuda.h"
+
+#ifndef NO_CUDA_MAIN
+// 12x4 table of basis-polynomial coefficients in constant memory, read by
+// the single-precision kernels in this file as Acuda[4*row+col].  It is
+// uploaded by create_multi_UBspline_3d_s_cuda via cudaMemcpyToSymbol.
+__constant__ float Acuda[48];
+#endif
+
+// typedef struct
+// {
+//   float *coefs;
+//   uint3 stride;
+//   float3 gridInv;
+//   int num_splines;
+// } multi_UBspline_3d_s_cuda;
+
+#ifndef NO_CUDA_MAIN
+// Builds the device-side counterpart of a host single-precision multi-spline.
+//
+// Uploads the 12x4 basis table to the constant symbol Acuda, allocates a
+// device coefficient array whose innermost (spline) dimension is padded up
+// to a multiple of SPLINE_BLOCK_SIZE, and copies the host coefficients into
+// that padded layout.  Padding entries are zero.
+//
+//   spline - host spline; x/y/z_grid.num, x/y/z_grid.delta_inv,
+//            x/y/z_stride, num_splines and coefs are read.
+// Returns a malloc'ed descriptor (the caller owns it and its device `coefs`),
+// or NULL if a host allocation fails.
+multi_UBspline_3d_s_cuda*
+create_multi_UBspline_3d_s_cuda (multi_UBspline_3d_s* spline)
+{
+  // Rows 0-3 give the value weights, rows 4-7 the first-derivative weights
+  // and rows 8-11 the second-derivative weights, each applied to
+  // (t^3, t^2, t, 1).
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  // Allocate the descriptor itself.  sizeof(T*) here would reserve only a
+  // pointer's worth of bytes and every field store below would overrun it.
+  multi_UBspline_3d_s_cuda *cuda_spline =
+    (multi_UBspline_3d_s_cuda*) malloc (sizeof (multi_UBspline_3d_s_cuda));
+  if (cuda_spline == NULL) {
+    fprintf (stderr, "create_multi_UBspline_3d_s_cuda: out of host memory.\n");
+    return NULL;
+  }
+
+  cuda_spline->num_splines = spline->num_splines;
+
+  int Nx = spline->x_grid.num+3;
+  int Ny = spline->y_grid.num+3;
+  int Nz = spline->z_grid.num+3;
+
+  // Pad the spline count to a whole number of thread blocks.  The padding
+  // must follow SPLINE_BLOCK_SIZE rather than a literal 64, otherwise the
+  // result is not a multiple of the block size whenever the two differ.
+  int N = spline->num_splines;
+  if ((N%SPLINE_BLOCK_SIZE) != 0)
+    N += SPLINE_BLOCK_SIZE - (N%SPLINE_BLOCK_SIZE);
+  cuda_spline->stride.x = Ny*Nz*N;
+  cuda_spline->stride.y = Nz*N;
+  cuda_spline->stride.z = N;
+
+  cuda_spline->gridInv.x = spline->x_grid.delta_inv;
+  cuda_spline->gridInv.y = spline->y_grid.delta_inv;
+  cuda_spline->gridInv.z = spline->z_grid.delta_inv;
+
+  // Compute the element and byte counts in size_t so large grids do not
+  // overflow int arithmetic.
+  size_t count = (size_t)Nx*Ny*Nz*N;
+  size_t size  = count*sizeof(float);
+
+  cudaMalloc((void**)&(cuda_spline->coefs), size);
+
+  // calloc zero-fills, so the padding entries uploaded to the device are
+  // well defined instead of uninitialized heap contents.
+  float *spline_buff = (float*)calloc(count, sizeof(float));
+  if (spline_buff == NULL) {
+    fprintf (stderr, "create_multi_UBspline_3d_s_cuda: out of host memory.\n");
+    cudaFree (cuda_spline->coefs);
+    free (cuda_spline);
+    return NULL;
+  }
+
+  // Repack from the host strides into the padded device strides.
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+        for (int isp=0; isp<spline->num_splines; isp++) {
+          spline_buff[(size_t)ix*cuda_spline->stride.x +
+                      (size_t)iy*cuda_spline->stride.y +
+                      (size_t)iz*cuda_spline->stride.z + isp] =
+            spline->coefs[ix*spline->x_stride +
+                          iy*spline->y_stride +
+                          iz*spline->z_stride + isp];
+        }
+  cudaMemcpy(cuda_spline->coefs, spline_buff, size, cudaMemcpyHostToDevice);
+
+  // cudaMemcpy has returned, so the staging buffer is no longer needed.
+  free(spline_buff);
+
+  return cuda_spline;
+}
+#endif
+
+
+// Single-precision value-only evaluation of a multi-spline at a batch of
+// positions.  blockIdx.y selects the position (pos[3*ir .. 3*ir+2]) and the
+// output array vals[ir]; blockIdx.x and threadIdx.x select the spline `off`.
+//   pos     - packed x,y,z triples, one triple per blockIdx.y
+//   drInv   - per-axis factor that scales a position into grid units
+//   coefs   - coefficient array, addressed through `strides`
+//   vals    - vals[ir][off] receives the value of spline `off`
+//   strides - element strides applied to the x, y and z grid indices
+// There is no spline-count argument: every launched thread reads and writes,
+// and the grid index is not clamped.
+// Acuda and SPLINE_BLOCK_SIZE are defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_s_kernel 
+(float *pos, float3 drInv, const float *coefs, float *vals[], uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ float *myval;
+  __shared__ float abc[64];
+
+  // Thread 0 fetches the position and output pointer once for the block.
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval = vals[ir];
+  }
+  __syncthreads();
+
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: integer cell index and fractional offset within the cell.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+
+  // Powers of t per axis: (t^3, t^2, t, 1).
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // Threads 0..3 each form one 1D weight per axis: row `thr` of Acuda
+  // dotted with the power vector.
+  __shared__ float a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Decompose the low six bits of thr into (i,j,k), each in 0..3.
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  // The first 64 threads fill abc[16*i+4*j+k] = a[i]*b[j]*c[k].
+  if (thr < 64)
+    abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+
+  float val = 0.0;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      // coefs is const-qualified, so the pointer derived from it must be
+      // const as well; dropping the qualifier does not compile as C++.
+      const float *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      for (int k=0; k<4; k++)
+        val += abc[16*i+4*j+k] * base[off+k*strides.z];
+    }
+  }
+  myval[off] = val;
+}
+
+
+
+// Single-precision evaluation of value, gradient and Hessian of a
+// multi-spline at a batch of positions.  blockIdx.y selects the position and
+// the output arrays; blockIdx.x and threadIdx.x select the spline `off`.
+//   pos     - packed x,y,z triples, one triple per blockIdx.y
+//   drInv   - per-axis scale into grid units; also applied to derivatives
+//   coefs   - coefficient array, addressed through `strides`
+//   vals    - vals[ir][off] receives the value
+//   grads   - grads[ir] receives 3 gradient components per spline,
+//             interleaved (element 3*off+c)
+//   hess    - hess[ir] receives 6 components per spline in the order
+//             h00,h01,h02,h11,h12,h22 (element 6*off+c)
+//   strides - element strides applied to the x, y and z grid indices
+// There is no spline-count argument: every launched thread reads and writes,
+// and the grid index is not clamped.
+// Acuda and SPLINE_BLOCK_SIZE are defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_s_vgh_kernel 
+(float *pos, float3 drInv, const  float *coefs, 
+ float *vals[], float *grads[], float *hess[], uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and the three output pointers once.
+  __shared__ float *myval, *mygrad, *myhess;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: integer cell index and fractional offset within the cell.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+
+  // Powers of t per axis: (t^3, t^2, t, 1).
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.  The fourth column of Acuda is the constant term, so
+  // it multiplies tp[].w (= 1), not tp[].z (= t).
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // abc holds ten 64-entry tables (one per output quantity), each indexed
+  // by 16*i+4*j+k.  Every thread writes the entry selected by the low six
+  // bits of its thread index.
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0,
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  // n walks the 64 (i,j,k) entries in loop order.  The ten tables written
+  // above are 64 entries apart, so the reads use offsets of 64 and n
+  // advances by one per coefficient.
+  int n = 0;
+  // coefs is const-qualified, so pointers derived from it stay const.
+  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      const float *base = b0 + i*strides.x + j*strides.y;
+      for (int k=0; k<4; k++) {
+        // This local `c` (one coefficient) shadows the shared array c[].
+        float c  = base[k*strides.z];
+        v   += abc[n+  0] * c;
+        g0  += abc[n+ 64] * c;
+        g1  += abc[n+128] * c;
+        g2  += abc[n+192] * c;
+        h00 += abc[n+256] * c;
+        h01 += abc[n+320] * c;
+        h02 += abc[n+384] * c;
+        h11 += abc[n+448] * c;
+        h12 += abc[n+512] * c;
+        h22 += abc[n+576] * c;
+        n += 1;
+      }
+    }
+  }
+  // Scale derivatives taken in grid units by the per-axis factors.
+  g0 *= drInv.x;
+  g1 *= drInv.y;
+  g2 *= drInv.z;
+
+  h00 *= drInv.x * drInv.x;
+  h01 *= drInv.x * drInv.y;
+  h02 *= drInv.x * drInv.z;
+  h11 *= drInv.y * drInv.y;
+  h12 *= drInv.y * drInv.z;
+  h22 *= drInv.z * drInv.z;
+
+  myval[off] = v;
+
+  // abc is reused as a staging buffer: each thread deposits its results
+  // interleaved, then the block stores them so that thread `thr` writes
+  // element i*SPLINE_BLOCK_SIZE+thr of the block's slice.
+  abc[3*thr+0] = g0;
+  abc[3*thr+1] = g1;
+  abc[3*thr+2] = g2;
+  __syncthreads();
+  for (int i=0; i<3; i++)
+    mygrad[(3*block+i)*SPLINE_BLOCK_SIZE+thr] = abc[i*SPLINE_BLOCK_SIZE+thr];
+  // All gradient reads from abc must finish before it is overwritten.
+  __syncthreads();
+
+  // Write Hessians
+  abc[6*thr+0]  = h00;
+  abc[6*thr+1]  = h01;
+  abc[6*thr+2]  = h02;
+  abc[6*thr+3]  = h11;
+  abc[6*thr+4]  = h12;
+  abc[6*thr+5]  = h22;
+  __syncthreads();
+  for (int i=0; i<6; i++)
+    myhess[(6*block+i)*SPLINE_BLOCK_SIZE+thr] = abc[i*SPLINE_BLOCK_SIZE+thr];
+}
+
+
+// Host-side launcher for the single-precision value kernel.
+//   spline - device spline descriptor (gridInv, coefs, stride, num_splines
+//            are read here)
+//   pos_d  - positions, passed through to the kernel
+//   vals_d - per-position output pointers, passed through to the kernel
+//   num    - number of positions; becomes the y extent of the grid
+// The block count is num_splines/SPLINE_BLOCK_SIZE rounded down, and the
+// launch is neither synchronized nor error-checked here.
+extern "C" void
+eval_multi_multi_UBspline_3d_s_cuda (const multi_UBspline_3d_s_cuda *spline,
+                                     float *pos_d, float *vals_d[], int num)
+{
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  eval_multi_multi_UBspline_3d_s_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d, spline->stride);
+}
+
+
+
+
+// Self-test / benchmark that goes through the spline-object path: builds a
+// host multi-spline with random coefficients, mirrors it on the device with
+// create_multi_UBspline_3d_s_cuda, prints host and device values side by
+// side for each walker, then times the value kernel and the VGH kernel.
+// All output goes to stderr.
+void
+test_multi_cuda2()
+{
+  int numWalkers = 1000;
+  // Host-side tables of device addresses; uploaded to vals_d/grads_d/hess_d.
+  float *vals[numWalkers], *grads[numWalkers], *hess[numWalkers];
+  // These are ordinary host variables holding device addresses.  The
+  // __device__ memory-space specifier is not permitted on variables declared
+  // inside a host function, so none is used here.
+  float *coefs, **vals_d, **grads_d, **hess_d;
+  float *r_d, *r_h;
+  int xs, ys, zs, N;
+  int Nx, Ny, Nz;
+
+  N = 128;
+  Nx = Ny = Nz = 32;
+  xs = Ny*Nz*N;
+  ys = Nz*N;
+  zs = N;
+
+  // Setup Bspline coefficients
+  int size = Nx*Ny*Nz*N*sizeof(float);
+  posix_memalign((void**)&coefs, 16, size);
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+        for (int n=0; n<N; n++)
+          coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
+
+  Ugrid x_grid, y_grid, z_grid;
+  x_grid.start = 0.0; x_grid.end = 1.0; x_grid.num = Nx;
+  y_grid.start = 0.0; y_grid.end = 1.0; y_grid.num = Ny;
+  z_grid.start = 0.0; z_grid.end = 1.0; z_grid.num = Nz;
+  BCtype_s xBC, yBC, zBC;
+  xBC.lCode = xBC.rCode = PERIODIC;
+  yBC.lCode = yBC.rCode = PERIODIC;
+  zBC.lCode = zBC.rCode = PERIODIC;
+
+
+  multi_UBspline_3d_s *spline =
+    create_multi_UBspline_3d_s (x_grid, y_grid, z_grid, xBC, yBC, zBC, N);
+  for (int i=0; i<N; i++)
+    set_multi_UBspline_3d_s (spline, i, coefs);
+
+  multi_UBspline_3d_s_cuda *cudaspline =
+    create_multi_UBspline_3d_s_cuda (spline);
+
+  // Setup device value storage: one block of 10 floats per spline per
+  // walker (1 value + 3 gradient + 6 Hessian), carved up below.
+  int numVals = N*numWalkers*10;
+  float *valBlock_d, *valBlock_h;
+  cudaMalloc((void**)&(valBlock_d),     numVals*sizeof(float));
+  cudaMallocHost((void**)&(valBlock_h), numVals*sizeof(float));
+  cudaMalloc((void**)&(vals_d),  numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(grads_d), numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(hess_d),  numWalkers*sizeof(float*));
+  fprintf (stderr, "valBlock_d = %p\n", valBlock_d);
+  for (int i=0; i<numWalkers; i++) {
+    vals[i]  = valBlock_d + i*N;
+    grads[i] = valBlock_d + N*numWalkers + 3*i*N;
+    hess[i]  = valBlock_d + 4*N*numWalkers + 6*i*N;
+  }
+  cudaMemcpy(vals_d,  vals,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy(grads_d, grads, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy(hess_d,  hess,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  fprintf (stderr, "Finished cuda allocations.\n");
+
+  // Setup walker positions
+  cudaMalloc((void**)&(r_d),     3*numWalkers*sizeof(float));
+  cudaMallocHost((void**)&(r_h), 3*numWalkers*sizeof(float));
+
+  for (int ir=0; ir<numWalkers; ir++) {
+    r_h[3*ir+0] = 0.5*drand48();
+    r_h[3*ir+1] = 0.5*drand48();
+    r_h[3*ir+2] = 0.5*drand48();
+  }
+
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(N/SPLINE_BLOCK_SIZE,numWalkers);
+
+  float vals_host[N], vals_cuda[N];
+
+  // Check value: print the first spline's host and device result per walker.
+  for (int w=0; w<numWalkers; w++) {
+    eval_multi_UBspline_3d_s (spline, r_h[3*w+0], r_h[3*w+1], r_h[3*w+2], vals_host);
+    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
+    eval_multi_multi_UBspline_3d_s_kernel<<<dimGrid,dimBlock>>>
+      (r_d, cudaspline->gridInv, cudaspline->coefs, vals_d, cudaspline->stride);
+    cudaMemcpy(vals_cuda, valBlock_d+(N*w), N*sizeof(float), cudaMemcpyDeviceToHost);
+
+    fprintf (stderr, "%3i  %15.8e %15.8e\n", w, vals_host[0], vals_cuda[0]);
+  }
+
+
+  // Time the value-only kernel.
+  clock_t start, end;
+  start = clock();
+  for (int i=0; i<10000; i++) {
+    if ((i%1000) == 0)
+      fprintf (stderr, "i = %d\n", i);
+    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
+    eval_multi_multi_UBspline_3d_s_kernel<<<dimGrid,dimBlock>>>
+       (r_d, cudaspline->gridInv, cudaspline->coefs, vals_d, cudaspline->stride);
+  }
+  end = clock();
+  double time = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
+  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/time);
+
+  // Time the value/gradient/Hessian kernel.
+  start = clock();
+  for (int i=0; i<10000; i++) {
+    if ((i%1000) == 0)
+      fprintf (stderr, "i = %d\n", i);
+    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
+    eval_multi_multi_UBspline_3d_s_vgh_kernel<<<dimGrid,dimBlock>>>
+       (r_d, cudaspline->gridInv, cudaspline->coefs, vals_d, grads_d, hess_d, cudaspline->stride);
+  }
+  end = clock();
+  time = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
+  fprintf (stderr, "VGH Evals per second = %1.8e\n", 1.0/time);
+
+  // Release device memory.  The device coefficients belong to cudaspline;
+  // spline->coefs is the host spline's array and must not go to cudaFree.
+  cudaFree (cudaspline->coefs);
+  cudaFree (valBlock_d);
+  cudaFree (vals_d);
+  cudaFree (grads_d);
+  cudaFree (hess_d);
+  cudaFree (r_d);
+
+  // Release pinned and ordinary host memory allocated in this function.
+  cudaFreeHost (valBlock_h);
+  cudaFreeHost (r_h);
+  free (cudaspline);
+  free (coefs);
+}
+				    
+
+
+// Thread-entry style benchmark that drives the single-precision kernels
+// directly with raw device arrays (no spline object).
+//   thread - device ordinal smuggled through the void* argument; it is
+//            passed to cudaSetDevice.
+// Always returns NULL.  Progress and timings are written to stderr.
+static void *
+test_multi_cuda(void *thread)
+{
+  cudaSetDevice((int)(size_t)thread);
+  fprintf (stderr, "In thread %p\n", thread);
+
+  int numWalkers = 1000;
+  // Host-side tables of device addresses, plus host variables that hold
+  // device addresses.  The __device__ memory-space specifier is not
+  // permitted on variables declared inside a host function.
+  float *coefs  , *vals[numWalkers], *grads[numWalkers], *hess[numWalkers];
+  float *coefs_d, **vals_d, **grads_d, **hess_d;
+  float A_h[48] = { -1.0/6.0,  3.0/6.0, -3.0/6.0, 1.0/6.0,
+                     3.0/6.0, -6.0/6.0,  0.0/6.0, 4.0/6.0,
+                    -3.0/6.0,  3.0/6.0,  3.0/6.0, 1.0/6.0,
+                     1.0/6.0,  0.0/6.0,  0.0/6.0, 0.0/6.0,
+                         0.0,     -0.5,      1.0,    -0.5,
+                         0.0,      1.5,     -2.0,     0.0,
+                         0.0,     -1.5,      1.0,     0.5,
+                         0.0,      0.5,      0.0,     0.0,
+                         0.0,      0.0,     -1.0,     1.0,
+                         0.0,      0.0,      3.0,    -2.0,
+                         0.0,      0.0,     -3.0,     1.0,
+                         0.0,      0.0,      1.0,     0.0 };
+
+  // Upload A to the device constant symbol.  A __constant__ variable has to
+  // be written with cudaMemcpyToSymbol; plain cudaMemcpy expects a device
+  // pointer, which the symbol name is not on the host side.
+  cudaMemcpyToSymbol(Acuda, A_h, 48*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  float *r_d, *r_h;
+  int xs, ys, zs, N;
+  int Nx, Ny, Nz;
+
+  N = 128;
+  Nx = Ny = Nz = 32;
+  xs = Ny*Nz*N;
+  ys = Nz*N;
+  zs = N;
+
+  // NOTE(review): the spline-object test uses the grid's delta_inv here;
+  // 1/Nx keeps every position in the first cell -- confirm this is intended.
+  float3 drInv;
+  drInv.x = 1.0/float(Nx);
+  drInv.y = 1.0/float(Ny);
+  drInv.z = 1.0/float(Nz);
+
+  // Setup Bspline coefficients
+  int size = Nx*Ny*Nz*N*sizeof(float);
+  posix_memalign((void**)&coefs, 16, size);
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+        for (int n=0; n<N; n++)
+          coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
+
+
+  fprintf (stderr, "Filled in coefs.\n");
+  fprintf (stderr, "size = %d\n", size);
+
+  // Setup CUDA coefficients (the device buffer is twice the copied size).
+  cudaMalloc((void**)&coefs_d, 2*size);
+  cudaMemcpy(coefs_d, coefs, size, cudaMemcpyHostToDevice);
+
+  // Setup device value storage: one block of 10 floats per spline per
+  // walker (1 value + 3 gradient + 6 Hessian), carved up below.
+  int numVals = N*numWalkers*10;
+  float *valBlock_d, *valBlock_h;
+  cudaMalloc((void**)&(valBlock_d),     numVals*sizeof(float));
+  cudaMallocHost((void**)&(valBlock_h), numVals*sizeof(float));
+  cudaMalloc((void**)&(vals_d),  numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(grads_d), numWalkers*sizeof(float*));
+  cudaMalloc((void**)&(hess_d),  numWalkers*sizeof(float*));
+  fprintf (stderr, "valBlock_d = %p\n", valBlock_d);
+  for (int i=0; i<numWalkers; i++) {
+    vals[i]  = valBlock_d + i*N;
+    grads[i] = valBlock_d + N*numWalkers + 3*i*N;
+    hess[i]  = valBlock_d + 4*N*numWalkers + 6*i*N;
+  }
+  cudaMemcpy(vals_d,  vals,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy(grads_d, grads, numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy(hess_d,  hess,  numWalkers*sizeof(float*), cudaMemcpyHostToDevice);
+
+  fprintf (stderr, "Finished cuda allocations.\n");
+
+  // Setup walker positions.  The kernels launched below read positions as
+  // packed triples (pos[3*ir+0..2]), so the host array uses the same stride;
+  // a stride of 4 would hand them uninitialized padding as coordinates.
+  cudaMalloc((void**)&(r_d),     3*numWalkers*sizeof(float));
+  cudaMallocHost((void**)&(r_h), 3*numWalkers*sizeof(float));
+
+  for (int ir=0; ir<numWalkers; ir++) {
+    r_h[3*ir+0] = 0.5*drand48();
+    r_h[3*ir+1] = 0.5*drand48();
+    r_h[3*ir+2] = 0.5*drand48();
+  }
+
+  uint3 strides;
+  strides.x = xs;
+  strides.y = ys;
+  strides.z = zs;
+
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(N/SPLINE_BLOCK_SIZE,numWalkers);
+
+  clock_t start, end;
+
+  // Time the value-only kernel.
+  start = clock();
+  for (int i=0; i<10000; i++) {
+    if ((i%1000) == 0)
+      fprintf (stderr, "i = %d\n", i);
+    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
+    eval_multi_multi_UBspline_3d_s_kernel<<<dimGrid,dimBlock>>>
+       (r_d, drInv, coefs_d, vals_d, strides);
+  }
+  end = clock();
+  double time = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
+  // Label matches the kernel timed above (the two labels used to be swapped).
+  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/time);
+
+  // Time the value/gradient/Hessian kernel.
+  start = clock();
+  for (int i=0; i<10000; i++) {
+    if ((i%1000) == 0)
+      fprintf (stderr, "i = %d\n", i);
+    cudaMemcpy(r_d, r_h, 3*numWalkers*sizeof(float), cudaMemcpyHostToDevice);
+    eval_multi_multi_UBspline_3d_s_vgh_kernel<<<dimGrid,dimBlock>>>
+       (r_d, drInv, coefs_d, vals_d, grads_d, hess_d, strides);
+  }
+  end = clock();
+  time = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
+  fprintf (stderr, "VGH evals per second = %1.8e\n", 1.0/time);
+
+  // cudaFree (valBlock_d);
+  // cudaFree (vals_d);
+  // cudaFree (coefs_d);
+  // cudaFree (r_d);
+
+  return NULL;
+
+}
+
+
+#ifndef NO_CUDA_MAIN
+
+// Stand-alone driver: lists the properties of every CUDA device to stderr
+// and then runs test_multi_cuda2().  Returns 0.
+int
+main()
+{
+  int deviceCount;
+  cudaGetDeviceCount(&deviceCount);
+  fprintf (stderr, "Detected %d CUDA devices.\n", deviceCount);
+
+  // test_cuda();
+
+  for (int device = 0; device < deviceCount; ++device) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, device);
+    fprintf (stderr, "Device %d:\n", device);
+    // totalGlobalMem, totalConstMem and sharedMemPerBlock are size_t;
+    // printing them with %d is undefined where size_t is wider than int.
+    fprintf (stderr, "  Global memory:     %10lu\n",
+             (unsigned long)deviceProp.totalGlobalMem);
+    fprintf (stderr, "  MultiProcessors:   %10d\n",
+             deviceProp.multiProcessorCount);
+    fprintf (stderr, "  Registers:         %10d\n",
+             deviceProp.regsPerBlock);
+    fprintf (stderr, "  Constant memory:   %10lu\n",
+             (unsigned long)deviceProp.totalConstMem);
+    fprintf (stderr, "  Shared memory:     %10lu\n",
+             (unsigned long)deviceProp.sharedMemPerBlock);
+    fprintf (stderr, "  Clock rate:        %10d\n",
+             deviceProp.clockRate);
+
+  }
+
+  //  test_multi_cuda((void*)0);
+  test_multi_cuda2();
+  fprintf (stderr, "After frees.\n");
+  return 0;
+}
+
+#endif
--- a/src/einspline/multi_bspline_cuda_s.h
+++ b/src/einspline/multi_bspline_cuda_s.h
@ -0,0 +1,216 @@
+#ifndef MULTI_BSPLINE_CUDA_S_H
+#define MULTI_BSPLINE_CUDA_S_H
+
+#include "multi_bspline_structs_cuda.h"
+
+
+// Evaluates the values of one group of single-precision 3D B-splines at one
+// position.
+//
+// Launch layout, as implied by the index arithmetic below:
+//   blockIdx.y   selects the position; x,y,z are read from pos[4*ir+0..2]
+//   blockIdx.x   selects a group of SPLINE_BLOCK_SIZE consecutive splines
+//   threadIdx.x  selects the spline inside that group
+// The value of spline `off` is written to vals[ir][off].
+//
+// Parameters:
+//   pos      positions, four floats per position (the fourth is not read)
+//   drInv    factor applied to each coordinate before taking floor()
+//   coefs    coefficient array addressed as
+//            ix*strides.x + iy*strides.y + iz*strides.z + spline
+//   vals     one output pointer per position
+//   strides  element strides of coefs along x, y and z
+//
+// `A` is a coefficient table declared outside this block (presumably the
+// cubic B-spline basis matrix -- confirm).
+//
+// NOTE(review): the cell index is not clamped and `off` is not checked
+// against the number of splines; the caller has to guarantee both are in
+// range.
+__global__ static void
+eval_multi_multi_UBspline_3d_s_cuda (float *pos, float3 drInv, 
+				     const float *coefs, float *vals[], uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ float *myval;
+  __shared__ float abc[64];
+
+  // Thread 0 fetches the position and the output pointer for the whole block.
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[4*ir+0];
+    r.y = pos[4*ir+1];
+    r.z = pos[4*ir+2];
+    myval = vals[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Split every scaled coordinate into an integer cell index and the
+  // fractional offset inside that cell.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) of the fractional offsets.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // Four 1D weights per direction, computed by the first four threads.
+  __shared__ float a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = A[4*thr+0]*tp[0].x + A[4*thr+1]*tp[0].y + A[4*thr+2]*tp[0].z + A[4*thr+3]*tp[0].w;
+    b[thr] = A[4*thr+0]*tp[1].x + A[4*thr+1]*tp[1].y + A[4*thr+2]*tp[1].z + A[4*thr+3]*tp[1].w;
+    c[thr] = A[4*thr+0]*tp[2].x + A[4*thr+1]*tp[2].y + A[4*thr+2]*tp[2].z + A[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Tensor product of the 1D weights: abc[16*i+4*j+k] = a[i]*b[j]*c[k].
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+  
+  if (thr < 64)
+    abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+
+  // Contract the 4x4x4 weights with this thread's spline coefficients.
+  float val = 0.0;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      // coefs is read-only, so the derived pointer has to stay const as well.
+      const float *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+      for (int k=0; k<4; k++) 
+  	val += abc[16*i+4*j+k] * base[off+k*strides.z];
+    }
+  }
+  myval[off] = val;
+}
+
+
+
+// Evaluates value, gradient and Hessian of one group of single-precision 3D
+// B-splines at one position.
+//
+// Launch layout, as implied by the index arithmetic below:
+//   blockIdx.y   selects the position; x,y,z are read from pos[4*ir+0..2]
+//   blockIdx.x   selects a group of SPLINE_BLOCK_SIZE consecutive splines
+//   threadIdx.x  selects the spline inside that group
+//
+// Outputs for spline `off` of position `ir`:
+//   vals[ir][off]            value
+//   grads[ir][3*off + 0..2]  d/dx, d/dy, d/dz, each scaled by drInv
+//   hess[ir][6*off + 0..5]   h00, h01, h02, h11, h12, h22, scaled by the
+//                            matching products of drInv components
+//
+// `A` is a coefficient table declared outside this block; rows 0-3 are used
+// for the value weights, rows 4-7 for the first-derivative weights and rows
+// 8-11 for the second-derivative weights.
+//
+// NOTE(review): the cell index is not clamped and `off` is not checked
+// against the number of splines; the caller has to guarantee both are in
+// range.  The staging writes below also assume 6*SPLINE_BLOCK_SIZE <= 640.
+__global__ static void
+eval_multi_multi_UBspline_3d_s_vgh_cuda (float *pos, float3 drInv,  const float *coefs, 
+					 float *vals[], float *grads[], float *hess[],
+					 uint3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and the output pointers for the block.
+  __shared__ float *myval, *mygrad, *myhess;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[4*ir+0];
+    r.y = pos[4*ir+1];
+    r.z = pos[4*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Split every scaled coordinate into an integer cell index and the
+  // fractional offset inside that cell.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) of the fractional offsets.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.  The fourth column of A multiplies the constant
+  // term, i.e. the .w component of tp (not .z a second time).
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = A[4*thr+0]*tp[0].x + A[4*thr+1]*tp[0].y + A[4*thr+2]*tp[0].z + A[4*thr+3]*tp[0].w;
+    b[thr] = A[4*thr+0]*tp[1].x + A[4*thr+1]*tp[1].y + A[4*thr+2]*tp[1].z + A[4*thr+3]*tp[1].w;
+    c[thr] = A[4*thr+0]*tp[2].x + A[4*thr+1]*tp[2].y + A[4*thr+2]*tp[2].z + A[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten planes of 64 tensor-product weights each; plane m starts at 64*m.
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  // n walks the 64 (i,j,k) weights in the same order they were stored in,
+  // and the plane offsets (multiples of 64) match the writes above.
+  int n = 0;
+  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      const float *base = b0 + i*strides.x + j*strides.y;
+      for (int k=0; k<4; k++) {
+	float cf = base[k*strides.z];
+	v   += abc[n+  0] * cf;
+	g0  += abc[n+ 64] * cf;
+	g1  += abc[n+128] * cf;
+	g2  += abc[n+192] * cf;
+	h00 += abc[n+256] * cf;
+	h01 += abc[n+320] * cf;
+	h02 += abc[n+384] * cf;
+	h11 += abc[n+448] * cf;
+	h12 += abc[n+512] * cf;
+	h22 += abc[n+576] * cf;
+	n += 1;
+      }
+    }
+  }
+  // Convert derivatives with respect to the scaled coordinate into
+  // derivatives with respect to the input coordinate.
+  g0 *= drInv.x; 
+  g1 *= drInv.y; 
+  g2 *= drInv.z; 
+
+  h00 *= drInv.x * drInv.x;  
+  h01 *= drInv.x * drInv.y;  
+  h02 *= drInv.x * drInv.z;  
+  h11 *= drInv.y * drInv.y;  
+  h12 *= drInv.y * drInv.z;  
+  h22 *= drInv.z * drInv.z;  
+
+  myval[off] = v;
+
+  // abc is reused as a staging buffer from here on.  Every thread must have
+  // finished reading the weights before any thread overwrites them.
+  __syncthreads();
+  abc[3*thr+0] = g0; 
+  abc[3*thr+1] = g1; 
+  abc[3*thr+2] = g2; 
+  __syncthreads();
+  // Copy the staged gradients out with consecutive threads writing
+  // consecutive addresses.
+  for (int i=0; i<3; i++) 
+    mygrad[(3*block+i)*SPLINE_BLOCK_SIZE+thr] = abc[i*SPLINE_BLOCK_SIZE+thr]; 
+  __syncthreads();
+
+  // Stage and write the Hessians the same way.
+  abc[6*thr+0]  = h00;
+  abc[6*thr+1]  = h01;
+  abc[6*thr+2]  = h02;
+  abc[6*thr+3]  = h11;
+  abc[6*thr+4]  = h12;
+  abc[6*thr+5]  = h22;
+  __syncthreads();
+  for (int i=0; i<6; i++) 
+    myhess[(6*block+i)*SPLINE_BLOCK_SIZE+thr] = abc[i*SPLINE_BLOCK_SIZE+thr];
+}
+
+
+#endif
--- a/src/einspline/multi_bspline_cuda_s_impl.h
+++ b/src/einspline/multi_bspline_cuda_s_impl.h
@ -0,0 +1,938 @@
+#ifndef MULTI_BSPLINE_CUDA_S_IMPL_H
+#define MULTI_BSPLINE_CUDA_S_IMPL_H
+
+//#include <stdio.h>
+#include "multi_bspline.h"
+#include "multi_bspline_create_cuda.h"
+
+
+// Value-only evaluation of N single-precision 1D B-splines.
+//
+// One thread block handles one position (blockIdx.x).  Thread `lane` of the
+// block produces the splines lane, lane+SPLINE_BLOCK_SIZE, ... and stores
+// each result in vals[blockIdx.x][spline].
+//
+//   pos     one coordinate per position
+//   drInv   factor applied to the coordinate before taking floor()
+//   coefs   coefficients addressed as cell*stride + spline
+//   vals    one output pointer per position
+//   dim     used to clamp the cell index to [0, dim-1]
+//   stride  element distance between consecutive cells in coefs
+//   N       number of splines
+__global__ static void
+eval_multi_multi_UBspline_1d_s_kernel
+(float *pos, float drInv, const float *coefs, float **vals, 
+ uint dim, uint stride, int N)
+{
+  const int lane   = threadIdx.x;
+  const int walker = blockIdx.x;
+
+  __shared__ float *out_base;
+  __shared__ float x;
+
+  // A single thread fetches the per-position data for the whole block.
+  if (lane == 0) {
+    x = pos[walker];
+    out_base = vals[walker];
+  }
+  __syncthreads();
+
+  // Clamped cell index and the offset of the scaled coordinate from its floor.
+  const float scaled = x * drInv;
+  const float whole  = floor(scaled);
+  const int   index  = min(max(0,(int)whole), dim-1);
+  const float t      = scaled - whole;
+  const float4 tp    = make_float4(t*t*t, t*t, t, 1.0);
+
+  // Four basis weights shared by every spline at this position.
+  __shared__ float a[4];
+  if (lane < 4) 
+    a[lane] = Acuda[4*lane+0]*tp.x + Acuda[4*lane+1]*tp.y + Acuda[4*lane+2]*tp.z + Acuda[4*lane+3]*tp.w;
+  __syncthreads();
+
+  const int fullGroups = N / SPLINE_BLOCK_SIZE;
+  const int stride2 = 2*stride;
+  const int stride3 = 3*stride;
+  const float *column = coefs + index*stride + lane;
+  float *out = out_base + lane;
+
+  // Complete groups of SPLINE_BLOCK_SIZE splines.
+  for (int g = 0; g < fullGroups; g++) {
+    const float *c = column + g*SPLINE_BLOCK_SIZE;
+    out[g*SPLINE_BLOCK_SIZE] = (a[0] * c[0] +
+                                a[1] * c[stride] +
+                                a[2] * c[stride2] +
+                                a[3] * c[stride3]);
+  }
+
+  // Trailing partial group.
+  const int leftover = N - fullGroups*SPLINE_BLOCK_SIZE;
+  if (lane < leftover) {
+    const float *c = column + fullGroups*SPLINE_BLOCK_SIZE;
+    out[fullGroups*SPLINE_BLOCK_SIZE] = (a[0] * c[0] +
+                                         a[1] * c[stride] +
+                                         a[2] * c[stride2] +
+                                         a[3] * c[stride3]);
+  }
+}
+
+// Host entry point: evaluates every spline of `spline` at `num` positions.
+// Launches one thread block of SPLINE_BLOCK_SIZE threads per position,
+// waits for the kernel to finish and aborts the process if the CUDA runtime
+// reports an error.
+//
+//   pos_d   device array with one coordinate per position
+//   vals_d  device array with one output pointer per position
+//   num     number of positions
+extern "C" void
+eval_multi_multi_UBspline_1d_s_cuda (const multi_UBspline_1d_s_cuda *spline,
+				     float *pos_d, float *vals_d[], int num)
+{
+  const dim3 threads(SPLINE_BLOCK_SIZE);
+  const dim3 blocks(num);
+
+  eval_multi_multi_UBspline_1d_s_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_s_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+
+
+
+
+// Evaluates value, first derivative and second derivative of N
+// single-precision 1D B-splines.
+//
+// One thread block handles one position (blockIdx.x).  Thread `tid`
+// produces the splines tid, tid+SPLINE_BLOCK_SIZE, ... and writes
+// vals[ir][spline], grads[ir][spline] and lapl[ir][spline].
+//
+//   pos     one coordinate per position
+//   drInv   factor applied to the coordinate before taking floor()
+//   coefs   coefficients addressed as cell*stride + spline
+//   dim     used to clamp the cell index to [0, dim-1]
+//   stride  element distance between consecutive cells in coefs
+//   N       number of splines
+//
+// Rows 0-3 of Acuda give the value weights, rows 4-7 the first-derivative
+// weights and rows 8-11 the second-derivative weights.
+//
+// NOTE(review): unlike the 3D vgh kernel, the derivatives are not multiplied
+// by drInv here -- confirm whether the caller applies that scaling.
+__global__ static void
+eval_multi_multi_UBspline_1d_s_vgl_kernel
+(float *pos, float drInv, const float *coefs, float **vals, 
+float **grads, float **lapl, 
+ uint dim, uint stride, int N)
+{
+  int tid   = threadIdx.x;
+  int ir    = blockIdx.x;
+
+  // Thread 0 fetches the position and the output pointers for the block.
+  __shared__ float *ourval, *ourgrad, *ourlapl;
+  __shared__ float r;
+  if (tid == 0) {
+    r = pos[ir];
+    ourval = vals[ir];
+    ourgrad = grads[ir];
+    ourlapl = lapl[ir];
+  }
+  __syncthreads();
+  
+  int index;
+  float t;
+  float s, sf;
+  float4 tp;
+
+  // Clamped cell index and the offset of the scaled coordinate from its floor.
+  s = r * drInv;
+  sf = floor(s);
+  index = min(max(0,(int)sf), dim-1);
+  t = s - sf;
+  tp = make_float4(t*t*t, t*t, t, 1.0);
+
+  __shared__ float a[12];
+  if (tid < 12) 
+    a[tid] = Acuda[4*tid+0]*tp.x + Acuda[4*tid+1]*tp.y + Acuda[4*tid+2]*tp.z + Acuda[4*tid+3]*tp.w;
+  __syncthreads();
+
+  int numBlocks = N / SPLINE_BLOCK_SIZE;
+  const float *c = coefs + index*stride + tid;
+  float *myval  = ourval + tid;
+  float *mygrad = ourgrad + tid;
+  float *mylapl = ourlapl + tid;
+  int stride2 = 2*stride;
+  int stride3 = 3*stride;
+  // Each thread only ever touches its own row of coef.  The row width of 5
+  // is presumably padding against shared-memory bank conflicts -- confirm.
+  __shared__ float coef[SPLINE_BLOCK_SIZE][5];
+  for (int block=0; block < numBlocks; block++) {
+    coef[tid][0] = c[0];
+    coef[tid][1] = c[stride];
+    coef[tid][2] = c[stride2];
+    coef[tid][3] = c[stride3];
+    *myval = (a[0] * coef[tid][0]   + a[1] * coef[tid][1] +
+	      a[2] * coef[tid][2]   + a[3] * coef[tid][3]);
+    *mygrad = (a[4] * coef[tid][0]  + a[5] * coef[tid][1] +
+	       a[6] * coef[tid][2]  + a[7] * coef[tid][3]);
+    *mylapl = (a[8] * coef[tid][0]  + a[9] * coef[tid][1] +
+	       a[10] * coef[tid][2] + a[11]* coef[tid][3]);
+    myval  += SPLINE_BLOCK_SIZE;    
+    mygrad += SPLINE_BLOCK_SIZE;    
+    mylapl += SPLINE_BLOCK_SIZE;    
+    c += SPLINE_BLOCK_SIZE;
+  }
+      
+  // Trailing partial group: all three outputs have to be produced here as
+  // well, otherwise the gradient and Laplacian of the last
+  // N % SPLINE_BLOCK_SIZE splines are never written.
+  int remainder = N - numBlocks*SPLINE_BLOCK_SIZE;
+  if (tid < remainder) {
+    coef[tid][0] = c[0];
+    coef[tid][1] = c[stride];
+    coef[tid][2] = c[stride2];
+    coef[tid][3] = c[stride3];
+    *myval = (a[0] * coef[tid][0]   + a[1] * coef[tid][1] +
+	      a[2] * coef[tid][2]   + a[3] * coef[tid][3]);
+    *mygrad = (a[4] * coef[tid][0]  + a[5] * coef[tid][1] +
+	       a[6] * coef[tid][2]  + a[7] * coef[tid][3]);
+    *mylapl = (a[8] * coef[tid][0]  + a[9] * coef[tid][1] +
+	       a[10] * coef[tid][2] + a[11]* coef[tid][3]);
+  }
+}
+
+// Host entry point: evaluates value, gradient and Laplacian of every spline
+// of `spline` at `num` positions.  Launches one thread block of
+// SPLINE_BLOCK_SIZE threads per position, waits for the kernel to finish and
+// aborts the process if the CUDA runtime reports an error.
+//
+//   pos_d    device array with one coordinate per position
+//   vals_d   device array with one value-output pointer per position
+//   grads_d  device array with one gradient-output pointer per position
+//   lapl_d   device array with one Laplacian-output pointer per position
+//   num      number of positions
+extern "C" void
+eval_multi_multi_UBspline_1d_s_vgl_cuda (const multi_UBspline_1d_s_cuda *spline,
+					 float *pos_d, float *vals_d[], 
+					 float *grads_d[], float *lapl_d[], int num)
+{
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(num);
+
+  eval_multi_multi_UBspline_1d_s_vgl_kernel<<<dimGrid,dimBlock>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, lapl_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // Report this function's own name so a failure is attributed to the
+    // vgl path rather than to the value-only wrapper.
+    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_1d_s_vgl_cuda:\n  %s\n",
+	     cudaGetErrorString(err));
+    abort();
+  }
+}
+
+
+
+// Value-only evaluation of one group of single-precision 3D B-splines at one
+// position.
+//
+//   blockIdx.y   selects the position; x,y,z are read from pos[3*p+0..2]
+//   blockIdx.x   selects a group of SPLINE_BLOCK_SIZE consecutive splines
+//   threadIdx.x  selects the spline inside that group
+// Threads whose spline number is >= N do not read coefficients or write
+// output.  The result for spline s is stored in vals[p][s].
+//
+//   drInv    factor applied to each coordinate before taking floor()
+//   coefs    coefficients addressed as
+//            ix*strides.x + iy*strides.y + iz*strides.z + spline
+//   dim      used to clamp each cell index to [0, dim-1]
+//   strides  element strides of coefs along x, y and z
+//   N        number of splines
+__global__ static void
+eval_multi_multi_UBspline_3d_s_kernel 
+(float *pos, float3 drInv, const float *coefs, float *vals[], 
+ uint3 dim, uint3 strides, int N)
+{
+  const int group  = blockIdx.x;
+  const int lane   = threadIdx.x;
+  const int walker = blockIdx.y;
+  const int spline = group*SPLINE_BLOCK_SIZE + lane;
+
+  __shared__ float *out;
+  __shared__ float weight[64];
+  __shared__ float3 point;
+
+  // A single thread fetches the per-position data for the whole block.
+  if (lane == 0) {
+    point.x = pos[3*walker+0];
+    point.y = pos[3*walker+1];
+    point.z = pos[3*walker+2];
+    out = vals[walker];
+  }
+  __syncthreads();
+
+  // Clamped cell index and offset-from-floor for each direction.
+  int3 cell;
+  float3 frac;
+
+  const float sx = point.x * drInv.x;
+  const float fx = floor(sx);
+  cell.x = min(max(0,(int)fx), dim.x-1);
+  frac.x = sx - fx;
+
+  const float sy = point.y * drInv.y;
+  const float fy = floor(sy);
+  cell.y = min(max(0,(int)fy), dim.y-1);
+  frac.y = sy - fy;
+
+  const float sz = point.z * drInv.z;
+  const float fz = floor(sz);
+  cell.z = min(max(0,(int)fz), dim.z-1);
+  frac.z = sz - fz;
+
+  // Powers (t^3, t^2, t, 1) of the offsets.
+  const float4 px = make_float4(frac.x*frac.x*frac.x, frac.x*frac.x, frac.x, 1.0);
+  const float4 py = make_float4(frac.y*frac.y*frac.y, frac.y*frac.y, frac.y, 1.0);
+  const float4 pz = make_float4(frac.z*frac.z*frac.z, frac.z*frac.z, frac.z, 1.0);
+
+  // Four 1D weights per direction, computed by the first four threads.
+  __shared__ float wx[4], wy[4], wz[4];
+  if (lane < 4) {
+    wx[lane] = Acuda[4*lane+0]*px.x + Acuda[4*lane+1]*px.y + Acuda[4*lane+2]*px.z + Acuda[4*lane+3]*px.w;
+    wy[lane] = Acuda[4*lane+0]*py.x + Acuda[4*lane+1]*py.y + Acuda[4*lane+2]*py.z + Acuda[4*lane+3]*py.w;
+    wz[lane] = Acuda[4*lane+0]*pz.x + Acuda[4*lane+1]*pz.y + Acuda[4*lane+2]*pz.z + Acuda[4*lane+3]*pz.w;
+  }
+  __syncthreads();
+
+  // Tensor product: weight[16*i+4*j+k] = wx[i]*wy[j]*wz[k].
+  if (lane < 64) {
+    const int i = (lane>>4)&3;
+    const int j = (lane>>2)&3;
+    const int k = (lane & 3);
+    weight[lane] = wx[i]*wy[j]*wz[k];
+  }
+  __syncthreads();
+
+  // No barriers follow, so threads without a spline can leave here.
+  if (spline >= N)
+    return;
+
+  float sum = 0.0;
+  for (unsigned i=0; i<4; i++) {
+    const float *plane = coefs + (cell.x+i)*strides.x + (cell.y)*strides.y + cell.z*strides.z + spline;
+    for (unsigned j=0; j<4; j++) {
+      const float *row = plane + j*strides.y;
+      for (unsigned k=0; k<4; k++)
+        sum += weight[16*i+4*j+k] * row[k*strides.z];
+    }
+  }
+  out[spline] = sum;
+}
+
+
+
+// Value-only evaluation of one group of single-precision 3D B-splines at one
+// position, with every result multiplied by a per-position factor.
+//
+//   blockIdx.y   selects the position; x,y,z are read from pos[3*p+0..2]
+//                and the factor from sign[p]
+//   blockIdx.x   selects a group of SPLINE_BLOCK_SIZE consecutive splines
+//   threadIdx.x  selects the spline inside that group
+// Threads whose spline number is >= N do not read coefficients or write
+// output.  The result for spline s is stored in vals[p][s].
+//
+//   drInv    factor applied to each coordinate before taking floor()
+//   coefs    coefficients addressed as
+//            ix*strides.x + iy*strides.y + iz*strides.z + spline
+//   dim      used to clamp each cell index to [0, dim-1]
+//   strides  element strides of coefs along x, y and z
+//   N        number of splines
+__global__ static void
+eval_multi_multi_UBspline_3d_s_sign_kernel 
+(float *pos, float *sign, float3 drInv, const float *coefs, float *vals[], 
+ uint3 dim, uint3 strides, int N)
+{
+  const int group  = blockIdx.x;
+  const int lane   = threadIdx.x;
+  const int walker = blockIdx.y;
+  const int spline = group*SPLINE_BLOCK_SIZE + lane;
+
+  __shared__ float *out;
+  __shared__ float weight[64];
+  __shared__ float factor;
+  __shared__ float3 point;
+
+  // A single thread fetches the per-position data for the whole block.
+  if (lane == 0) {
+    point.x = pos[3*walker+0];
+    point.y = pos[3*walker+1];
+    point.z = pos[3*walker+2];
+    out = vals[walker];
+    factor = sign[walker];
+  }
+  __syncthreads();
+
+  // Clamped cell index and offset-from-floor for each direction.
+  int3 cell;
+  float3 frac;
+
+  const float sx = point.x * drInv.x;
+  const float fx = floor(sx);
+  cell.x = min(max(0,(int)fx), dim.x-1);
+  frac.x = sx - fx;
+
+  const float sy = point.y * drInv.y;
+  const float fy = floor(sy);
+  cell.y = min(max(0,(int)fy), dim.y-1);
+  frac.y = sy - fy;
+
+  const float sz = point.z * drInv.z;
+  const float fz = floor(sz);
+  cell.z = min(max(0,(int)fz), dim.z-1);
+  frac.z = sz - fz;
+
+  // Powers (t^3, t^2, t, 1) of the offsets.
+  const float4 px = make_float4(frac.x*frac.x*frac.x, frac.x*frac.x, frac.x, 1.0);
+  const float4 py = make_float4(frac.y*frac.y*frac.y, frac.y*frac.y, frac.y, 1.0);
+  const float4 pz = make_float4(frac.z*frac.z*frac.z, frac.z*frac.z, frac.z, 1.0);
+
+  // Four 1D weights per direction, computed by the first four threads.
+  __shared__ float wx[4], wy[4], wz[4];
+  if (lane < 4) {
+    wx[lane] = Acuda[4*lane+0]*px.x + Acuda[4*lane+1]*px.y + Acuda[4*lane+2]*px.z + Acuda[4*lane+3]*px.w;
+    wy[lane] = Acuda[4*lane+0]*py.x + Acuda[4*lane+1]*py.y + Acuda[4*lane+2]*py.z + Acuda[4*lane+3]*py.w;
+    wz[lane] = Acuda[4*lane+0]*pz.x + Acuda[4*lane+1]*pz.y + Acuda[4*lane+2]*pz.z + Acuda[4*lane+3]*pz.w;
+  }
+  __syncthreads();
+
+  // Tensor product: weight[16*i+4*j+k] = wx[i]*wy[j]*wz[k].
+  if (lane < 64) {
+    const int i = (lane>>4)&3;
+    const int j = (lane>>2)&3;
+    const int k = (lane & 3);
+    weight[lane] = wx[i]*wy[j]*wz[k];
+  }
+  __syncthreads();
+
+  // No barriers follow, so threads without a spline can leave here.
+  if (spline >= N)
+    return;
+
+  float sum = 0.0;
+  for (int i=0; i<4; i++) {
+    for (int j=0; j<4; j++) {
+      const float *row = coefs + (cell.x+i)*strides.x + (cell.y+j)*strides.y + cell.z*strides.z;
+      for (int k=0; k<4; k++)
+        sum += weight[16*i+4*j+k] * row[spline+k*strides.z];
+    }
+  }
+  out[spline] = factor*sum;
+}
+
+
+
+
+// Evaluates value, gradient and Hessian of one group of single-precision 3D
+// B-splines at one position.
+//
+//   blockIdx.y   selects the position; x,y,z are read from pos[3*ir+0..2]
+//   blockIdx.x   selects a group of SPLINE_BLOCK_SIZE consecutive splines
+//   threadIdx.x  selects the spline inside that group
+//
+// Outputs for spline `off` (< N) of position `ir`:
+//   vals[ir][off]            value
+//   grads[ir][3*off + 0..2]  d/dx, d/dy, d/dz, each scaled by drInv
+//   hess[ir][6*off + 0..5]   h00, h01, h02, h11, h12, h22, scaled by the
+//                            matching products of drInv components
+//
+//   drInv    factor applied to each coordinate before taking floor()
+//   coefs    coefficients addressed as
+//            ix*strides.x + iy*strides.y + iz*strides.z + spline
+//   dim      used to clamp each cell index to [0, dim-1]
+//   strides  element strides of coefs along x, y and z
+//   N        number of splines
+//
+// NOTE(review): the staging writes below assume
+// 6*SPLINE_BLOCK_SIZE <= 640 (the size of abc) -- confirm.
+__global__ static void
+eval_multi_multi_UBspline_3d_s_vgh_kernel 
+(float *pos, float3 drInv,  const float *coefs, 
+ float *vals[], float *grads[], float *hess[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and the output pointers for the block.
+  __shared__ float *myval, *mygrad, *myhess;
+  __shared__ float3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Clamped cell index and offset-from-floor for each direction.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers (t^3, t^2, t, 1) of the offsets.
+  tp[0] = make_float4(t.x*t.x*t.x, t.x*t.x, t.x, 1.0);
+  tp[1] = make_float4(t.y*t.y*t.y, t.y*t.y, t.y, 1.0);
+  tp[2] = make_float4(t.z*t.z*t.z, t.z*t.z, t.z, 1.0);
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ float a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Acuda[4*thr+0]*tp[0].x + Acuda[4*thr+1]*tp[0].y + Acuda[4*thr+2]*tp[0].z + Acuda[4*thr+3]*tp[0].w;
+    b[thr] = Acuda[4*thr+0]*tp[1].x + Acuda[4*thr+1]*tp[1].y + Acuda[4*thr+2]*tp[1].z + Acuda[4*thr+3]*tp[1].w;
+    c[thr] = Acuda[4*thr+0]*tp[2].x + Acuda[4*thr+1]*tp[2].y + Acuda[4*thr+2]*tp[2].z + Acuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten planes of 64 tensor-product weights each; plane m starts at 64*m.
+  __shared__ float abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  // n is the index of the (i,j,0) weight inside each 64-float plane.
+  int n = 0;
+  const float *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < N) {
+    for (unsigned i=0; i<4; i++) {
+      for (unsigned j=0; j<4; j++) {
+	const float *base = b0 + i*strides.x + j*strides.y;
+	float c0  = base[0*strides.z];
+	float c1  = base[1*strides.z];
+	float c2  = base[2*strides.z];
+	float c3  = base[3*strides.z];
+	v   += abc[n+  0]*c0 + abc[n+  1]*c1  + abc[n+  2]*c2  + abc[n+  3]*c3;
+	g0  += abc[n+ 64]*c0 + abc[n+ 65]*c1  + abc[n+ 66]*c2  + abc[n+ 67]*c3;
+	g1  += abc[n+128]*c0 + abc[n+129]*c1  + abc[n+130]*c2  + abc[n+131]*c3;
+	g2  += abc[n+192]*c0 + abc[n+193]*c1  + abc[n+194]*c2  + abc[n+195]*c3;
+	h00 += abc[n+256]*c0 + abc[n+257]*c1  + abc[n+258]*c2  + abc[n+259]*c3;
+	h01 += abc[n+320]*c0 + abc[n+321]*c1  + abc[n+322]*c2  + abc[n+323]*c3;
+	h02 += abc[n+384]*c0 + abc[n+385]*c1  + abc[n+386]*c2  + abc[n+387]*c3;
+	h11 += abc[n+448]*c0 + abc[n+449]*c1  + abc[n+450]*c2  + abc[n+451]*c3;
+	h12 += abc[n+512]*c0 + abc[n+513]*c1  + abc[n+514]*c2  + abc[n+515]*c3;
+	h22 += abc[n+576]*c0 + abc[n+577]*c1  + abc[n+578]*c2  + abc[n+579]*c3;
+	n += 4;
+      }
+    }
+    // Convert derivatives with respect to the scaled coordinate into
+    // derivatives with respect to the input coordinate.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+  
+    myval[off] = v;
+  }
+
+  // abc is reused as a staging buffer from here on.  Threads with off >= N
+  // skip the loop above and faster warps finish it early, so without this
+  // barrier they could overwrite weights that other threads are still
+  // reading.  The barrier is outside the conditional, so every thread of
+  // the block reaches it.
+  __syncthreads();
+  abc[3*thr+0] = g0; 
+  abc[3*thr+1] = g1; 
+  abc[3*thr+2] = g2; 
+  __syncthreads();
+  // Copy the staged gradients out with consecutive threads writing
+  // consecutive addresses; entries past 3*N are dropped.
+  for (int i=0; i<3; i++) {
+    int myoff = (3*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 3*N)
+      mygrad[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr]; 
+  }
+  __syncthreads();
+
+  // Stage and write the Hessians the same way; entries past 6*N are dropped.
+  abc[6*thr+0]  = h00;
+  abc[6*thr+1]  = h01;
+  abc[6*thr+2]  = h02;
+  abc[6*thr+3]  = h11;
+  abc[6*thr+4]  = h12;
+  abc[6*thr+5]  = h22;
+  __syncthreads();
+  for (int i=0; i<6; i++) {
+    int myoff = (6*block+i)*SPLINE_BLOCK_SIZE + thr;
+    if (myoff < 6*N)
+      myhess[myoff] = abc[i*SPLINE_BLOCK_SIZE+thr];
+  }
+}
+
+
+// Host entry point: evaluates the values of every spline of `spline` at
+// `num` positions.  The grid has one row per position and enough columns of
+// SPLINE_BLOCK_SIZE threads to cover all splines.  Waits for the kernel to
+// finish and aborts the process if the CUDA runtime reports an error.
+//
+//   pos_d   device array of positions, three floats each
+//   vals_d  device array with one output pointer per position
+//   num     number of positions
+extern "C" void
+eval_multi_multi_UBspline_3d_s_cuda (const multi_UBspline_3d_s_cuda *spline,
+				     float *pos_d, float *vals_d[], int num)
+{
+  const dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // Round the column count up when the spline count is not a multiple of
+  // the block size.
+  if ((spline->num_splines % SPLINE_BLOCK_SIZE) != 0)
+    blocks.x += 1;
+
+  eval_multi_multi_UBspline_3d_s_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+// Host entry point: evaluates the values of every spline of `spline` at
+// `num` positions and multiplies the results for position p by sign_d[p].
+// The grid has one row per position and enough columns of SPLINE_BLOCK_SIZE
+// threads to cover all splines.  Waits for the kernel to finish and aborts
+// the process if the CUDA runtime reports an error.
+//
+//   pos_d   device array of positions, three floats each
+//   sign_d  device array with one factor per position
+//   vals_d  device array with one output pointer per position
+//   num     number of positions
+extern "C" void
+eval_multi_multi_UBspline_3d_s_sign_cuda (const multi_UBspline_3d_s_cuda *spline,
+					  float *pos_d, float *sign_d, 
+					  float *vals_d[], int num)
+{
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // Round the column count up when the spline count is not a multiple of
+  // the block size.
+  if (spline->num_splines % SPLINE_BLOCK_SIZE)
+    dimGrid.x++;
+  
+
+  eval_multi_multi_UBspline_3d_s_sign_kernel<<<dimGrid,dimBlock>>>
+    (pos_d, sign_d, spline->gridInv, spline->coefs, 
+     vals_d, spline->dim, spline->stride, spline->num_splines);
+
+
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // Report this function's own name so a failure is attributed to the
+    // sign variant rather than to the plain value wrapper.
+    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_sign_cuda:\n  %s\n",
+	     cudaGetErrorString(err));
+    abort();
+  }
+
+}
+
+
+
+// Host entry point: evaluates value, gradient and Hessian of every spline
+// of `spline` at `num` positions.  The grid has one row per position and
+// enough columns of SPLINE_BLOCK_SIZE threads to cover all splines.  Waits
+// for the kernel to finish and aborts the process if the CUDA runtime
+// reports an error.
+//
+//   pos_d    device array of positions, three floats each
+//   vals_d   device array with one value-output pointer per position
+//   grads_d  device array with one gradient-output pointer per position
+//   hess_d   device array with one Hessian-output pointer per position
+//   num      number of positions
+extern "C" void
+eval_multi_multi_UBspline_3d_s_vgh_cuda (const multi_UBspline_3d_s_cuda *spline,
+					 float *pos_d, float *vals_d[], float *grads_d[],
+					 float *hess_d[], int num)
+{
+  const dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // Round the column count up when the spline count is not a multiple of
+  // the block size.
+  if ((spline->num_splines % SPLINE_BLOCK_SIZE) != 0)
+    blocks.x += 1;
+
+  eval_multi_multi_UBspline_3d_s_vgh_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, vals_d, grads_d, hess_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_vgh_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+
+// Evaluates value, transformed gradient and Laplacian of one group of
+// single-precision 3D B-splines at one position.
+//
+//   blockIdx.y   selects the position; x,y,z are read from pos[3*p+0..2]
+//   blockIdx.x   selects a group of SPLINE_BLOCK_SIZE consecutive splines
+//   threadIdx.x  selects the spline inside that group
+//
+// Outputs for spline s (< N) of position p:
+//   vals[p][s]                          value
+//   grad_lapl[p][s + c*row_stride]      c = 0,1,2: row c of the 3x3 matrix
+//                                       Linv applied to the gradient
+//   grad_lapl[p][s + 3*row_stride]      Laplacian, formed from the Hessian
+//                                       and the products of Linv columns
+//
+//   drInv    factor applied to each coordinate before taking floor()
+//   coefs    coefficients addressed as
+//            ix*strides.x + iy*strides.y + iz*strides.z + spline
+//   Linv     nine floats, read row by row into a 3x3 matrix
+//   dim      used to clamp each cell index to [0, dim-1]
+//   strides  element strides of coefs along x, y and z
+//   N        number of splines
+__global__ static void
+eval_multi_multi_UBspline_3d_s_vgl_kernel 
+(float *pos, float3 drInv,  const float *coefs,  float Linv[],
+ float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
+ int N, int row_stride)
+{
+  const int group  = blockIdx.x;
+  const int lane   = threadIdx.x;
+  const int walker = blockIdx.y;
+  const int spline = group*SPLINE_BLOCK_SIZE + threadIdx.x;
+
+  __shared__ float *out_val, *out_gl;
+  __shared__ float3 point;
+
+  // A single thread fetches the per-position data for the whole block.
+  if (lane == 0) {
+    point.x = pos[3*walker+0];
+    point.y = pos[3*walker+1];
+    point.z = pos[3*walker+2];
+    out_val = vals[walker];
+    out_gl  = grad_lapl[walker];
+  }
+  __syncthreads();
+
+  // Clamped cell index and offset-from-floor for each direction.
+  int3 cell;
+  float3 frac;
+
+  const float sx = point.x * drInv.x;
+  const float fx = floor(sx);
+  cell.x = min(max(0,(int)fx), dim.x-1);
+  frac.x = sx - fx;
+
+  const float sy = point.y * drInv.y;
+  const float fy = floor(sy);
+  cell.y = min(max(0,(int)fy), dim.y-1);
+  frac.y = sy - fy;
+
+  const float sz = point.z * drInv.z;
+  const float fz = floor(sz);
+  cell.z = min(max(0,(int)fz), dim.z-1);
+  frac.z = sz - fz;
+
+  // Powers (t^3, t^2, t, 1) of the offsets.
+  const float4 px = make_float4(frac.x*frac.x*frac.x, frac.x*frac.x, frac.x, 1.0);
+  const float4 py = make_float4(frac.y*frac.y*frac.y, frac.y*frac.y, frac.y, 1.0);
+  const float4 pz = make_float4(frac.z*frac.z*frac.z, frac.z*frac.z, frac.z, 1.0);
+
+  // Entries 0-3: value weights, 4-7: first-derivative weights,
+  // 8-11: second-derivative weights.
+  __shared__ float wx[12], wy[12], wz[12];
+  if (lane < 12) {
+    wx[lane] = Acuda[4*lane+0]*px.x + Acuda[4*lane+1]*px.y + Acuda[4*lane+2]*px.z + Acuda[4*lane+3]*px.w;
+    wy[lane] = Acuda[4*lane+0]*py.x + Acuda[4*lane+1]*py.y + Acuda[4*lane+2]*py.z + Acuda[4*lane+3]*py.w;
+    wz[lane] = Acuda[4*lane+0]*pz.x + Acuda[4*lane+1]*pz.y + Acuda[4*lane+2]*pz.z + Acuda[4*lane+3]*pz.w;
+  }
+  __syncthreads();
+
+  // Ten planes of 64 tensor-product weights each; plane m starts at 64*m.
+  __shared__ float w3[640];
+  {
+    const int i = (lane>>4)&3;
+    const int j = (lane>>2)&3;
+    const int k = (lane & 3);
+    const int slot = 16*i+4*j+k;
+
+    w3[slot+0]   = wx[i+0]*wy[j+0]*wz[k+0]; // val
+    w3[slot+64]  = wx[i+4]*wy[j+0]*wz[k+0]; // d/dx
+    w3[slot+128] = wx[i+0]*wy[j+4]*wz[k+0]; // d/dy
+    w3[slot+192] = wx[i+0]*wy[j+0]*wz[k+4]; // d/dz
+    w3[slot+256] = wx[i+8]*wy[j+0]*wz[k+0]; // d2/dx2
+    w3[slot+320] = wx[i+4]*wy[j+4]*wz[k+0]; // d2/dxdy
+    w3[slot+384] = wx[i+4]*wy[j+0]*wz[k+4]; // d2/dxdz
+    w3[slot+448] = wx[i+0]*wy[j+8]*wz[k+0]; // d2/dy2
+    w3[slot+512] = wx[i+0]*wy[j+4]*wz[k+4]; // d2/dydz
+    w3[slot+576] = wx[i+0]*wy[j+0]*wz[k+8]; // d2/dz2
+  }
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  const float *origin = coefs + cell.x*strides.x + cell.y*strides.y + cell.z*strides.z + spline;
+
+  if (spline < N) {
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+        // Index of the (i,j,0) weight inside each 64-float plane.
+        const int n = 16*i + 4*j;
+        const float *row = origin + i*strides.x + j*strides.y;
+        const float c0 = row[0*strides.z];
+        const float c1 = row[1*strides.z];
+        const float c2 = row[2*strides.z];
+        const float c3 = row[3*strides.z];
+        v   += w3[n+  0]*c0 + w3[n+  1]*c1  + w3[n+  2]*c2  + w3[n+  3]*c3;
+        g0  += w3[n+ 64]*c0 + w3[n+ 65]*c1  + w3[n+ 66]*c2  + w3[n+ 67]*c3;
+        g1  += w3[n+128]*c0 + w3[n+129]*c1  + w3[n+130]*c2  + w3[n+131]*c3;
+        g2  += w3[n+192]*c0 + w3[n+193]*c1  + w3[n+194]*c2  + w3[n+195]*c3;
+        h00 += w3[n+256]*c0 + w3[n+257]*c1  + w3[n+258]*c2  + w3[n+259]*c3;
+        h01 += w3[n+320]*c0 + w3[n+321]*c1  + w3[n+322]*c2  + w3[n+323]*c3;
+        h02 += w3[n+384]*c0 + w3[n+385]*c1  + w3[n+386]*c2  + w3[n+387]*c3;
+        h11 += w3[n+448]*c0 + w3[n+449]*c1  + w3[n+450]*c2  + w3[n+451]*c3;
+        h12 += w3[n+512]*c0 + w3[n+513]*c1  + w3[n+514]*c2  + w3[n+515]*c3;
+        h22 += w3[n+576]*c0 + w3[n+577]*c1  + w3[n+578]*c2  + w3[n+579]*c3;
+      }
+    }
+
+    // Convert derivatives with respect to the scaled coordinate into
+    // derivatives with respect to the input coordinate.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+
+    out_val[spline] = v;
+  }
+
+  // Load the 3x3 matrix and form M[r][c] = sum_k G[k][r]*G[k][c] using the
+  // first nine threads.
+  __shared__ float G[3][3], GGt[3][3];
+  const int gr = threadIdx.x/3;
+  const int gc = threadIdx.x - 3*gr;
+  if (threadIdx.x < 9) 
+    G[gr][gc] = Linv[threadIdx.x];
+  __syncthreads();
+  if (threadIdx.x < 9)   
+    GGt[gr][gc] = (G[0][gr]*G[0][gc] + 
+                   G[1][gr]*G[1][gc] + 
+                   G[2][gr]*G[2][gc]);
+  __syncthreads();
+
+  if (spline < N) {
+    // Gradient rows.
+    out_gl[spline+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
+    out_gl[spline+1*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
+    out_gl[spline+2*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
+
+    // Laplacian = Trace(GGt*Hessian), with the symmetric Hessian stored as
+    //   [h00 h01 h02]
+    //   [h01 h11 h12]
+    //   [h02 h12 h22]
+    out_gl[spline+3*row_stride] = 
+      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
+       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
+       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
+  }
+}
+
+
+// Host entry point: evaluates value, gradient and Laplacian of every spline
+// of `spline` at `num` positions.  The grid has one row per position and
+// enough columns of SPLINE_BLOCK_SIZE threads to cover all splines.  Waits
+// for the kernel to finish and aborts the process if the CUDA runtime
+// reports an error.
+//
+//   pos_d        device array of positions, three floats each
+//   Linv_d       device array holding a 3x3 matrix (nine floats)
+//   vals_d       device array with one value-output pointer per position
+//   grad_lapl_d  device array with one gradient/Laplacian-output pointer
+//                per position
+//   num          number of positions
+//   row_stride   element distance between the four output rows
+extern "C" void
+eval_multi_multi_UBspline_3d_s_vgl_cuda 
+(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *Linv_d, 
+ float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
+{
+  const dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // Round the column count up when the spline count is not a multiple of
+  // the block size.
+  if ((spline->num_splines % SPLINE_BLOCK_SIZE) != 0)
+    blocks.x += 1;
+
+  eval_multi_multi_UBspline_3d_s_vgl_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, spline->coefs, Linv_d, vals_d,
+     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_vgl_cuda:\n  %s\n",
+           cudaGetErrorString(status));
+  abort();
+}
+
+
+
+// Evaluates value, transformed gradient and Laplacian of one group of
+// single-precision 3D B-splines at one position; every stored result is
+// multiplied by a per-position factor.
+//
+//   blockIdx.y   selects the position; x,y,z are read from pos[3*p+0..2]
+//                and the factor from sign[p]
+//   blockIdx.x   selects a group of SPLINE_BLOCK_SIZE consecutive splines
+//   threadIdx.x  selects the spline inside that group
+//
+// Outputs for spline s (< N) of position p, each times sign[p]:
+//   vals[p][s]                          value
+//   grad_lapl[p][s + c*row_stride]      c = 0,1,2: row c of the 3x3 matrix
+//                                       Linv applied to the gradient
+//   grad_lapl[p][s + 3*row_stride]      Laplacian, formed from the Hessian
+//                                       and the products of Linv columns
+//
+//   drInv    factor applied to each coordinate before taking floor()
+//   coefs    coefficients addressed as
+//            ix*strides.x + iy*strides.y + iz*strides.z + spline
+//   Linv     nine floats, read row by row into a 3x3 matrix
+//   dim      used to clamp each cell index to [0, dim-1]
+//   strides  element strides of coefs along x, y and z
+//   N        number of splines
+__global__ static void
+eval_multi_multi_UBspline_3d_s_vgl_sign_kernel 
+(float *pos, float sign[], float3 drInv,  const float *coefs, float Linv[],
+   float *vals[], float *grad_lapl[], uint3 dim, uint3 strides,
+   int N, int row_stride)
+{
+  const int group  = blockIdx.x;
+  const int lane   = threadIdx.x;
+  const int walker = blockIdx.y;
+  const int spline = group*SPLINE_BLOCK_SIZE + threadIdx.x;
+
+  __shared__ float *out_val, *out_gl, factor;
+  __shared__ float3 point;
+
+  // A single thread fetches the per-position data for the whole block.
+  if (lane == 0) {
+    point.x = pos[3*walker+0];
+    point.y = pos[3*walker+1];
+    point.z = pos[3*walker+2];
+    out_val = vals[walker];
+    out_gl  = grad_lapl[walker];
+    factor  = sign[walker];
+  }
+  __syncthreads();
+
+  // Clamped cell index and offset-from-floor for each direction.
+  int3 cell;
+  float3 frac;
+
+  const float sx = point.x * drInv.x;
+  const float fx = floor(sx);
+  cell.x = min(max(0,(int)fx), dim.x-1);
+  frac.x = sx - fx;
+
+  const float sy = point.y * drInv.y;
+  const float fy = floor(sy);
+  cell.y = min(max(0,(int)fy), dim.y-1);
+  frac.y = sy - fy;
+
+  const float sz = point.z * drInv.z;
+  const float fz = floor(sz);
+  cell.z = min(max(0,(int)fz), dim.z-1);
+  frac.z = sz - fz;
+
+  // Powers (t^3, t^2, t, 1) of the offsets.
+  const float4 px = make_float4(frac.x*frac.x*frac.x, frac.x*frac.x, frac.x, 1.0);
+  const float4 py = make_float4(frac.y*frac.y*frac.y, frac.y*frac.y, frac.y, 1.0);
+  const float4 pz = make_float4(frac.z*frac.z*frac.z, frac.z*frac.z, frac.z, 1.0);
+
+  // Entries 0-3: value weights, 4-7: first-derivative weights,
+  // 8-11: second-derivative weights.
+  __shared__ float wx[12], wy[12], wz[12];
+  if (lane < 12) {
+    wx[lane] = Acuda[4*lane+0]*px.x + Acuda[4*lane+1]*px.y + Acuda[4*lane+2]*px.z + Acuda[4*lane+3]*px.w;
+    wy[lane] = Acuda[4*lane+0]*py.x + Acuda[4*lane+1]*py.y + Acuda[4*lane+2]*py.z + Acuda[4*lane+3]*py.w;
+    wz[lane] = Acuda[4*lane+0]*pz.x + Acuda[4*lane+1]*pz.y + Acuda[4*lane+2]*pz.z + Acuda[4*lane+3]*pz.w;
+  }
+  __syncthreads();
+
+  // Ten planes of 64 tensor-product weights each; plane m starts at 64*m.
+  __shared__ float w3[640];
+  {
+    const int i = (lane>>4)&3;
+    const int j = (lane>>2)&3;
+    const int k = (lane & 3);
+    const int slot = 16*i+4*j+k;
+
+    w3[slot+0]   = wx[i+0]*wy[j+0]*wz[k+0]; // val
+    w3[slot+64]  = wx[i+4]*wy[j+0]*wz[k+0]; // d/dx
+    w3[slot+128] = wx[i+0]*wy[j+4]*wz[k+0]; // d/dy
+    w3[slot+192] = wx[i+0]*wy[j+0]*wz[k+4]; // d/dz
+    w3[slot+256] = wx[i+8]*wy[j+0]*wz[k+0]; // d2/dx2
+    w3[slot+320] = wx[i+4]*wy[j+4]*wz[k+0]; // d2/dxdy
+    w3[slot+384] = wx[i+4]*wy[j+0]*wz[k+4]; // d2/dxdz
+    w3[slot+448] = wx[i+0]*wy[j+8]*wz[k+0]; // d2/dy2
+    w3[slot+512] = wx[i+0]*wy[j+4]*wz[k+4]; // d2/dydz
+    w3[slot+576] = wx[i+0]*wy[j+0]*wz[k+8]; // d2/dz2
+  }
+  __syncthreads();
+
+  float v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  const float *origin = coefs + cell.x*strides.x + cell.y*strides.y + cell.z*strides.z + spline;
+  if (spline < N) {
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+        const float *row = origin + i*strides.x + j*strides.y;
+        for (int k=0; k<4; k++) {
+          // Index of the (i,j,k) weight inside each 64-float plane.
+          const int n = 16*i + 4*j + k;
+          const float cf = row[k*strides.z];
+          v   += w3[n+  0] * cf;
+          g0  += w3[n+ 64] * cf;
+          g1  += w3[n+128] * cf;
+          g2  += w3[n+192] * cf;
+          h00 += w3[n+256] * cf;
+          h01 += w3[n+320] * cf;
+          h02 += w3[n+384] * cf;
+          h11 += w3[n+448] * cf;
+          h12 += w3[n+512] * cf;
+          h22 += w3[n+576] * cf;
+        }
+      }
+    }
+
+    // Convert derivatives with respect to the scaled coordinate into
+    // derivatives with respect to the input coordinate.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+
+    out_val[spline] = factor * v;
+  }
+
+  // Load the 3x3 matrix and form M[r][c] = sum_k G[k][r]*G[k][c] using the
+  // first nine threads.
+  __shared__ float G[3][3], GGt[3][3];
+  const int gr = threadIdx.x/3;
+  const int gc = threadIdx.x - 3*gr;
+  if (threadIdx.x < 9) 
+    G[gr][gc] = Linv[threadIdx.x];
+  __syncthreads();
+  if (threadIdx.x < 9)   
+    GGt[gr][gc] = (G[0][gr]*G[0][gc] + 
+                   G[1][gr]*G[1][gc] + 
+                   G[2][gr]*G[2][gc]);
+  __syncthreads();
+
+  if (spline < N) {
+    // Gradient rows.
+    out_gl[spline+0*row_stride] = factor*(G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2);
+    out_gl[spline+1*row_stride] = factor*(G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2);
+    out_gl[spline+2*row_stride] = factor*(G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2);
+
+    // Laplacian = Trace(GGt*Hessian), with the symmetric Hessian stored as
+    //   [h00 h01 h02]
+    //   [h01 h11 h12]
+    //   [h02 h12 h22]
+    out_gl[spline+3*row_stride] = factor * 
+      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
+       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
+       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
+  }
+}
+
+
+// Host entry point: evaluates value, gradient and Laplacian of every spline
+// of `spline` at `num` positions, with the results for position p
+// multiplied by sign_d[p].  The grid has one row per position and enough
+// columns of SPLINE_BLOCK_SIZE threads to cover all splines.  Waits for the
+// kernel to finish and aborts the process if the CUDA runtime reports an
+// error.
+//
+//   pos_d        device array of positions, three floats each
+//   sign_d       device array with one factor per position
+//   Linv_d       device array holding a 3x3 matrix (nine floats)
+//   vals_d       device array with one value-output pointer per position
+//   grad_lapl_d  device array with one gradient/Laplacian-output pointer
+//                per position
+//   num          number of positions
+//   row_stride   element distance between the four output rows
+extern "C" void
+eval_multi_multi_UBspline_3d_s_vgl_sign_cuda 
+(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, float *Linv_d, 
+ float *vals_d[], float *grad_lapl_d[], int num, int row_stride)
+{
+  dim3 dimBlock(SPLINE_BLOCK_SIZE);
+  dim3 dimGrid(spline->num_splines/SPLINE_BLOCK_SIZE, num);
+
+  // Round the column count up when the spline count is not a multiple of
+  // the block size.
+  if (spline->num_splines % SPLINE_BLOCK_SIZE)
+    dimGrid.x++;
+
+  eval_multi_multi_UBspline_3d_s_vgl_sign_kernel<<<dimGrid,dimBlock>>>
+    (pos_d, sign_d, spline->gridInv, spline->coefs, Linv_d, vals_d, 
+     grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
+
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    // Report this function's own name so a failure is attributed to the
+    // sign variant rather than to the plain vgl wrapper.
+    fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_s_vgl_sign_cuda:\n  %s\n",
+	     cudaGetErrorString(err));
+    abort();
+  }
+}
+
+
+
+
+#endif
--- a/src/einspline/multi_bspline_cuda_z_impl.h
+++ b/src/einspline/multi_bspline_cuda_z_impl.h
@ -0,0 +1,461 @@
+#ifndef MULTI_BSPLINE_CUDA_Z_IMPL_H
+#define MULTI_BSPLINE_CUDA_Z_IMPL_H
+
+#include "multi_bspline.h"
+#include "multi_bspline_create_cuda.h"
+
+
+// Kernel: values of a batch of 3D B-splines at a batch of positions.
+//
+// blockIdx.y selects the position (three doubles at pos[3*ir]) and the
+// output array vals[ir]; blockIdx.x and threadIdx.x together select the
+// output element `off`.  Elements with off >= 2*N are skipped.
+// NOTE(review): the factor 2 presumably reflects complex coefficients viewed
+// as pairs of doubles (the host launcher casts spline->coefs to double*) --
+// confirm against the coefficient layout.
+//
+// The block must have at least 64 threads, because abc[] is filled by
+// threads 0..63.  Bcuda is defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_z_kernel 
+(double *pos, double3 drInv, const double *coefs, double *vals[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+thr;
+
+  __shared__ double *myval;
+  __shared__ double abc[64];
+
+  // Thread 0 fetches the position and the output pointer for the whole block.
+  __shared__ double3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval = vals[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  double3 t;
+  double s, sf;
+  double4 tp[3];
+
+  // Per axis: scale the coordinate by drInv, clamp the integer part to
+  // [0, dim-1] to get the cell index, and keep s - floor(s) in t.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  //index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  //index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  //index.z = (int)sf;
+  t.z = s - sf;
+  
+  // Powers of t per axis, highest first: (t^3, t^2, t, 1).
+  tp[0].x=t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
+  tp[1].x=t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
+  tp[2].x=t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;
+
+  // Threads 0..3 each compute one of the four 1D weights per axis: row `thr`
+  // of the 4x4 table Bcuda applied to the powers of t.
+  __shared__ double a[4], b[4], c[4];
+  if (thr < 4) {
+    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
+    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
+    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Decode thr into (i, j, k), each in 0..3, so that thr == 16*i+4*j+k for
+  // thr < 64.
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+  
+  // The first 64 threads build the 4x4x4 table of weight products.
+  if (thr < 64)
+    abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+  if (off < 2*N) {
+    // Weighted sum over the 4x4x4 neighbourhood starting at `index`.  The
+    // loop variables below shadow the i, j, k declared above.
+    double val = 0.0;
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	const double *base = coefs + (index.x+i)*strides.x + (index.y+j)*strides.y + index.z*strides.z;
+	for (int k=0; k<4; k++) 
+	  val += abc[16*i+4*j+k] * base[off+k*strides.z];
+      }
+    }
+    myval[off] = val;
+  }
+}
+
+
+
+// Kernel: values, gradients and Hessians of a batch of 3D B-splines at a
+// batch of positions.
+//
+//   pos     positions, three doubles per position (x, y, z)
+//   drInv   per-axis factor applied to a coordinate to get grid units; also
+//           used to scale the derivatives
+//   coefs   spline coefficients, addressed through `strides`
+//   vals    vals[ir]  receives one double per output element
+//   grads   grads[ir] receives 3 doubles per output element (x, y, z)
+//   hess    hess[ir]  receives 6 doubles per output element
+//           (xx, xy, xz, yy, yz, zz)
+//   dim     the cell index on each axis is clamped to [0, dim-1]
+//   N       number of splines; 2*N output elements are produced
+//           NOTE(review): presumably the real and imaginary parts of N
+//           complex splines -- confirm against the coefficient layout.
+//
+// blockIdx.y selects the position; blockIdx.x and threadIdx.x select the
+// output element `off`.  The launcher uses SPLINE_BLOCK_SIZE threads per
+// block.  The block needs at least 64 threads (to fill the weight table) and
+// 6*SPLINE_BLOCK_SIZE must not exceed 640, because abc[] doubles as the
+// staging buffer for the output.  Bcuda is defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_z_vgh_kernel 
+(double *pos, double3 drInv, const double *coefs, 
+ double *vals[], double *grads[], double *hess[], 
+ uint3 dim, uint3 strides, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and the output pointers for the block.
+  __shared__ double *myval, *mygrad, *myhess;
+  __shared__ double3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad = grads[ir];
+    myhess = hess[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  double3 t;
+  double s, sf;
+  double4 tp[3];
+
+  // Per axis: scale the coordinate, clamp the integer part to [0, dim-1] to
+  // get the cell index, and keep s - floor(s) in t.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers of t per axis, highest first: (t^3, t^2, t, 1).
+  tp[0].x=t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
+  tp[1].x=t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
+  tp[2].x=t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ double a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
+    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
+    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten 64-entry tables of weight products, one per output quantity.  Slot
+  // 16*i+4*j+k is derived from the low six bits of thr, so threads beyond 63
+  // rewrite existing slots with identical values.
+  __shared__ double abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  double v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  int n = 0;
+  const double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < 2*N) {
+    // Accumulate all ten quantities over the 4x4x4 neighbourhood; n walks
+    // the weight tables in the same (i, j, k) order they were filled.
+    for (int ii=0; ii<4; ii++) {
+      for (int jj=0; jj<4; jj++) {
+	const double *base = b0 + ii*strides.x + jj*strides.y;
+	for (int kk=0; kk<4; kk++) {
+	  double coef = base[kk*strides.z];
+	  v   += abc[n+0] * coef;
+	  g0  += abc[n+64] * coef;
+	  g1  += abc[n+128] * coef;
+	  g2  += abc[n+192] * coef;
+	  h00 += abc[n+256] * coef;
+	  h01 += abc[n+320] * coef;
+	  h02 += abc[n+384] * coef;
+	  h11 += abc[n+448] * coef;
+	  h12 += abc[n+512] * coef;
+	  h22 += abc[n+576] * coef;
+	  n += 1;
+	}
+      }
+    }
+    // Convert derivatives from grid units to coordinate units.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+
+    myval[off] = v;
+  }
+
+  // abc[] is reused below as a staging buffer.  Every thread must have
+  // finished reading the weight tables first; threads that skipped or
+  // finished the loop early would otherwise overwrite weights that slower
+  // threads are still using.
+  __syncthreads();
+
+  // Stage gradients so that the global stores below are contiguous per block.
+  abc[3*thr+0] = g0; 
+  abc[3*thr+1] = g1; 
+  abc[3*thr+2] = g2; 
+  __syncthreads();
+  for (int q=0; q<3; q++) {
+    int myoff = (3*block+q)*SPLINE_BLOCK_SIZE + thr;
+    // 2*N output elements with 3 components each: 6*N doubles in total.
+    if (myoff < 6*N)
+      mygrad[myoff] = abc[q*SPLINE_BLOCK_SIZE+thr]; 
+  }
+  __syncthreads();
+
+  // Write Hessians
+  abc[6*thr+0]  = h00;
+  abc[6*thr+1]  = h01;
+  abc[6*thr+2]  = h02;
+  abc[6*thr+3]  = h11;
+  abc[6*thr+4]  = h12;
+  abc[6*thr+5]  = h22;
+  __syncthreads();
+  for (int q=0; q<6; q++) {
+    int myoff = (6*block+q)*SPLINE_BLOCK_SIZE + thr;
+    // 2*N output elements with 6 components each: 12*N doubles in total.
+    if (myoff < 12*N)
+      myhess[myoff] = abc[q*SPLINE_BLOCK_SIZE+thr];
+  }
+}
+
+
+// Host launcher for eval_multi_multi_UBspline_3d_z_kernel (values only).
+//
+// Grid: x = 2*num_splines / SPLINE_BLOCK_SIZE rounded up, y = num; each
+// block has SPLINE_BLOCK_SIZE threads.  The call waits for the kernel to
+// finish, then prints the CUDA error string and aborts if an error was
+// recorded.
+// NOTE(review): multi_bspline_eval_cuda.h declares vals_d as
+// complex_double *[]; this definition takes double *[].
+extern "C" void
+eval_multi_multi_UBspline_3d_z_cuda (const multi_UBspline_3d_z_cuda *spline,
+				     double *pos_d, double *vals_d[], int num)
+{
+  // A leftover remainder needs one additional, partially used block.
+  const bool partialBlock = (2*spline->num_splines % SPLINE_BLOCK_SIZE) != 0;
+
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE + (partialBlock ? 1 : 0), num);
+
+  eval_multi_multi_UBspline_3d_z_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, (double*)spline->coefs, (double**)vals_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_z_cuda:\n  %s\n",
+	   cudaGetErrorString(status));
+  abort();
+}
+
+// Host launcher for eval_multi_multi_UBspline_3d_z_vgh_kernel (values,
+// gradients and Hessians).
+//
+// The complex output arrays are handed to the kernel as arrays of double
+// pointers.  Grid: x = 2*num_splines / SPLINE_BLOCK_SIZE rounded up,
+// y = num; each block has SPLINE_BLOCK_SIZE threads.  The call waits for
+// the kernel to finish, then prints the CUDA error string and aborts if an
+// error was recorded.
+extern "C" void
+eval_multi_multi_UBspline_3d_z_vgh_cuda (const multi_UBspline_3d_z_cuda *spline,
+					 double *pos_d, complex_double *vals_d[], complex_double *grads_d[],
+					 complex_double *hess_d[], int num)
+{
+  // A leftover remainder needs one additional, partially used block.
+  const bool partialBlock = (2*spline->num_splines % SPLINE_BLOCK_SIZE) != 0;
+
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE + (partialBlock ? 1 : 0), num);
+
+  eval_multi_multi_UBspline_3d_z_vgh_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, (double*)spline->coefs,
+     (double**)vals_d, (double**)grads_d, (double**)hess_d,
+     spline->dim, spline->stride, spline->num_splines);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_z_vgh_cuda:\n  %s\n",
+	   cudaGetErrorString(status));
+  abort();
+}
+
+
+// Kernel: values, transformed gradients and a Laplacian-like trace of a
+// batch of 3D B-splines at a batch of positions.
+//
+// blockIdx.y selects the position (three doubles at pos[3*ir]) and the
+// output arrays vals[ir] / grad_lapl[ir]; blockIdx.x and threadIdx.x select
+// the output element `off`.  Elements with off >= 2*N are skipped.
+// NOTE(review): the factor 2 presumably reflects complex coefficients viewed
+// as pairs of doubles -- confirm against the coefficient layout.
+//
+// Linv supplies nine doubles that are loaded row by row into the 3x3 matrix
+// G.  Gradients are multiplied by G before being stored; the fourth output
+// row is the sum over all entries of GGt[a][b]*H[a][b], with H the symmetric
+// Hessian.
+//
+// The block needs at least 64 threads (to fill the weight tables).  Bcuda is
+// defined outside this block.
+__global__ static void
+eval_multi_multi_UBspline_3d_z_vgl_kernel 
+(double *pos, double3 drInv,  const double *coefs,  double Linv[],
+ double *vals[], double *grad_lapl[], uint3 dim, uint3 strides,
+ int N, int row_stride)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int off   = block*SPLINE_BLOCK_SIZE+threadIdx.x;
+
+  // Thread 0 fetches the position and the output pointers for the block.
+  __shared__ double *myval, *mygrad_lapl;
+  __shared__ double3 r;
+  if (thr == 0) {
+    r.x = pos[3*ir+0];
+    r.y = pos[3*ir+1];
+    r.z = pos[3*ir+2];
+    myval  = vals[ir];
+    mygrad_lapl = grad_lapl[ir];
+  }
+  __syncthreads();
+  
+  int3 index;
+  double3 t;
+  double s, sf;
+  double4 tp[3];
+
+  // Per axis: scale the coordinate by drInv, clamp the integer part to
+  // [0, dim-1] to get the cell index, and keep s - floor(s) in t.
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = min(max(0,(int)sf), dim.x-1);
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = min(max(0,(int)sf), dim.y-1);
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = min(max(0,(int)sf), dim.z-1);
+  t.z = s - sf;
+  
+  // Powers of t per axis, highest first: (t^3, t^2, t, 1).
+  tp[0].x=t.x*t.x*t.x; tp[0].y=t.x*t.x; tp[0].z=t.x; tp[0].w=1.0;
+  tp[1].x=t.y*t.y*t.y; tp[1].y=t.y*t.y; tp[1].z=t.y; tp[1].w=1.0;
+  tp[2].x=t.z*t.z*t.z; tp[2].y=t.z*t.z; tp[2].z=t.z; tp[2].w=1.0;
+
+  // First 4 of a are value, second 4 are derivative, last four are
+  // second derivative.
+  __shared__ double a[12], b[12], c[12];
+  if (thr < 12) {
+    a[thr] = Bcuda[4*thr+0]*tp[0].x + Bcuda[4*thr+1]*tp[0].y + Bcuda[4*thr+2]*tp[0].z + Bcuda[4*thr+3]*tp[0].w;
+    b[thr] = Bcuda[4*thr+0]*tp[1].x + Bcuda[4*thr+1]*tp[1].y + Bcuda[4*thr+2]*tp[1].z + Bcuda[4*thr+3]*tp[1].w;
+    c[thr] = Bcuda[4*thr+0]*tp[2].x + Bcuda[4*thr+1]*tp[2].y + Bcuda[4*thr+2]*tp[2].z + Bcuda[4*thr+3]*tp[2].w;
+  }
+  __syncthreads();
+
+  // Ten 64-entry tables of weight products, one per output quantity.  Slot
+  // 16*i+4*j+k is derived from the low six bits of thr.
+  __shared__ double abc[640];
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+
+  abc[(16*i+4*j+k)+0]   = a[i+0]*b[j+0]*c[k+0]; // val
+  abc[(16*i+4*j+k)+64]  = a[i+4]*b[j+0]*c[k+0]; // d/dx
+  abc[(16*i+4*j+k)+128] = a[i+0]*b[j+4]*c[k+0]; // d/dy
+  abc[(16*i+4*j+k)+192] = a[i+0]*b[j+0]*c[k+4]; // d/dz
+  abc[(16*i+4*j+k)+256] = a[i+8]*b[j+0]*c[k+0]; // d2/dx2
+  abc[(16*i+4*j+k)+320] = a[i+4]*b[j+4]*c[k+0]; // d2/dxdy
+  abc[(16*i+4*j+k)+384] = a[i+4]*b[j+0]*c[k+4]; // d2/dxdz
+  abc[(16*i+4*j+k)+448] = a[i+0]*b[j+8]*c[k+0]; // d2/dy2
+  abc[(16*i+4*j+k)+512] = a[i+0]*b[j+4]*c[k+4]; // d2/dydz
+  abc[(16*i+4*j+k)+576] = a[i+0]*b[j+0]*c[k+8]; // d2/dz2
+
+  __syncthreads();
+
+  double v = 0.0, g0=0.0,  g1=0.0, g2=0.0, 
+    h00=0.0, h01=0.0, h02=0.0, h11=0.0, h12=0.0, h22=0.0;
+
+  int n = 0;
+  const double *b0 = coefs + index.x*strides.x + index.y*strides.y + index.z*strides.z + off;
+  if (off < 2*N) {
+    // Accumulate all ten quantities over the 4x4x4 neighbourhood; n walks
+    // the weight tables in the order they were filled.  The loop variables
+    // and the local `c` below shadow the outer i, j, k and the shared c[].
+    for (int i=0; i<4; i++) {
+      for (int j=0; j<4; j++) {
+	const double *base = b0 + i*strides.x + j*strides.y;
+	for (int k=0; k<4; k++) {
+	  double c  = base[k*strides.z];
+	  v   += abc[n+  0] * c;
+	  g0  += abc[n+ 64] * c;
+	  g1  += abc[n+128] * c;
+	  g2  += abc[n+192] * c;
+	  h00 += abc[n+256] * c;
+	  h01 += abc[n+320] * c;
+	  h02 += abc[n+384] * c;
+	  h11 += abc[n+448] * c;
+	  h12 += abc[n+512] * c;
+	  h22 += abc[n+576] * c;
+	  n += 1;
+	}
+      }
+    }
+    // Convert derivatives from grid units to coordinate units.
+    g0 *= drInv.x; 
+    g1 *= drInv.y; 
+    g2 *= drInv.z; 
+    
+    h00 *= drInv.x * drInv.x;  
+    h01 *= drInv.x * drInv.y;  
+    h02 *= drInv.x * drInv.z;  
+    h11 *= drInv.y * drInv.y;  
+    h12 *= drInv.y * drInv.z;  
+    h22 *= drInv.z * drInv.z;  
+  
+    
+    //  __shared__ double buff[6*SPLINE_BLOCK_SIZE];
+    // Note, we can reuse abc, by replacing buff with abc.
+    myval[off] = v;
+  }
+
+  // Threads 0..8 load G from Linv (row i0, column i1), then compute
+  // GGt[i0][i1] = sum over m of G[m][i0]*G[m][i1].
+  __shared__ double G[3][3], GGt[3][3];
+  int i0 = threadIdx.x/3;
+  int i1 = threadIdx.x - 3*i0;
+  if (threadIdx.x < 9) 
+    G[i0][i1] = Linv[threadIdx.x];
+  __syncthreads();
+  if (threadIdx.x < 9)   
+    GGt[i0][i1] = (G[0][i0]*G[0][i1] + 
+		   G[1][i0]*G[1][i1] + 
+		   G[2][i0]*G[2][i1]);
+  __syncthreads();
+  if (off < 2*N) {
+    // Store gradients back to global memory
+    // NOTE(review): successive output rows are 2*row_stride doubles apart,
+    // presumably because row_stride counts complex elements -- confirm.
+    mygrad_lapl[off+0*row_stride] = G[0][0]*g0 + G[0][1]*g1 + G[0][2]*g2;
+    mygrad_lapl[off+2*row_stride] = G[1][0]*g0 + G[1][1]*g1 + G[1][2]*g2;
+    mygrad_lapl[off+4*row_stride] = G[2][0]*g0 + G[2][1]*g1 + G[2][2]*g2;
+    
+    // Store laplacians back to global memory
+    // Hessian = H00 H01 H02 H11 H12 H22
+    // Matrix = [0 1 2]
+    //          [1 3 4]
+    //          [2 4 5]
+    // laplacian = Trace(GGt*Hessian)
+    mygrad_lapl[off+6*row_stride] = 
+      (GGt[0][0]*h00 + GGt[1][0]*h01 + GGt[2][0]*h02 +
+       GGt[0][1]*h01 + GGt[1][1]*h11 + GGt[2][1]*h12 +
+       GGt[0][2]*h02 + GGt[1][2]*h12 + GGt[2][2]*h22);
+  }
+}
+
+
+// Host launcher for eval_multi_multi_UBspline_3d_z_vgl_kernel (values,
+// gradients and Laplacian).
+//
+// Grid: x = 2*num_splines / SPLINE_BLOCK_SIZE rounded up, y = num; each
+// block has SPLINE_BLOCK_SIZE threads.  The call waits for the kernel to
+// finish, then prints the CUDA error string and aborts if an error was
+// recorded.
+// NOTE(review): multi_bspline_eval_cuda.h declares vals_d and grad_lapl_d as
+// complex_double *[]; this definition takes double *[].
+extern "C" void
+eval_multi_multi_UBspline_3d_z_vgl_cuda 
+(const multi_UBspline_3d_z_cuda *spline, double *pos_d, double *Linv_d, 
+ double *vals_d[], double *grad_lapl_d[], int num, int row_stride)
+{
+  // A leftover remainder needs one additional, partially used block.
+  const bool partialBlock = (2*spline->num_splines % SPLINE_BLOCK_SIZE) != 0;
+
+  dim3 threads(SPLINE_BLOCK_SIZE);
+  dim3 blocks(2*spline->num_splines/SPLINE_BLOCK_SIZE + (partialBlock ? 1 : 0), num);
+
+  eval_multi_multi_UBspline_3d_z_vgl_kernel<<<blocks,threads>>>
+    (pos_d, spline->gridInv, (double*)spline->coefs, Linv_d, (double**)vals_d,
+     (double**)grad_lapl_d, spline->dim, spline->stride, spline->num_splines, row_stride);
+
+  cudaThreadSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+
+  fprintf (stderr, "CUDA error in eval_multi_multi_UBspline_3d_z_vgl_cuda:\n  %s\n",
+	   cudaGetErrorString(status));
+  abort();
+}
+
+
+
+
+
+
+
+#endif
--- a/src/einspline/multi_bspline_eval_c.h
+++ b/src/einspline/multi_bspline_eval_c.h
@ -0,0 +1,115 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef MULTI_BSPLINE_EVAL_C_H
+#define MULTI_BSPLINE_EVAL_C_H
+
+#include <math.h>
+#include <stdio.h>
+#include "multi_bspline_structs.h"
+
+/************************************************************/
+/* 1D float-precision, complex evaluation functions         */
+/************************************************************/
+/* Each function takes the spline, the evaluation coordinate x and   */
+/* caller-provided output arrays.                                    */
+/* NOTE(review): judging by the parameter names, the suffixes appear */
+/* to mean: none = values only, _vg = values + gradients,            */
+/* _vgl = values + gradients + Laplacians, _vgh = values +           */
+/* gradients + Hessians.  Required output lengths are not stated in  */
+/* this header -- confirm against the implementation.                */
+void
+eval_multi_UBspline_1d_c (const multi_UBspline_1d_c *spline,
+			  double x,
+			  complex_float* restrict vals);
+
+void
+eval_multi_UBspline_1d_c_vg (const multi_UBspline_1d_c *spline,
+			     double x,
+			     complex_float* restrict vals,
+			     complex_float* restrict grads);
+
+
+void
+eval_multi_UBspline_1d_c_vgl (const multi_UBspline_1d_c *spline,
+			      double x,
+			      complex_float* restrict vals,
+			      complex_float* restrict grads,
+			      complex_float* restrict lapl);
+
+
+void
+eval_multi_UBspline_1d_c_vgh (const multi_UBspline_1d_c *spline,
+			      double x,
+			      complex_float* restrict vals,
+			      complex_float* restrict grads,
+			      complex_float* restrict hess);
+
+
+/************************************************************/
+/* 2D float-precision, complex evaluation functions         */
+/************************************************************/
+/* Each function takes the spline, the evaluation point (x, y) and   */
+/* caller-provided output arrays.                                    */
+/* NOTE(review): judging by the parameter names, the suffixes appear */
+/* to mean: none = values only, _vg = values + gradients,            */
+/* _vgl = values + gradients + Laplacians, _vgh = values +           */
+/* gradients + Hessians.  Required output lengths are not stated in  */
+/* this header -- confirm against the implementation.                */
+void
+eval_multi_UBspline_2d_c (const multi_UBspline_2d_c *spline,
+			  double x, double y,
+			  complex_float* restrict vals);
+
+void
+eval_multi_UBspline_2d_c_vg (const multi_UBspline_2d_c *spline,
+			     double x, double y,
+			     complex_float* restrict vals,
+			     complex_float* restrict grads);
+
+void
+eval_multi_UBspline_2d_c_vgl (const multi_UBspline_2d_c *spline,
+			      double x, double y,
+			      complex_float* restrict vals,
+			      complex_float* restrict grads,
+			      complex_float* restrict lapl);
+
+void
+eval_multi_UBspline_2d_c_vgh (const multi_UBspline_2d_c *spline,
+			      double x, double y,
+			      complex_float* restrict vals,
+			      complex_float* restrict grads,
+			      complex_float* restrict hess);
+
+
+/************************************************************/
+/* 3D float-precision, complex evaluation functions         */
+/************************************************************/
+/* Each function takes the spline, the evaluation point (x, y, z)    */
+/* and caller-provided output arrays.                                */
+/* NOTE(review): judging by the parameter names, the suffixes appear */
+/* to mean: none = values only, _vg = values + gradients,            */
+/* _vgl = values + gradients + Laplacians, _vgh = values +           */
+/* gradients + Hessians.  Required output lengths are not stated in  */
+/* this header -- confirm against the implementation.                */
+void
+eval_multi_UBspline_3d_c (const multi_UBspline_3d_c *spline,
+			  double x, double y, double z,
+			  complex_float* restrict vals);
+
+void
+eval_multi_UBspline_3d_c_vg (const multi_UBspline_3d_c *spline,
+			      double x, double y, double z,
+			      complex_float* restrict vals,
+			     complex_float* restrict grads);
+
+void
+eval_multi_UBspline_3d_c_vgl (const multi_UBspline_3d_c *spline,
+			      double x, double y, double z,
+			      complex_float* restrict vals,
+			      complex_float* restrict grads,
+			      complex_float* restrict lapl);
+
+void
+eval_multi_UBspline_3d_c_vgh (const multi_UBspline_3d_c *spline,
+			      double x, double y, double z,
+			      complex_float* restrict vals,
+			      complex_float* restrict grads,
+			      complex_float* restrict hess);
+#endif
--- a/src/einspline/multi_bspline_eval_cuda.h
+++ b/src/einspline/multi_bspline_eval_cuda.h
@ -0,0 +1,140 @@
+#ifndef MULTI_BSPLINE_EVAL_CUDA_H
+#define MULTI_BSPLINE_EVAL_CUDA_H
+
+#include "multi_bspline_structs_cuda.h"
+
+
+////////
+// 1D //
+////////
+// Batched CUDA evaluation entry points with C linkage.  Each takes the
+// spline, a position array, per-position output arrays and the number of
+// positions `num`.
+// NOTE(review): the _d suffix suggests device pointers, and "_vgl"
+// presumably means values + gradients + Laplacian -- confirm against the
+// implementations, which are not part of this header.
+
+// Single-precision real
+extern "C" void
+eval_multi_multi_UBspline_1d_s_cuda 
+(const multi_UBspline_1d_s_cuda *spline, float *pos_d, float *vals_d[], int num);
+
+// NOTE(review): unlike the other _vgl declarations in this header, this one
+// takes separate grads_d and lapl_d arrays and no row_stride.
+extern "C" void
+eval_multi_multi_UBspline_1d_s_vgl_cuda
+(const multi_UBspline_1d_s_cuda *spline, float *pos_d, 
+ float *vals_d[], float *grads_d[], float *lapl_d[], int num);
+
+// Double-precision real
+extern "C" void
+eval_multi_multi_UBspline_1d_d_cuda 
+(const multi_UBspline_1d_d_cuda *spline, double *pos_d, double *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_1d_d_vgl_cuda
+(const multi_UBspline_1d_d_cuda *spline, double *pos_d, 
+ double *vals_d[], double *grad_lapl_d[], int num, int row_stride);
+
+
+// Single-precision complex
+extern "C" void
+eval_multi_multi_UBspline_1d_c_cuda 
+(const multi_UBspline_1d_c_cuda *spline, 
+ float *pos_d, complex_float *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_1d_c_vgl_cuda
+(const multi_UBspline_1d_c_cuda *spline, float *pos_d, 
+ complex_float *vals_d[], complex_float *grad_lapl_d[], int num, int row_stride);
+
+
+// Double-precision complex
+extern "C" void
+eval_multi_multi_UBspline_1d_z_cuda 
+(const multi_UBspline_1d_z_cuda *spline, 
+ double *pos_d, complex_double *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_1d_z_vgl_cuda
+(const multi_UBspline_1d_z_cuda *spline, double *pos_d, 
+ complex_double *vals_d[], complex_double *grad_lapl_d[], int num, int row_stride);
+
+
+
+
+////////
+// 3D //
+////////
+// Batched CUDA evaluation entry points with C linkage.  Each takes the
+// spline, a position array, per-position output arrays and the number of
+// positions `num`.  The _vgl variants additionally take Linv_d and a
+// row_stride for the combined grad_lapl_d output.
+// NOTE(review): the _d suffix suggests device pointers; "_vgh" / "_vgl"
+// presumably mean values + gradients + Hessian / Laplacian -- confirm
+// against the implementations.
+
+// Single-precision real
+extern "C" void
+eval_multi_multi_UBspline_3d_s_cuda 
+(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_s_sign_cuda 
+(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, 
+ float *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_s_vgh_cuda 
+(const multi_UBspline_3d_s_cuda *spline,
+ float *pos_d, float *vals_d[], float *grads_d[], float *hess_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_s_vgl_cuda
+(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *Linv_d, 
+ float *vals_d[], float *grad_lapl_d[], int num, int row_stride);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_s_vgl_sign_cuda
+(const multi_UBspline_3d_s_cuda *spline, float *pos_d, float *sign_d, float *Linv_d, 
+ float *vals_d[], float *grad_lapl_d[], int num, int row_stride);
+
+
+
+// Double-precision real
+extern "C" void
+eval_multi_multi_UBspline_3d_d_cuda 
+(const multi_UBspline_3d_d_cuda *spline, double *pos_d, double *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_d_vgh_cuda 
+(const multi_UBspline_3d_d_cuda *spline,
+ double *pos_d, double *vals_d[], double *grads_d[], double *hess_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_d_vgl_cuda
+(const multi_UBspline_3d_d_cuda *spline, double *pos_d, double *Linv_d, 
+ double *vals_d[], double *grad_lapl_d[], int num, int row_stride);
+
+
+// Single-precision complex
+extern "C" void
+eval_multi_multi_UBspline_3d_c_cuda 
+(const multi_UBspline_3d_c_cuda *spline, 
+ float *pos_d, complex_float *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_c_vgh_cuda 
+(const multi_UBspline_3d_c_cuda *spline, float *pos_d, 
+ complex_float *vals_d[], complex_float *grads_d[], 
+ complex_float *hess_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_c_vgl_cuda
+(const multi_UBspline_3d_c_cuda *spline, float *pos_d, float *Linv_d, 
+ complex_float *vals_d[], complex_float *grad_lapl_d[], int num, int row_stride);
+
+
+// Double-precision complex
+// NOTE(review): the definition in multi_bspline_cuda_z_impl.h takes
+// double *vals_d[] rather than complex_double *vals_d[].
+extern "C" void
+eval_multi_multi_UBspline_3d_z_cuda 
+(const multi_UBspline_3d_z_cuda *spline, 
+ double *pos_d, complex_double *vals_d[], int num);
+
+extern "C" void
+eval_multi_multi_UBspline_3d_z_vgh_cuda 
+(const multi_UBspline_3d_z_cuda *spline, double *pos_d, 
+ complex_double *vals_d[], complex_double *grads_d[], 
+ complex_double *hess_d[], int num);
+
+// NOTE(review): the definition in multi_bspline_cuda_z_impl.h takes
+// double *vals_d[] and double *grad_lapl_d[] rather than complex_double.
+extern "C" void
+eval_multi_multi_UBspline_3d_z_vgl_cuda
+(const multi_UBspline_3d_z_cuda *spline, double *pos_d, double *Linv_d, 
+ complex_double *vals_d[], complex_double *grad_lapl_d[], int num, int row_stride);
+
+
+#endif
--- a/src/einspline/multi_bspline_eval_cuda_c.cu
+++ b/src/einspline/multi_bspline_eval_cuda_c.cu
@ -0,0 +1,459 @@
+#define BLOCK_SIZE 64
+
+#include <stdio.h>
+#include <pthread.h>
+#include <cuda.h>
+#include <cutil.h>
+#include <multithreading.h>
+
+// Kernel: evaluates N splines at one fixed grid cell using precomputed
+// weights.
+//
+//   coefs       coefficient array, read-only
+//   abc         64 weights, indexed 16*i+4*j+k for i, j, k in 0..3
+//   vals        output, one float per spline
+//   ix, iy, iz  base grid indices of the 4x4x4 neighbourhood
+//   xs, ys, zs  strides of the three grid axes in `coefs`
+//   N           number of splines; threads with offset >= N write nothing
+//
+// Each thread handles the spline at offset blockIdx.x*BLOCK_SIZE+threadIdx.x.
+// The block must have exactly 64 threads: each thread copies one weight into
+// the 64-entry shared array.
+__global__ void 
+eval_multi_UBspline_3d_cuda_c (const float *coefs, float *abc, float *vals,
+			       int ix, int iy, int iz,
+			       int xs, int ys, int zs, int N)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int offset = block*BLOCK_SIZE+thr;
+
+  // Cache the weights in shared memory; every thread reads all 64 of them.
+  __shared__ float abcs[64];
+  abcs[thr] = abc[thr];
+  
+  __syncthreads();
+
+  // Bounds check placed after the barrier so that all threads reach it.
+  if (offset >= N)
+    return;
+
+  float val= 0.0;
+  for (int i=0; i<4; i++)
+    for (int j=0; j<4; j++) {
+      for (int k=0; k<4; k++) {	
+	// coefs is const, so the derived pointer has to be const as well.
+	const float *base_addr = coefs + (ix+i)*xs + (iy+j)*ys + (iz+k)*zs;
+	val += abcs[16*i+4*j+k] * base_addr[offset];	
+      }
+    }
+  vals[offset] = val;
+}
+
+
+__constant__ float A[16], dA[16], d2A[16];
+
+// Kernel: evaluates a batch of 3D splines, with separate real and imaginary
+// coefficient arrays, at a batch of positions.
+//
+//   pos         positions, four floats per position (x, y, z and one unused
+//               slot); read in chunks of 64 floats covering 16 positions
+//   drInv       per-axis factor applied to a coordinate to get grid units
+//   coefs_real, coefs_imag   coefficient arrays, addressed via `strides`
+//   vals_real, vals_imag     outputs, written at offset + ir*128
+//   strides     strides of the three grid axes in the coefficient arrays
+//
+// blockIdx.y selects the position, blockIdx.x*BLOCK_SIZE+threadIdx.x the
+// spline.  The block must have exactly 64 threads (pos_s and abc are filled
+// one entry per thread).  The weight table A is a __constant__ array declared
+// at file scope.
+// NOTE(review): the output stride 128 is hard-coded and presumably equals
+// the number of splines used by the test driver -- confirm.
+// NOTE(review): the cell index is not clamped to the grid here.
+__global__ static void
+eval_multi_multi_UBspline_3d_cuda_c (float *pos, float3 drInv, 
+				     const float *coefs_real,const  float *coefs_imag,
+				     float *vals_real, float *vals_imag, 
+				     int3 strides)
+{
+  int block = blockIdx.x;
+  int thr   = threadIdx.x;
+  int ir    = blockIdx.y;
+  int offset = block*BLOCK_SIZE+thr;
+
+  __shared__ float abc[64];
+
+  // Load the 64-float chunk of `pos` that contains position ir, then pick
+  // its three coordinates out of shared memory.
+  __shared__ float pos_s[BLOCK_SIZE];
+  int ir1 = (ir >> 4)*64;
+  int ir2 = (ir & 15)*4;
+  pos_s[thr] = pos[ir1+thr];
+  __syncthreads();
+  float3 r;
+  r.x = pos_s[ir2+0];
+  r.y = pos_s[ir2+1];
+  r.z = pos_s[ir2+2];
+  
+  int3 index;
+  float3 t;
+  float s, sf;
+  float4 tp[3];
+
+  // Per axis: integer cell index and s - floor(s).
+  s = r.x * drInv.x;
+  sf = floor(s);
+  index.x = (int)sf;
+  t.x = s - sf;
+
+  s = r.y * drInv.y;
+  sf = floor(s);
+  index.y = (int)sf;
+  t.y = s - sf;
+
+  s = r.z * drInv.z;
+  sf = floor(s);
+  index.z = (int)sf;
+  t.z = s - sf;
+  
+  // Powers of t per axis, lowest first: (1, t, t^2, t^3).
+  tp[0] = make_float4(1.0, t.x, t.x*t.x, t.x*t.x*t.x);
+  tp[1] = make_float4(1.0, t.y, t.y*t.y, t.y*t.y*t.y);
+  tp[2] = make_float4(1.0, t.z, t.z*t.z, t.z*t.z*t.z);
+
+  // Thread 0 computes the four 1D weights per axis for the whole block.
+  __shared__ float a[4], b[4], c[4];
+  if (thr == 0) {
+    a[0] = A[ 0]*tp[0].x + A[ 1]*tp[0].y + A[ 2]*tp[0].z + A[ 3]*tp[0].w;
+    a[1] = A[ 4]*tp[0].x + A[ 5]*tp[0].y + A[ 6]*tp[0].z + A[ 7]*tp[0].w;
+    a[2] = A[ 8]*tp[0].x + A[ 9]*tp[0].y + A[10]*tp[0].z + A[11]*tp[0].w;
+    a[3] = A[12]*tp[0].x + A[13]*tp[0].y + A[14]*tp[0].z + A[15]*tp[0].w;
+    
+    b[0] = A[ 0]*tp[1].x + A[ 1]*tp[1].y + A[ 2]*tp[1].z + A[ 3]*tp[1].w;
+    b[1] = A[ 4]*tp[1].x + A[ 5]*tp[1].y + A[ 6]*tp[1].z + A[ 7]*tp[1].w;
+    b[2] = A[ 8]*tp[1].x + A[ 9]*tp[1].y + A[10]*tp[1].z + A[11]*tp[1].w;
+    b[3] = A[12]*tp[1].x + A[13]*tp[1].y + A[14]*tp[1].z + A[15]*tp[1].w;
+    
+    c[0] = A[ 0]*tp[2].x + A[ 1]*tp[2].y + A[ 2]*tp[2].z + A[ 3]*tp[2].w;
+    c[1] = A[ 4]*tp[2].x + A[ 5]*tp[2].y + A[ 6]*tp[2].z + A[ 7]*tp[2].w;
+    c[2] = A[ 8]*tp[2].x + A[ 9]*tp[2].y + A[10]*tp[2].z + A[11]*tp[2].w;
+    c[3] = A[12]*tp[2].x + A[13]*tp[2].y + A[14]*tp[2].z + A[15]*tp[2].w;
+  }
+  // a, b and c are written by thread 0 only but read by every thread below;
+  // without this barrier other threads could read them before they are set.
+  __syncthreads();
+
+  // Each thread fills one entry of the 4x4x4 table of weight products.
+  int i = (thr>>4)&3;
+  int j = (thr>>2)&3;
+  int k = (thr & 3);
+  
+  abc[thr] = a[i]*b[j]*c[k];
+  __syncthreads();
+
+  // Weighted sum over the 4x4x4 neighbourhood for both components.
+  float val_real = 0.0;
+  float val_imag = 0.0;
+  for (int ii=0; ii<4; ii++) {
+    for (int jj=0; jj<4; jj++) {
+      // The coefficient arrays are const, so the derived pointers are too.
+      const float *base_real = coefs_real + (index.x+ii)*strides.x + (index.y+jj)*strides.y + index.z*strides.z;
+      const float *base_imag = coefs_imag + (index.x+ii)*strides.x + (index.y+jj)*strides.y + index.z*strides.z;
+      for (int kk=0; kk<4; kk++) {
+	val_real += abc[16*ii+4*jj+kk] * base_real[offset+kk*strides.z];
+	val_imag += abc[16*ii+4*jj+kk] * base_imag[offset+kk*strides.z];
+      }
+    }
+  }
+  vals_real[offset+ir*128] = val_real;
+  vals_imag[offset+ir*128] = val_imag;
+}
+				    
+
+
+// __global__ void 
+// eval_multi_UBspline_3d_cuda_c2 (float3 r,
+// 				float *coefs, float *vals,
+// 				int xs, int ys, int zs, int N)
+// {
+//   int block = blockIdx.x;
+//   int thr   = threadIdx.x;
+
+//   __shared__ float abcs[64];
+//   abcs[thr] = abc[thr];
+
+//   float dxInv = 0.0625f;
+//   float v, dv;
+
+//   v = floor(dxInv*r.x);
+//   dv = dxInv*r.x - v;
+//   int ix = (int) v;
+
+//   v = floor(dxInv*r.x);
+//   dv = dxInv*r.x - v;
+//   int iy = (int) v;
+
+//   v = floor(dxInv*r.y);
+//   dv = dxInv*r.y - v;
+//   int iz = (int) v;
+
+//   int offset = block*BLOCK_SIZE+thr;
+//   __shared__ float abcs[64];
+//   abcs[thr] = abc[thr];
+  
+
+//   float val= 0.0;
+//   //int index=0;
+//   val = 0.0;
+//   for (int i=0; i<4; i++)
+//     for (int j=0; j<4; j++)
+//       for (int k=0; k<4; k++) {
+// 	float *base_addr = coefs + (ix+i)*xs + (iy+j)*ys + (iz+k)*zs;
+// 	//val += abc[(16*i+4*j+k)*BLOCK_SIZE + thr] * base_addr[offset];
+// 	val += abcs[16*i+4*j+k] * base_addr[offset];	
+// 	//index++;
+//       }
+//   vals[offset] = val;
+// }
+
+
+// Stand-alone benchmark and sanity check for eval_multi_UBspline_3d_cuda_c.
+//
+// Fills a coefficient array and 64 weights with random numbers, launches the
+// kernel 100000 times at a fixed grid cell, prints the throughput to stderr,
+// then recomputes the same sums on the host and prints the first N/256
+// device/host value pairs side by side for visual comparison.
+// Return codes of the allocation and CUDA calls are not checked.
+void
+test_cuda()
+{
+  float *coefs  , *abc  , *vals;
+  float *coefs_d, *abc_d, *vals_d;
+  int xs, ys, zs, N;
+  int Nx, Ny, Nz;
+
+  N = 4096;
+  Nx = Ny = Nz = 16;
+  // NOTE(review): these strides do not include the factor N although the
+  // innermost index runs over N splines, so distinct (ix,iy,iz,n) tuples
+  // alias the same element.  Host and device use the same indexing, so the
+  // comparison below still agrees -- confirm whether xs = Ny*Nz*N etc. was
+  // intended.
+  xs = Nx*Ny*Nz;
+  ys = Ny*Nz;
+  zs = Nz;
+  
+  int size = Nx*Ny*Nz*N*sizeof(float);
+  posix_memalign((void**)&coefs, 16, size);
+  cudaMalloc((void**)&coefs_d, size);
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+	for (int n=0; n<N; n++)
+	  coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
+  cudaMemcpy(coefs_d, coefs, size, cudaMemcpyHostToDevice);
+
+  // The kernel reads exactly 64 weights, so that is all that is allocated.
+  posix_memalign ((void**)&abc, 16, 64*sizeof(float));
+  cudaMalloc((void**)&abc_d, 64*sizeof(float));
+  for (int i=0; i<64; i++)
+    abc[i] = drand48();
+  cudaMemcpy(abc_d, abc, 64*sizeof(float), 
+	     cudaMemcpyHostToDevice);
+
+  posix_memalign((void**)&vals, 16, N*sizeof(float));
+  cudaMalloc((void**)&vals_d, N*sizeof(float));
+
+  dim3 dimBlock(BLOCK_SIZE);
+  dim3 dimGrid(N/BLOCK_SIZE);
+
+  int ix=1; 
+  int iy=2;
+  int iz=3;
+  
+  clock_t start, end;
+  start = clock();
+  for (int i=0; i<100000; i++) {
+    eval_multi_UBspline_3d_cuda_c<<<dimGrid,dimBlock>>> 
+      (coefs_d, abc_d, vals_d, ix, iy, iz, xs, ys, zs, N);
+  }
+  // Kernel launches are asynchronous; wait for them to finish so the timer
+  // covers execution and not just the launch overhead.
+  cudaThreadSynchronize();
+  end = clock();
+  // Form the denominator in double precision so it cannot overflow an
+  // integer type.
+  double time = (double)(end-start)/((double)CLOCKS_PER_SEC*100000.0*(double)N);
+  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/time);
+
+  cudaMemcpy (vals, vals_d, N*sizeof(float), cudaMemcpyDeviceToHost);
+
+  // Host reference: the same 64-term sum for every spline.
+  float vals2[N];
+  
+  for (int n=0; n<N; n++) {
+    vals2[n] = 0.0;
+    int index=0;
+    for(int i=0; i<4; i++)
+      for (int j=0; j<4; j++)
+	for (int k=0; k<4; k++)  {
+	  vals2[n] += abc[index] * coefs[(ix+i)*xs+(iy+j)*ys+(iz+k)*zs+n];
+	  index++;
+	}
+  }
+
+
+  for (int i=0; i<N/256; i++)	
+    fprintf (stderr, "%1.9f %1.9f\n", vals[i], vals2[i]); 
+
+
+  cudaFree(abc_d);
+  cudaFree(coefs_d);
+  cudaFree(vals_d);
+
+  // Release the host buffers obtained from posix_memalign.
+  free(abc);
+  free(coefs);
+  free(vals);
+}
+
+
+// Benchmark driver: times repeated launches of
+// eval_multi_multi_UBspline_3d_cuda_c on the CUDA device selected by `thread`.
+//
+// thread: device ordinal passed through a void* (pthread-style signature);
+//         it is cast back to int for cudaSetDevice.
+// Returns NULL in all cases; progress and the throughput estimate go to stderr.
+//
+// Buffers set up below:
+//   coefs       Nx*Ny*Nz*N random floats on the host, uploaded twice
+//               (coefs_real_d and coefs_imag_d receive the same data).
+//   valBlock_d  2*N*numWalkers floats of device output storage.
+//   r_h / r_d   4 floats per walker; the first three get random values.
+static void *
+test_multi_cuda(void *thread)
+{
+//   CUcontext ctx;
+//   CUdevice dev;
+//   cuDeviceGet (&dev, (int)(size_t)thread);
+//   cuCtxCreate(&ctx, CU_CTX_SCHED_YIELD, dev);
+
+//   int deviceCount;
+//   cudaGetDeviceCount(&deviceCount);
+
+  CUDA_SAFE_CALL(cudaSetDevice((int)(size_t)thread));
+  fprintf (stderr, "In thread %p\n", thread);
+
+
+  // const so the per-walker pointer tables below are fixed-size arrays
+  // rather than variable-length arrays.
+  const int numWalkers = 2000;
+  // Host-side tables holding one device address per walker.
+  float *coefs, *vals_real[numWalkers], *vals_imag[numWalkers];
+  // Device-resident copies of those tables.  These are plain pointers
+  // because cudaMalloc stores the allocation address in them and
+  // cudaMemcpy/cudaFree consume that address afterwards.
+  float *coefs_real_d, *coefs_imag_d, **vals_real_d, **vals_imag_d;
+  float *r_d, *r_h;
+  int xs, ys, zs, N;
+  int Nx, Ny, Nz;
+
+  N = 128;
+  Nx = Ny = Nz = 64;
+  xs = Ny*Nz*N;
+  ys = Nz*N;
+  zs = N;
+
+  float3 drInv;
+  drInv.x = 1.0/float(Nx);
+  drInv.y = 1.0/float(Ny);
+  drInv.z = 1.0/float(Nz);
+
+  // Setup Bspline coefficients
+  int size = Nx*Ny*Nz*N*sizeof(float);
+  // posix_memalign reports success as 0 and failure as an error number,
+  // so its result is tested directly.
+  if (posix_memalign((void**)&coefs, 16, size) != 0) {
+    fprintf (stderr, "Failed to allocate %d bytes for coefs.\n", size);
+    return NULL;
+  }
+  for (int ix=0; ix<Nx; ix++)
+    for (int iy=0; iy<Ny; iy++)
+      for (int iz=0; iz<Nz; iz++)
+	for (int n=0; n<N; n++)
+	  coefs[ix*xs + iy*ys + iz*zs + n] = drand48();
+
+
+  fprintf (stderr, "Filled in coefs.\n");
+  fprintf (stderr, "size = %d\n", size);
+
+  // Setup CUDA coefficients
+  fprintf (stderr, "Before first CUDA mallocs.\n");
+  CUDA_SAFE_CALL(cudaMalloc((void**)&coefs_real_d, size));
+  CUDA_SAFE_CALL(cudaMalloc((void**)&coefs_imag_d, size));
+  fprintf (stderr, "Before Memcpy.\n");
+  CUDA_SAFE_CALL(cudaMemcpy(coefs_real_d, coefs, size, cudaMemcpyHostToDevice));
+  CUDA_SAFE_CALL(cudaMemcpy(coefs_imag_d, coefs, size, cudaMemcpyHostToDevice));
+  fprintf (stderr, "After Memcpy.\n");
+
+  // Setup device value storage
+  int numVals = 2*N*numWalkers;
+  float *valBlock_d, *valBlock_h;
+  CUDA_SAFE_CALL(cudaMalloc((void**)&(valBlock_d), numVals*sizeof(float)));
+  CUDA_SAFE_CALL(cudaMallocHost((void**)&(valBlock_h), numVals*sizeof(float)));
+  CUDA_SAFE_CALL(cudaMalloc((void**)&(vals_real_d), numWalkers*sizeof(float*)));
+  CUDA_SAFE_CALL(cudaMalloc((void**)&(vals_imag_d), numWalkers*sizeof(float*)));
+  fprintf (stderr, "valBlock_d = %p\n", valBlock_d);
+  // In these tables walker i gets 2*N consecutive floats: N real, then N imag.
+  for (int i=0; i<numWalkers; i++) {
+    vals_real[i] = valBlock_d + 2*i*N;
+    vals_imag[i] = valBlock_d + (2*i+1)*N;
+  }
+  CUDA_SAFE_CALL(cudaMemcpy(vals_real_d, vals_real, numWalkers*sizeof(float*), cudaMemcpyHostToDevice));
+  CUDA_SAFE_CALL(cudaMemcpy(vals_imag_d, vals_imag, numWalkers*sizeof(float*), cudaMemcpyHostToDevice));
+
+  fprintf (stderr, "Finished cuda allocations.\n");
+
+
+  // Setup walker positions
+  CUDA_SAFE_CALL(cudaMalloc((void**)&(r_d),     4*numWalkers*sizeof(float)));
+  CUDA_SAFE_CALL(cudaMallocHost((void**)&(r_h), 4*numWalkers*sizeof(float)));
+
+  for (int ir=0; ir<numWalkers; ir++) {
+    r_h[4*ir+0] = 0.75*drand48();
+    r_h[4*ir+1] = 0.75*drand48();
+    r_h[4*ir+2] = 0.75*drand48();
+    // Fourth slot is part of every upload; zero it instead of sending garbage.
+    r_h[4*ir+3] = 0.0f;
+  }
+
+
+  int3 strides;
+  strides.x = xs;
+  strides.y = ys;
+  strides.z = zs;
+
+
+  dim3 dimBlock(BLOCK_SIZE);
+  dim3 dimGrid(N/BLOCK_SIZE,numWalkers);
+
+  clock_t start, end;
+  start = clock();
+
+  for (int i=0; i<10000; i++) {
+    if ((i%1000) == 0) 
+      fprintf (stderr, "i = %d\n", i);
+    CUDA_SAFE_CALL(cudaMemcpy(r_d, r_h, 4*numWalkers*sizeof(float), cudaMemcpyHostToDevice));
+    // Alternative launch that takes the per-walker pointer tables instead:
+    // eval_multi_multi_UBspline_3d_cuda_c<<<dimGrid,dimBlock>>> 
+    //   (r_d, drInv, coefs_real_d, coefs_imag_d, 
+    //    vals_real_d, vals_imag_d, strides);
+    eval_multi_multi_UBspline_3d_cuda_c<<<dimGrid,dimBlock>>> 
+      (r_d, drInv, coefs_real_d, coefs_imag_d, 
+       valBlock_d, valBlock_d+numVals/2, strides);
+    //cudaMemcpy(valBlock_h, valBlock_d, numVals*sizeof(float), cudaMemcpyDeviceToHost);
+  }
+  end = clock();
+  double secPerEval = (double)(end-start)/(double)((double)CLOCKS_PER_SEC*(double)10000*N*numWalkers);
+  fprintf (stderr, "Evals per second = %1.8e\n", 1.0/secPerEval);
+
+  cudaFree (valBlock_d);
+  cudaFree (vals_real_d);
+  cudaFree (vals_imag_d);
+  cudaFree (coefs_real_d);
+  cudaFree (coefs_imag_d);
+  cudaFree (r_d);
+  // Host side: cudaMallocHost buffers need cudaFreeHost, posix_memalign needs free.
+  cudaFreeHost (valBlock_h);
+  cudaFreeHost (r_h);
+  free (coefs);
+
+  return NULL;
+}
+
+
+// Entry point: reports each CUDA device's basic properties on stderr,
+// then runs test_multi_cuda on device 0.
+int main()
+{
+  int deviceCount = 0;
+  cudaGetDeviceCount(&deviceCount);
+  fprintf (stderr, "Detected %d CUDA devices.\n", deviceCount);
+  // Nothing to benchmark without a device (also covers a failed query).
+  if (deviceCount < 1) return 1;
+
+  for (int device = 0; device < deviceCount; ++device) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, device);
+    fprintf (stderr, "Device %d:\n", device);
+    // totalGlobalMem and totalConstMem are size_t, so print them with %zu.
+    fprintf (stderr, "  Global memory:     %10zu\n",
+	     deviceProp.totalGlobalMem);
+    fprintf (stderr, "  MultiProcessors:   %10d\n",
+	     deviceProp.multiProcessorCount);
+    fprintf (stderr, "  Registers:         %10d\n", 
+	     deviceProp.regsPerBlock);
+    fprintf (stderr, "  Constant memory:   %10zu\n", 
+	     deviceProp.totalConstMem);
+  }
+
+  //  pthread_t threads[deviceCount];
+
+  // for (int device = 0; device < deviceCount; device++) 
+  //   pthread_create (&(threads[device]), NULL, test_multi_cuda, (void*)device);
+  // cutStartThread((CUT_THREADROUTINE)test_multi_cuda,(void*)device);
+  test_multi_cuda((void*)0);
+
+  return 0;
+}
--- a/src/einspline/multi_bspline_eval_d.h
+++ b/src/einspline/multi_bspline_eval_d.h
@ -0,0 +1,120 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef MULTI_BSPLINE_EVAL_D_H
+#define MULTI_BSPLINE_EVAL_D_H
+
+#include <math.h>
+#include <stdio.h>
+#include "multi_bspline_structs.h"
+
+/************************************************************/
+/* 1D double-precision, real evaluation functions           */
+/************************************************************/
+void
+eval_multi_UBspline_1d_d (const multi_UBspline_1d_d *spline,
+			  double x,
+			  double* restrict vals);
+
+void
+eval_multi_UBspline_1d_d_vg (const multi_UBspline_1d_d *spline,
+			     double x,
+			     double* restrict vals,
+			     double* restrict grads);
+
+void
+eval_multi_UBspline_1d_d_vgl (const multi_UBspline_1d_d *spline,
+			      double x,
+			      double* restrict vals,
+			      double* restrict grads,
+			      double* restrict lapl);
+
+void
+eval_multi_UBspline_1d_d_vgh (const multi_UBspline_1d_d *spline,
+			      double x,
+			      double* restrict vals,
+			      double* restrict grads,
+			      double* restrict hess);
+
+/************************************************************/
+/* 2D double-precision, real evaluation functions           */
+/************************************************************/
+void
+eval_multi_UBspline_2d_d (const multi_UBspline_2d_d *spline,
+			  double x, double y,
+			  double* restrict vals);
+
+void
+eval_multi_UBspline_2d_d_vg (const multi_UBspline_2d_d *spline,
+			     double x, double y,
+			     double* restrict vals,
+			     double* restrict grads);
+
+void
+eval_multi_UBspline_2d_d_vgl (const multi_UBspline_2d_d *spline,
+			      double x, double y,
+			      double* restrict vals,
+			      double* restrict grads,
+			      double* restrict lapl);
+
+void
+eval_multi_UBspline_2d_d_vgh (const multi_UBspline_2d_d *spline,
+			      double x, double y,
+			      double* restrict vals,
+			      double* restrict grads,
+			      double* restrict hess);
+
+/************************************************************/
+/* 3D double-precision, real evaluation functions           */
+/************************************************************/
+void
+eval_multi_UBspline_3d_d (const multi_UBspline_3d_d *spline,
+			  double x, double y, double z,
+			  double* restrict vals);
+
+void
+eval_multi_UBspline_3d_d_vg (const multi_UBspline_3d_d *spline,
+			     double x, double y, double z,
+			     double* restrict vals,
+			     double* restrict grads);
+
+void
+eval_multi_UBspline_3d_d_vgl (const multi_UBspline_3d_d *spline,
+			      double x, double y, double z,
+			      double* restrict vals,
+			      double* restrict grads,
+			      double* restrict lapl);
+
+void
+eval_multi_UBspline_3d_d_vgh (const multi_UBspline_3d_d *spline,
+			      double x, double y, double z,
+			      double* restrict vals,
+			      double* restrict grads,
+			      double* restrict hess);
+
+void
+eval_multi_UBspline_3d_d_vghgh (const multi_UBspline_3d_d *spline,
+    double x, double y, double z,
+    double* restrict vals,
+    double* restrict grads,
+    double* restrict hess,
+    double* restrict gradhess);
+
+#endif
--- a/src/einspline/multi_bspline_eval_s.h
+++ b/src/einspline/multi_bspline_eval_s.h
@ -0,0 +1,110 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef MULTI_BSPLINE_EVAL_S_H
+#define MULTI_BSPLINE_EVAL_S_H
+
+#include "multi_bspline_structs.h"
+
+/************************************************************/
+/* 1D single-precision, real evaluation functions           */
+/************************************************************/
+void
+eval_multi_UBspline_1d_s (const multi_UBspline_1d_s *spline,
+			  double x,
+			  float* restrict vals);
+
+void
+eval_multi_UBspline_1d_s_vg (const multi_UBspline_1d_s *spline,
+			     double x,
+			     float* restrict vals,
+			     float* restrict grads);
+
+void
+eval_multi_UBspline_1d_s_vgl (const multi_UBspline_1d_s *spline,
+			      double x,
+			      float* restrict vals,
+			      float* restrict grads,
+			      float* restrict lapl);
+
+
+void
+eval_multi_UBspline_1d_s_vgh (const multi_UBspline_1d_s *spline,
+			      double x,
+			      float* restrict vals,
+			      float* restrict grads,
+			      float* restrict hess);
+
+/************************************************************/
+/* 2D single-precision, real evaluation functions           */
+/************************************************************/
+void
+eval_multi_UBspline_2d_s(const multi_UBspline_2d_s *spline,
+			 double x, double y,
+			 float* restrict vals);
+
+void
+eval_multi_UBspline_2d_s_vg (const multi_UBspline_2d_s *spline,
+			     double x, double y,
+			     float* restrict vals,
+			     float* restrict grads);
+
+void
+eval_multi_UBspline_2d_s_vgl (const multi_UBspline_2d_s *spline,
+			      double x, double y,
+			      float* restrict vals,
+			      float* restrict grads,
+			      float* restrict lapl);
+
+void
+eval_multi_UBspline_2d_s_vgh (const multi_UBspline_2d_s *spline,
+			      double x, double y,
+			      float* restrict vals,
+			      float* restrict grads,
+			      float* restrict hess);
+
+/************************************************************/
+/* 3D single-precision, real evaluation functions           */
+/************************************************************/
+void
+eval_multi_UBspline_3d_s (const multi_UBspline_3d_s *spline,
+			  double x, double y, double z,
+			  float* restrict vals);
+
+void
+eval_multi_UBspline_3d_s_vg (const multi_UBspline_3d_s *spline,
+			     double x, double y, double z,
+			     float* restrict vals,
+			     float* restrict grads);
+
+void
+eval_multi_UBspline_3d_s_vgl (const multi_UBspline_3d_s *spline,
+			      double x, double y, double z,
+			      float* restrict vals,
+			      float* restrict grads,
+			      float* restrict lapl);
+
+void
+eval_multi_UBspline_3d_s_vgh (const multi_UBspline_3d_s *spline,
+			      double x, double y, double z,
+			      float* restrict vals,
+			      float* restrict grads,
+			      float* restrict hess);
+#endif
--- a/src/einspline/multi_bspline_eval_sse2_d.c
+++ b/src/einspline/multi_bspline_eval_sse2_d.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std3_d_impl.h"
--- a/src/einspline/multi_bspline_eval_sse2_d_cpp.cc
+++ b/src/einspline/multi_bspline_eval_sse2_d_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std3_d_impl.h"
--- a/src/einspline/multi_bspline_eval_sse2_d_impl.h
+++ b/src/einspline/multi_bspline_eval_sse2_d_impl.h
--- a/src/einspline/multi_bspline_eval_sse_c.c
+++ b/src/einspline/multi_bspline_eval_sse_c.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_c_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_c.h
+++ b/src/einspline/multi_bspline_eval_sse_c.h
--- a/src/einspline/multi_bspline_eval_sse_c_cpp.cc
+++ b/src/einspline/multi_bspline_eval_sse_c_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_c_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_c_impl.h
+++ b/src/einspline/multi_bspline_eval_sse_c_impl.h
--- a/src/einspline/multi_bspline_eval_sse_d.c
+++ b/src/einspline/multi_bspline_eval_sse_d.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_d_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_d.h
+++ b/src/einspline/multi_bspline_eval_sse_d.h
--- a/src/einspline/multi_bspline_eval_sse_d_cpp.cc
+++ b/src/einspline/multi_bspline_eval_sse_d_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_d_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_d_impl.h
+++ b/src/einspline/multi_bspline_eval_sse_d_impl.h
--- a/src/einspline/multi_bspline_eval_sse_s.c
+++ b/src/einspline/multi_bspline_eval_sse_s.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_s_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_s.h
+++ b/src/einspline/multi_bspline_eval_sse_s.h
--- a/src/einspline/multi_bspline_eval_sse_s_cpp.cc
+++ b/src/einspline/multi_bspline_eval_sse_s_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_s_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_s_impl.h
+++ b/src/einspline/multi_bspline_eval_sse_s_impl.h
--- a/src/einspline/multi_bspline_eval_sse_z.c
+++ b/src/einspline/multi_bspline_eval_sse_z.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_z_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_z.h
+++ b/src/einspline/multi_bspline_eval_sse_z.h
@ -0,0 +1,119 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#ifndef MULTI_BSPLINE_EVAL_SSE_Z_H
+#define MULTI_BSPLINE_EVAL_SSE_Z_H
+
+
+/************************************************************/
+/* 1D double-precision, complex evaluation functions        */
+/************************************************************/
+void
+eval_multi_UBspline_1d_z (multi_UBspline_1d_z *spline,
+			  double x,
+			  complex_double* restrict vals);
+
+void
+eval_multi_UBspline_1d_z_vg (multi_UBspline_1d_z *spline,
+			     double x,
+			     complex_double* restrict vals,
+			     complex_double* restrict grads);
+
+void
+eval_multi_UBspline_1d_z_vgl (multi_UBspline_1d_z *spline,
+			      double x,
+			      complex_double* restrict vals,
+			      complex_double* restrict grads,
+			      complex_double* restrict lapl);
+
+
+void
+eval_multi_UBspline_1d_z_vgh (multi_UBspline_1d_z *spline,
+			      double x,
+			      complex_double* restrict vals,
+			      complex_double* restrict grads,
+			      complex_double* restrict hess);
+
+
+/************************************************************/
+/* 2D double-precision, complex evaluation functions        */
+/************************************************************/
+void
+eval_multi_UBspline_2d_z (multi_UBspline_2d_z *spline,
+			  double x, double y,
+			  complex_double* restrict vals);
+
+void
+eval_multi_UBspline_2d_z_vg (multi_UBspline_2d_z *spline,
+			     double x, double y,
+			     complex_double* restrict vals,
+			     complex_double* restrict grads);
+
+void
+eval_multi_UBspline_2d_z_vgl (multi_UBspline_2d_z *spline,
+			      double x, double y,
+			      complex_double* restrict vals,
+			      complex_double* restrict grads,
+			      complex_double* restrict lapl);
+
+void
+eval_multi_UBspline_2d_z_vgh (multi_UBspline_2d_z *spline,
+			      double x, double y,
+			      complex_double* restrict vals,
+			      complex_double* restrict grads,
+			      complex_double* restrict hess);
+
+/************************************************************/
+/* 3D double-precision, complex evaluation functions        */
+/************************************************************/
+void
+eval_multi_UBspline_3d_z (multi_UBspline_3d_z *spline,
+			  double x, double y, double z,
+			  complex_double* restrict vals);
+
+void
+eval_multi_UBspline_3d_z_vg (multi_UBspline_3d_z *spline,
+			     double x, double y, double z,
+			     complex_double* restrict vals,
+			     complex_double* restrict grads);
+
+void
+eval_multi_UBspline_3d_z_vgl (multi_UBspline_3d_z *spline,
+			      double x, double y, double z,
+			      complex_double* restrict vals,
+			      complex_double* restrict grads,
+			      complex_double* restrict lapl);
+
+void
+eval_multi_UBspline_3d_z_vgh (multi_UBspline_3d_z *spline,
+			      double x, double y, double z,
+			      complex_double* restrict vals,
+			      complex_double* restrict grads,
+			      complex_double* restrict hess);
+
+void
+eval_multi_UBspline_3d_z_vghgh (multi_UBspline_3d_z *spline,
+               double x, double y, double z,
+               complex_double* restrict vals,
+               complex_double* restrict grads,
+               complex_double* restrict hess,
+               complex_double* restrict gradhess);               
+
+#endif
--- a/src/einspline/multi_bspline_eval_sse_z_cpp.cc
+++ b/src/einspline/multi_bspline_eval_sse_z_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_sse_z_impl.h"
--- a/src/einspline/multi_bspline_eval_sse_z_impl.h
+++ b/src/einspline/multi_bspline_eval_sse_z_impl.h
--- a/src/einspline/multi_bspline_eval_sse_z_unrolled.h
+++ b/src/einspline/multi_bspline_eval_sse_z_unrolled.h
--- a/src/einspline/multi_bspline_eval_std2_d.c
+++ b/src/einspline/multi_bspline_eval_std2_d.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std2_d_impl.h"
--- a/src/einspline/multi_bspline_eval_std2_d_cpp.cc
+++ b/src/einspline/multi_bspline_eval_std2_d_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std2_d_impl.h"
--- a/src/einspline/multi_bspline_eval_std2_d_impl.h
+++ b/src/einspline/multi_bspline_eval_std2_d_impl.h
--- a/src/einspline/multi_bspline_eval_std3_d.c
+++ b/src/einspline/multi_bspline_eval_std3_d.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std3_d_impl.h"
--- a/src/einspline/multi_bspline_eval_std3_d_cpp.cc
+++ b/src/einspline/multi_bspline_eval_std3_d_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std3_d_impl.h"
--- a/src/einspline/multi_bspline_eval_std3_d_impl.h
+++ b/src/einspline/multi_bspline_eval_std3_d_impl.h
--- a/src/einspline/multi_bspline_eval_std_c.c
+++ b/src/einspline/multi_bspline_eval_std_c.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std_c_impl.h"
--- a/src/einspline/multi_bspline_eval_std_c.h
+++ b/src/einspline/multi_bspline_eval_std_c.h
--- a/src/einspline/multi_bspline_eval_std_c_cpp.cc
+++ b/src/einspline/multi_bspline_eval_std_c_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std_c_impl.h"
--- a/src/einspline/multi_bspline_eval_std_c_impl.h
+++ b/src/einspline/multi_bspline_eval_std_c_impl.h
--- a/src/einspline/multi_bspline_eval_std_d.c
+++ b/src/einspline/multi_bspline_eval_std_d.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std_d_impl.h"
--- a/src/einspline/multi_bspline_eval_std_d.h
+++ b/src/einspline/multi_bspline_eval_std_d.h
--- a/src/einspline/multi_bspline_eval_std_d_cpp.cc
+++ b/src/einspline/multi_bspline_eval_std_d_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std_d_impl.h"
--- a/src/einspline/multi_bspline_eval_std_d_impl.h
+++ b/src/einspline/multi_bspline_eval_std_d_impl.h
--- a/src/einspline/multi_bspline_eval_std_s.c
+++ b/src/einspline/multi_bspline_eval_std_s.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std_s_impl.h"
--- a/src/einspline/multi_bspline_eval_std_s.h
+++ b/src/einspline/multi_bspline_eval_std_s.h
--- a/src/einspline/multi_bspline_eval_std_s_cpp.cc
+++ b/src/einspline/multi_bspline_eval_std_s_cpp.cc
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std_s_impl.h"
--- a/src/einspline/multi_bspline_eval_std_s_impl.h
+++ b/src/einspline/multi_bspline_eval_std_s_impl.h
--- a/src/einspline/multi_bspline_eval_std_z.c
+++ b/src/einspline/multi_bspline_eval_std_z.c
@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////
+//  einspline:  a library for creating and evaluating B-splines            //
+//  Copyright (C) 2007 Kenneth P. Esler, Jr.                               //
+//                                                                         //
+//  This program is free software; you can redistribute it and/or modify   //
+//  it under the terms of the GNU General Public License as published by   //
+//  the Free Software Foundation; either version 2 of the License, or      //
+//  (at your option) any later version.                                    //
+//                                                                         //
+//  This program is distributed in the hope that it will be useful,        //
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of         //
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          //
+//  GNU General Public License for more details.                           //
+//                                                                         //
+//  You should have received a copy of the GNU General Public License      //
+//  along with this program; if not, write to the Free Software            //
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor,                     //
+//  Boston, MA  02110-1301  USA                                            //
+/////////////////////////////////////////////////////////////////////////////
+
+#include "multi_bspline_eval_std_z_impl.h"
--- a/Show More
+++ b/Show More