Merge branch 'rebuild-master' into LAPW

Conflicts: src/QMCWaveFunctions/TrialWaveFunction.cpp
2017-03-23 22:15:09 -05:00 · 2017-03-23 22:15:09 -05:00 · 3b3febf5be
parent f7237bc367 edf7e1508b
commit 3b3febf5be
37 changed files with 1791 additions and 253 deletions
--- a/CMake/ctest_script.cmake
+++ b/CMake/ctest_script.cmake
@ -210,6 +210,10 @@ IF ( QMC_MIXED_PRECISION )
  SET( CTEST_OPTIONS "${CTEST_OPTIONS};-DQMC_MIXED_PRECISION='${QMC_MIXED_PRECISION}'" )
 ENDIF()

+IF ( BUILD_AFQMC )
+  SET( CTEST_OPTIONS "${CTEST_OPTIONS};-DBUILD_AFQMC='${BUILD_AFQMC}'" )
+ENDIF()
+
 IF ( BLA_VENDOR )
  SET( CTEST_OPTIONS "${CTEST_OPTIONS};-DBLA_VENDOR=${BLA_VENDOR}" )
 ENDIF()
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -407,7 +407,8 @@ ELSE(CMAKE_TOOLCHAIN_FILE)
  #-------------------------------------------------------------------
  IF(QMC_MPI)
    ## mpi compilers
-    if($ENV{CXX} MATCHES "mp")
+    GET_FILENAME_COMPONENT(BASE_CXX_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME)
+    if(${BASE_CXX_COMPILER_NAME} MATCHES "^mp")
      SET(HAVE_MPI 1)
      SET(HAVE_OOMPI 1)
      SET(MPI_FOUND TRUE)
@ -419,6 +420,7 @@ ELSE(CMAKE_TOOLCHAIN_FILE)
      include(FindMPI)
      IF(MPI_FOUND)
        MESSAGE(STATUS "Found mpi.h and libraries. Check for working mpi compiler ")
+        INCLUDE_DIRECTORIES(${MPI_CXX_INCLUDE_PATH})
        FILE(WRITE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/testmpi.cxx 
          "#include <mpi.h>"
          "int main(int argc, char** argv){MPI_Init(&argc,&argv); MPI_Finalize();return 0;}\n")
@ -426,18 +428,25 @@ ELSE(CMAKE_TOOLCHAIN_FILE)
        #${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/testmpi.cxx
        #OUTPUT_VARIABLE OUTPUT)
        set(MPI_WORKS 1)
+        SET(MPI_WARNING_LIST
+            "Building MPI version without using MPI compiler wrappers.\n"
+            "This may not build qmcpack correctly. To ensure the correct version, specify the compiler wrappers to cmake.\n"
+            "For example: cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++\n"
+            "To build without MPI, pass '-DQMC_MPI=0' to cmake")
+        MESSAGE(WARNING ${MPI_WARNING_LIST})
+
        IF(MPI_WORKS)
          MESSAGE(STATUS "MPI is enabled")
          SET(HAVE_MPI 1)
          SET(HAVE_OOMPI 1)
-          #LINK_LIBRARIES(${MPI_LIBRARY})
+          LINK_LIBRARIES(${MPI_CXX_LIBRARIES})
        ELSE(MPI_WORKS)
          MESSAGE(STATUS "MPI is disabled")
          SET(HAVE_MPI 0)
          SET(HAVE_OOMPI 0)
        ENDIF(MPI_WORKS)
      ENDIF(MPI_FOUND)
-    endif($ENV{CXX} MATCHES "mp")
+    ENDIF()
  ENDIF(QMC_MPI)

  #-------------------------------------------------------------------
@ -519,10 +528,9 @@ IF(CXX11_NEEDED)
  ENDIF()
 ENDIF(CXX11_NEEDED)

-# AFQMC requires MKL sparse
+# AFQMC requires MKL sparse for good performance (roughly a factor of 2x)
 IF (BUILD_AFQMC AND NOT MKL_FOUND)
-  MESSAGE(STATUS "AFQMC requires MKL sparse libraries. Disabling AFQMC")
-  SET (BUILD_AFQMC 0)
+  MESSAGE(WARNING "AFQMC - MKL not found, using simple sparse matrix routines.  Link with MKL sparse libraries for better performance.")
 ENDIF()

 # AFQMC requires MPI
@ -534,7 +542,7 @@ If (BUILD_AFQMC AND QMC_COMPLEX)
  MESSAGE(STATUS "Warning: Complex wavefunctions are not yet supported by AFQMC.")
 ENDIF()

-IF (BUILD_AFQMC AND CMAKE_COMPILER_IS_GNUCXX AND MKL_FOUND)
+IF (BUILD_AFQMC AND NOT APPLE)
  LINK_LIBRARIES("rt")
 ENDIF()
  
--- a/manual/dmc.tex
+++ b/manual/dmc.tex
@ -34,7 +34,7 @@
   &   \texttt{sigmaBound          } &  double  & $\ge 0$  & 10   & parameter to cutoff large weights  \\
   &   \texttt{killnode            } &  string  & yes/other & no   & kill or reject walkers that cross nodes  \\
  % &   \texttt{benchmark           } &  string  & $\ge 0$ & 0   & number of sample \\
-   &   \texttt{reconfiguration     } &  string  & yes/pure/other & no   & fixed population t:qechnique  \\
+   &   \texttt{reconfiguration     } &  string  & yes/pure/other & no   & fixed population technique  \\
   &   \texttt{branchInterval      } &  integer  & $\ge 0$ & 1   & branching interval \\
   &   \texttt{substeps            } &  integer  & $\ge 0$ & 1   & branching interval \\
   &   \texttt{nonlocalmoves       } &  string  & yes/other & no   & run with tmoves  \\
@ -79,7 +79,7 @@ where $N$ is the current population.

 \item \texttt{refEnergy}. The default reference energy is taken from the VMC run that precedes the DMC run. This value is updated to the current mean whenever branching happens.

-\item \texttt{feedback}. Variable used to determine how strong to react to population fluctutations when doing population control.  See the equation in energyUpdateInterval for more details.
+\item \texttt{feedback}. Variable used to determine how strong to react to population fluctuations when doing population control.  See the equation in energyUpdateInterval for more details.

 \item \texttt{useBareTau}. The same time step is used whether a move is rejected to not. The default is to use an effective time step when a move is rejected.

@ -98,7 +98,7 @@ where $N$ is the current population.
 \item \texttt{substeps}.  Same as BranchInterval.


-\item \texttt{nonlocalmoves}. DMC driver for running Hamiltonians with non-local moves.  An typical usage is to simulate Hamitonians with non-local psuedopotentials with T-Moves.  Setting this equal to false will impose the locality approximation.
+\item \texttt{nonlocalmoves}. DMC driver for running Hamiltonians with non-local moves.  An typical usage is to simulate Hamiltonians with non-local psuedopotentials with T-Moves.  Setting this equal to false will impose the locality approximation.

 \item \texttt{scaleweight}. Scaling weight per Umrigar/Nightengale.  CUDA only.

@ -160,22 +160,22 @@ on diffusion Monte Carlo.


 \begin{lstlisting}[caption=The following is an example of running a simulation that can be restarted . ]
-  <qmc method="dmc" move="pbyp"  checkpoint="0" dumpconfig="5">
+  <qmc method="dmc" move="pbyp"  checkpoint="0">
    <parameter name="timestep">         0.004  </parameter>
    <parameter name="blocks">           100   </parameter>
    <parameter name="steps">            400    </parameter>
  </qmc>
 \end{lstlisting}
-The flags checkpoint and dumpconfig instructs qmcpack to output walker configurations.  This also
-works in variational Monte Carlo.  This will output an h5 file with the name "projectid"."run-number".config.h5.
+The checkpoint flag instructs qmcpack to output walker configurations.  This also
+works in Variational Monte Carlo.  This will output an h5 file with the name "projectid"."run-number".config.h5.
 Check that this file exists before attempting a restart.
 To read in this file for a continuation run, specify the following:
-\begin{lstlisting}[caption=Restart (read wakers from previous run) ]
+\begin{lstlisting}[caption=Restart (read walkers from previous run) ]
 <mcwalkerset fileroot="BH.s002" version="0 6" collected="yes"/>
 \end{lstlisting}
-where, BH is the project id and s002 is the calculation number to read in the walkers from the previous run.\\
+BH is the project id and s002 is the calculation number to read in the walkers from the previous run.\\

-Combining VMC and DMC in a single run (and wave function optimization can be combined in this way too) is the standard way in which QMCPACK is typical run.   There is no need to run two separate jobs, as method sections can be stacked, and walkers are transfered between them.
+Combining VMC and DMC in a single run (and wave function optimization can be combined in this way too) is the standard way in which QMCPACK is typical run.   There is no need to run two separate jobs, as method sections can be stacked, and walkers are transferred between them.

 \begin{lstlisting}[caption=Combined VMC and DMC run ]
  <qmc method="vmc" move="pbyp" target="e">
--- a/manual/methods.tex
+++ b/manual/methods.tex
@ -24,8 +24,8 @@
   &   \texttt{move}          &  text              &   pbyp, alle     & pbyp              & method used to move electrons \\
   &   \texttt{gpu}           &  text              &   yes, no        & dep.              & use the GPU\\
   &   \texttt{trace}         &  text              &                  & no                & ???                      \\
-   &   \texttt{checkpoints}   &  integer           &   -1, 0, n       & -1                & checkpoint frequency \\
-   &   \texttt{dumpconfig}    &  integer           &   n              & -1                & dump configuration every n step  \\
+   &   \texttt{checkpoint}   &  integer           &   -1, 0, n       & -1                & checkpoint frequency \\
+   &   \texttt{record}      &  integer           &   n              & 0                & save configuration every n steps  \\
   &   \texttt{target}        &  text              &                  &                   & ???  \\
   &   \texttt{completed}     &  text              &                  &                   & ???  \\
   &   \texttt{append}        &  text              &   yes, no        & yes               & ???  \\
@ -39,38 +39,59 @@

 Additional information:
 \begin{itemize}
-\item \texttt{move}. There are two ways implemented to move electrons. The more used method is the particle-by-particle move. In this method, only one electron is moved for acception or rejection. The other method is the all-electron move, namely all the electrons are moved once for testing acception or rejection.
+\item \texttt{move}. There are two ways implemented to move electrons. The more used method is the particle-by-particle move. In this method, only one electron is moved for acceptance or rejection. The other method is the all-electron move, namely all the electrons are moved once for testing acceptance or rejection.

 \item \texttt{gpu}. When the executable is compiled with CUDA, the target computing device can be chosen by this switch. With a regular CPU only compilation, this option is not effective.

-\item \texttt{checkpoints}. If Checkpoint=``-1'' no checkpoint will be done (default setting). If Checkpoint=``0'' dump after the completion of a qmc section. When dumconfig=``n'' is present with Checkpoint=``0'', the configurations will be dumped at the end of the run (due to Checkpoint=``0''), but also at every n block. If Checkpoint=``n'', configurations will be dump every n block. 
+\item \texttt{checkpoint}.
+Enable or disable checkpointing and specify the frequency of output.  Possible values are:
+\begin{description}
+\item [-1] No checkpoint (default setting).
+\item [0] Dump after the completion of a qmc section.
+\item [n] Dump after every $n$ blocks.  Also dump at the end of the run.
+\end{description}
+%\item  ``-1'' No checkpoint (default setting).
+%\item ``0'' Dump after the completion of a qmc section.
+%\item ``n'' Dump after every $n$ blocks.  Also dump at the end of the run.
+%\end{itemize}
+%If Checkpoint=``-1'' no checkpoint will be done (default setting). If Checkpoint=``0'' dump after the completion of a qmc section. When dumconfig=``n'' is present with Checkpoint=``0'', the configurations will be dumped at the end of the run (due to Checkpoint=``0''), but also at every n block. If Checkpoint=``n'', configurations will be dump every n block. 

-All the dumped data will be written in a *.config.h5. The config.h5 file will contain the state of a population to continue a run including the random number sequences; the list of what is included in the .config.h5 is: number of walkers, status of the run, branch mode, energy dataset, ratio to accepted moves, ratio to proposed moves, variance dataset, vParam\{tau, taueff. E\_trial, E\_ref, Branch\_Max, BranchCutOff, BranchFilter, Sigma, Accepted\_Energy, Accepted\_Samples\}, IParam{warmumSteps, Energy\_Update\_Interval, Counter, targetwalkers, Maxwalkers, MinWalkers, Branching Interval}, Walker coordinates, Random number size, Random number sequence, version of the code.
+% TODO: Fill in more information about checkpoint/restart
+
+The particle configurations will be written to a \texttt{.config.h5} file.
+
+%Other sections of data needed for
+%restart include the random generator state, written to a \texttt{.random.xml} file.
+
+
+
+
+% I think this description refers to a previous version of the config.h5 file.  See HDFWalkerInput0.cpp.
+%All the dumped data will be written in a \texttt{*.config.h5} file. The \texttt{config.h5} file will contain the state of a population to continue a run. The list of what is included in the .config.h5 is: number of walkers, status of the run, branch mode, energy dataset, ratio to accepted moves, ratio to proposed moves, variance dataset, vParam\{tau, taueff. E\_trial, E\_ref, Branch\_Max, BranchCutOff, BranchFilter, Sigma, Accepted\_Energy, Accepted\_Samples\}, IParam{warmumSteps, Energy\_Update\_Interval, Counter, targetwalkers, Maxwalkers, MinWalkers, Branching Interval}, Walker coordinates, Random number size, Random number sequence, version of the code.

 \begin{lstlisting}[caption=The following is an example of running a simulation that can be restarted . ]
-  <qmc method="dmc" move="pbyp"  checkpoint="0" dumpconfig="5">
+  <qmc method="dmc" move="pbyp"  checkpoint="0"">
    <parameter name="timestep">         0.004  </parameter>
    <parameter name="blocks">           100   </parameter>
    <parameter name="steps">            400    </parameter>
  </qmc>
 \end{lstlisting}

-In this case, the there will be 
-The flags checkpoint and dumpconfig instructs qmcpack to output walker configurations.  This also
-works in variational Monte Carlo.  This will output an h5 file with the name "projectid"."run-number".config.h5.
+The checkpoint flag instructs qmcpack to output walker configurations.  This also
+works in Variational Monte Carlo.  This will output an h5 file with the name "projectid"."run-number".config.h5.
 Check that this file exists before attempting a restart.

-To continue a run, specify the following before your VM/DMC block:
-\begin{lstlisting}[caption=Restart (read wakers from previous run) ]
+To continue a run, specify the \texttt{mcwalkerset} element before your VMC/DMC block:
+\begin{lstlisting}[caption=Restart (read walkers from previous run) ]
 <mcwalkerset fileroot="BH.s002" version="0 6" collected="yes"/>
-  <qmc method="dmc" move="pbyp"  checkpoint="0" dumpconfig="5">
+  <qmc method="dmc" move="pbyp"  checkpoint="0">
    <parameter name="timestep">         0.004  </parameter>
    <parameter name="blocks">           100   </parameter>
    <parameter name="steps">            400    </parameter>
  </qmc>
 \end{lstlisting}
-where, BH is the project id and s002 is the calculation number to read in the walkers from the previous run.
+BH is the project id and s002 is the calculation number to read in the walkers from the previous run.

-In the project id section, make sure that the series number is different than any existing one to avoid rewriting on it. 
+In the project id section, make sure that the series number is different than any existing one to avoid overwriting it. 

 \end{itemize}
--- a/manual/output_overview.tex
+++ b/manual/output_overview.tex
@ -60,3 +60,14 @@ This file contains information from the trial wavefunction about the band struct
 including the available $k$-points. This can
 be helpful in constructing trial wavefunctions.

+
+\section{Checkpoint and restart files}
+\subsection{The .cont.xml file}
+This file enables continuation of the run.  It is mostly a copy of the input XML file with the series number incremented, and the \texttt{mcwalkerset} element added to read the walkers from a config file.   The \texttt{.cont.xml} file is always created, but other files it depends on are only present if checkpointing is enabled.
+
+\subsection{The .config.h5 file}
+This file contains stored walker configurations.
+
+\subsection{The .random.xml file}
+This file contains the state of the random number generator to allow restarts.
+
--- a/nexus/library/nexus.py
+++ b/nexus/library/nexus.py
@ -25,6 +25,7 @@
 import os

 from generic import obj
+from developer import error

 from nexus_base      import NexusCore,nexus_core,nexus_noncore,nexus_core_noncore
 from machines        import Job,job,Machine,Supercomputer,get_machine
@ -386,3 +387,36 @@ def run_project(*args,**kwargs):
    pm.add_simulations(*args,**kwargs)
    pm.run_project()
 #end def run_project
+
+
+
+
+
+
+
+
+# read input function
+#   place here for now as it depends on all other input functions
+def read_input(filepath,format=None):
+    if not os.path.exists(filepath):
+        error('cannot read input file\nfile does not exist: {0}'.format(filepath),'read_input')
+    #end if
+    if format is None:
+        if filepath.endswith('in.xml'):
+            format = 'qmcpack'
+        else:
+            error('cannot identify file format\nplease provide format for file: {0}'.format(filepath))
+        #end if
+    #end if
+    format = format.lower()
+    if format=='qmcpack':
+        input = QmcpackInput(filepath)
+    elif format=='pwscf':
+        input = PwscfInput(filepath)
+    elif format=='gamess':
+        input = GamessInput(filepath)
+    else:
+        error('cannot read input file\nfile format "{0}" is unsupported'.format(format))
+    #end if
+    return input
+#end def read_input
--- a/nexus/library/plotting.py
+++ b/nexus/library/plotting.py
@ -13,13 +13,13 @@

 from developer import unavailable
 try:
-    from matplotlib.pyplot import figure,plot,xlabel,ylabel,title,show,ylim,legend,xlim,rcParams,savefig,bar,xticks,subplot,grid,setp,errorbar,loglog,semilogx,semilogy,text
+    from matplotlib.pyplot import figure,plot,xlabel,ylabel,title,show,ylim,legend,xlim,rcParams,savefig,bar,xticks,yticks,subplot,grid,setp,errorbar,loglog,semilogx,semilogy,text

    params = {'legend.fontsize':14,'figure.facecolor':'white','figure.subplot.hspace':0.,
          'axes.labelsize':16,'xtick.labelsize':14,'ytick.labelsize':14}
    rcParams.update(params)
 except (ImportError,RuntimeError):
-   figure,plot,xlabel,ylabel,title,show,ylim,legend,xlim,rcParams,savefig,bar,xticks,subplot,grid,setp,errorbar,loglog,semilogx,semilogy,text = unavailable('matplotlib.pyplot','figure','plot','xlabel','ylabel','title','show','ylim','legend','xlim','rcParams','savefig','bar','xticks','subplot','grid','setp','errorbar','loglog','semilogx','semilogy','text')
+   figure,plot,xlabel,ylabel,title,show,ylim,legend,xlim,rcParams,savefig,bar,xticks,yticks,subplot,grid,setp,errorbar,loglog,semilogx,semilogy,text = unavailable('matplotlib.pyplot','figure','plot','xlabel','ylabel','title','show','ylim','legend','xlim','rcParams','savefig','bar','xticks','yticks','subplot','grid','setp','errorbar','loglog','semilogx','semilogy','text')
 #end try


--- a/nexus/library/qmcpack.py
+++ b/nexus/library/qmcpack.py
@ -106,7 +106,9 @@ class Qmcpack(Simulation):

    def pre_write_inputs(self,save_image):
        # fix to make twist averaged input file under generate_only
-        if nexus_core.generate_only:
+        if self.system is None:
+            self.should_twist_average = False
+        elif nexus_core.generate_only:
            twistnums = range(len(self.system.structure.kpoints))
            if self.should_twist_average:
                self.twist_average(twistnums)
--- a/nexus/library/qmcpack_input.py
+++ b/nexus/library/qmcpack_input.py
@ -517,7 +517,8 @@ class QIxml(Names):
            attr_types  = None,
            precision   = None,
            defaults    = obj(),
-            collection_id = None
+            collection_id = None,
+            exp_names   = None,
            )
        for v in ['attributes','elements','parameters','attribs','costs','h5tags']:
            names = cls.class_get(v)
@ -535,6 +536,9 @@ class QIxml(Names):
            #end if
        #end for
        cls.plurals = cls.plurals_inv.inverse()
+        if cls.exp_names is not None:
+            cls.expanded_names = obj(Names.expanded_names,cls.exp_names)
+        #end if
    #end def init_class


@ -1958,6 +1962,7 @@ class coefficients(QIxml):
    attributes = ['id','type','optimize','state','size','cusp','rcut']
    text       = 'coeff'
    write_types= obj(optimize=yesno)
+    exp_names  = obj(array='Array')
 #end class coefficients

 class coefficient(QIxml):  # this is bad!!! coefficients/coefficient
@ -2187,6 +2192,12 @@ class gofr(QIxml):
    identifier = 'name'
 #end class gofr

+class flux(QIxml):
+    tag = 'estimator'
+    attributes = ['type','name']
+    identifier = 'name'
+#end class flux
+    
 estimator = QIxmlFactory(
    name  = 'estimator',
    types = dict(localenergy         = localenergy,
@ -2206,6 +2217,7 @@ estimator = QIxmlFactory(
                 sk                  = sk,
                 skall               = skall,
                 gofr                = gofr,
+                 flux                = flux,
                 ),
    typekey  = 'type',
    typekey2 = 'name'
@ -2435,7 +2447,7 @@ classes = [   #standard classes
    multideterminant,detlist,ci,mcwalkerset,csf,det,
    optimize,cg_optimizer,flex_optimizer,optimize_qmc,wftest,kspace_jastrow,
    header,local,force,forwardwalking,observable,record,rmc,pressure,dmccorrection,
-    nofk,mpc_est,distancetable,cpp,element,spline,setparams,
+    nofk,mpc_est,flux,distancetable,cpp,element,spline,setparams,
    backflow,transformation,cubicgrid,molecular_orbital_builder,cmc,sk,skall,gofr,
    host,date,user,
    ]
@ -2510,7 +2522,7 @@ Names.set_expanded_names(
    elecelec         = 'ElecElec',
    pseudopot        = 'PseudoPot',
    posarray         = 'posArray',
-    array            = 'Array',
+    #array            = 'Array',  # handle separately, namespace collision
    atomicbasisset   = 'atomicBasisSet',
    basisgroup       = 'basisGroup',
    expandylm        = 'expandYlm',
@ -4041,21 +4053,6 @@ def generate_particlesets(electrons = 'e',
    net_charge = system.net_charge
    net_spin   = system.net_spin

-    # not ordered consistently
-    #structure.group_atoms()
-    #elem = structure.elem
-    #pos  = structure.pos
-    #
-    #elns = particles.get_electrons()
-    #ions = particles.get_ions()
-    #eup  = elns.up_electron
-    #edn  = elns.down_electron
-
-    # maintain consistent order
-    ion_species,ion_counts = structure.order_by_species()
-    elem = structure.elem
-    pos  = structure.pos
-
    elns = particles.get_electrons()
    ions = particles.get_ions()
    eup  = elns.up_electron
@ -4072,6 +4069,10 @@ def generate_particlesets(electrons = 'e',
        )
    particlesets.append(eps)
    if len(ions)>0:
+        # maintain consistent order
+        ion_species,ion_counts = structure.order_by_species()
+        elem = structure.elem
+        pos  = structure.pos
        if randomsrc:
            eps.randomsrc = iname
        #end if
@ -4514,6 +4515,12 @@ def generate_hamiltonian(name         = 'h0',
                #end if
            elif not isinstance(estimator,QIxml):
                QmcpackInput.class_error('generate_hamiltonian received an invalid estimator\n  an estimator must either be a name or a QIxml object\n  inputted estimator type: {0}\n  inputted estimator contents: {1}'.format(estimator.__class__.__name__,estimator))
+            elif isinstance(estimator,energydensity):
+                est.set_optional(
+                    type = 'EnergyDensity',
+                    dynamic = ename,
+                    static  = iname,
+                    )
            elif isinstance(estimator,dm1b):
                dm = estimator
                reuse = False
@ -5049,6 +5056,121 @@ def count_jastrow_params(jastrows):
 #end def count_jastrow_params


+def generate_energydensity(
+        name      = None,
+        dynamic   = None,
+        static    = None,
+        coord     = None,
+        grid      = None,
+        scale     = None,
+        ion_grids = None,
+        system    = None,
+        ):
+    if dynamic is None:
+        dynamic = 'e'
+    #end if
+    if static is None:
+        static = 'ion0'
+    #end if
+    refp = None
+    sg = []
+    if coord is None:
+        QmcpackInput.class_error('coord must be provided','generate_energydensity')
+    elif coord=='voronoi':
+        if name is None:
+            name = 'EDvoronoi'
+        #end if
+        sg.append(spacegrid(coord=coord))
+    elif coord=='cartesian':
+        if name is None:
+            name = 'EDcell'
+        #end if
+        if grid is None:
+            QmcpackInput.class_error('grid must be provided for cartesian coordinates','generate_energydensity')
+        #end if
+        axes = [
+            axis(p1='a1',scale='.5',label='x'),
+            axis(p1='a2',scale='.5',label='y'),
+            axis(p1='a3',scale='.5',label='z'),
+            ]
+        n=0
+        for ax in axes:
+           ax.grid = '-1 ({0}) 1'.format(grid[n])
+           n+=1
+        #end for
+        sg.append(spacegrid(coord=coord,origin=origin(p1='zero'),axes=axes))
+    elif coord=='spherical':
+        if name is None:
+            name = 'EDatom'
+        #end if
+        if ion_grids is None:
+            QmcpackInput.class_error('ion_grids must be provided for spherical coordinates','generate_energydensity')
+        #end if
+        refp = reference_points(coord='cartesian',points='\nr1 1 0 0\nr2 0 1 0\nr3 0 0 1\n')
+        if system is None:
+            i=1
+            for scale,g1,g2,g3 in ion_grids:
+                grid = g1,g2,g3
+                axes = [
+                    axis(p1='r1',scale=scale,label='r'),
+                    axis(p1='r2',scale=scale,label='phi'),
+                    axis(p1='r3',scale=scale,label='theta'),
+                    ]
+                n=0
+                for ax in axes:
+                    ax.grid = '0 ({0}) 1'.format(grid[n])
+                    n+=1
+                #end for
+                sg.append(spacegrid(coord=coord,origin=origin(p1=static+str(i)),axes=axes))
+                i+=1
+            #end for
+        else:
+            ig = ion_grids
+            ion_grids = obj()
+            for e,s,g1,g2,g3 in ig:
+                ion_grids[e] = s,(g1,g2,g3)
+            #end for
+            missing = set(ion_grids.keys())
+            i=1
+            for e in system.structure.elem:
+                if e in ion_grids:
+                    scale,grid = ion_grids[e]
+                    axes = [
+                        axis(p1='r1',scale=scale,label='r'),
+                        axis(p1='r2',scale=scale,label='phi'),
+                        axis(p1='r3',scale=scale,label='theta'),
+                        ]
+                    n=0
+                    for ax in axes:
+                        ax.grid = '0 ({0}) 1'.format(grid[n])
+                        n+=1
+                    #end for
+                    sg.append(spacegrid(coord=coord,origin=origin(p1=static+str(i)),axes=axes))
+                    if e in missing:
+                        missing.remove(e)
+                    #end if
+                #end if
+                i+=1
+            #end for
+            if len(missing)>0:
+                QmcpackInput.class_error('ion species not found for spherical grid\nspecies not found: {0}\nspecies present: {1}'.format(sorted(missing),sorted(set(list(system.structure.elem)))),'generate_energydensity')
+            #end if
+        #end if
+    else:
+        QmcpackInput.class_error('unsupported coord type\ncoord type provided: {0}\nsupported coord types: voronoi, cartesian, spherical'.format(coord),'generate_energydensity')
+    #end if
+    ed = energydensity(
+        type       = 'EnergyDensity',
+        name       = name,
+        dynamic    = dynamic,
+        static     = static,
+        spacegrids = sg,
+        )
+    if refp is not None:
+        ed.reference_points = refp
+    #end if
+    return ed
+#end def generate_energydensity


 opt_map = dict(linear=linear,cslinear=cslinear)
--- a/nexus/library/simulation.py
+++ b/nexus/library/simulation.py
@ -292,6 +292,16 @@ class Simulation(NexusCore):
        inp_args = obj()
        sim_args.transfer_from(kwargs,sim_kw)
        inp_args.transfer_from(kwargs,inp_kw)
+        if 'system' in inp_args:
+            system = inp_args.system
+            if not isinstance(system,PhysicalSystem):
+                extra=''
+                if not isinstance(extra,obj):
+                    extra = '\nwith value: {0}'.format(system)
+                #end if
+                cls.class_error('invalid input for variable "system"\nsystem object must be of type PhysicalSystem\nyou provided type: {0}'.format(system.__class__.__name__)+extra)
+            #end if
+        #end if
        if 'pseudos' in inp_args and inp_args.pseudos!=None:
            pseudos = inp_args.pseudos
            # support ppset labels
@ -320,9 +330,6 @@ class Simulation(NexusCore):
            #end if
            if 'system' in inp_args:
                system = inp_args.system
-                if not isinstance(system,PhysicalSystem):
-                    cls.class_error('system object must be of type PhysicalSystem')
-                #end if
                species_labels,species = system.structure.species(symbol=True)
                pseudopotentials = nexus_core.pseudopotentials
                for ppfile in pseudos:
@ -417,6 +424,8 @@ class Simulation(NexusCore):
    def init_job(self):
        if self.job==None:
            self.error('job not provided.  Input field job must be set to a Job object.')
+        elif not isinstance(self.job,Job):
+            self.error('Input field job must be set to a Job object\nyou provided an object of type: {0}\nwith value: {1}'.format(self.job.__class__.__name__,self.job))
        #end if
        self.job = self.job.copy()
        self.job.initialize(self)
--- a/nexus/library/structure.py
+++ b/nexus/library/structure.py
@ -29,7 +29,7 @@
 import os
 from copy import deepcopy
 from random import randint
-from numpy import array,floor,empty,dot,diag,sqrt,pi,mgrid,exp,append,arange,ceil,cross,cos,sin,identity,ndarray,atleast_2d,around,ones,zeros,logical_not,flipud,uint64
+from numpy import array,floor,empty,dot,diag,sqrt,pi,mgrid,exp,append,arange,ceil,cross,cos,sin,identity,ndarray,atleast_2d,around,ones,zeros,logical_not,flipud,uint64,sign
 from numpy.linalg import inv,det,norm
 from types import NoneType
 from unit_converter import convert
@ -62,25 +62,23 @@ except (ImportError,RuntimeError):
 #   cif file support in Nexus currently requires two external libraries
 #     PyCifRW  - base interface to read cif files into object format: CifFile
 #     cif2cell - translation layer from CifFile object to cell reconstruction: CellData
+#     (note: cif2cell installation includes PyCifRW)
 #
-#   Nexus is currently compatible with PyCifRW-4.1.1 and cif2cell-1.2.7
-#     compatibility last tested: 7 Aug 2015
-# 
-#   installation of PyCifRW
-#     go to https://pypi.python.org/pypi/PyCifRW/4.1
-#     scroll down to bottom of page, download PyCifRW-4.1.1.tar.gz (md5) 
-#     unpack PyCifRW-4.1.1.tar.gz (tar -xzf PyCifRW-4.1.1.tar.gz)
-#     enter directory (cd PyCifRW-4.1.1)
-#     install with python (python setup.py install)  (sudo python setup.py install)
-#     check python installation (python;  from CifFile import CifFile)
-#
-#  "installation" of cif2cell
+#  installation of cif2cell
 #    go to http://sourceforge.net/projects/cif2cell/
-#    click on Download (example: cif2cell-1.2.7.tar.gz)
-#    unpack directory (tar -xzf cif2cell-1.2.7.tar.gz)
-#    add directory to PYTHONPATH in .bashrc after Nexus (export PYTHONPATH=/your/path/to/nexus/library:/your/other/path/to/cif2cell-1.2.7)
-#    apparently an actual install of cif2cell (python setup.py install) will also install PyCifRW, so above install might be redundant
+#    click on Download (example: cif2cell-1.2.10.tar.gz)
+#    unpack directory (tar -xzf cif2cell-1.2.10.tar.gz)
+#    enter directory (cd cif2cell-1.2.10)
+#    install cif2cell (python setup.py install)
+#    check python installation
+#      >python
+#      >>>from CifFile import CifFile
+#      >>>from uctools import CellData
 #   
+#   Nexus is currently compatible with
+#     cif2cell-1.2.10 and PyCifRW-3.3
+#     cif2cell-1.2.7  and PyCifRW-4.1.1 
+#     compatibility last tested: 20 Mar 2017
 #
 try:
    from CifFile import CifFile
@ -198,7 +196,7 @@ def read_cif(filepath,block=None,grammar='1.1',cell='prim',args_only=False):

 # installation instructions for spglib interface
 #
-#  this is bootstrapped of of spglib's ASE Python interface
+#  this is bootstrapped off of spglib's ASE Python interface
 #
 #  installation of spglib
 #    go to http://sourceforge.net/projects/spglib/files/
@ -396,6 +394,266 @@ def rotate_plane(plane,angle,points,units='degrees'):



+opt_tm_matrices    = obj()
+opt_tm_wig_indices = obj()
+
+def trivial_filter(T):
+    return True
+#end def trival_filter
+
+class MaskFilter(DevBase):
+    def set(self,mask,dim=3):
+        omask = array(mask)
+        mask  = array(mask,dtype=bool)
+        if mask.size==dim:
+            mvec = mask.ravel()
+            mask = empty((dim,dim),dtype=bool)
+            i=0
+            for mi in mvec:
+                j=0
+                for mj in mvec:
+                    mask[i,j] = mi==mj
+                    j+=1
+                #end for
+                i+=1
+            #end for
+        elif mask.shape!=(dim,dim):
+            error('shape of mask array must be {0},{0}\nshape received: {1},{2}\nmask array received: {3}'.format(dim,mask.shape[0],mask.shape[1],omask),'optimal_tilematrix')
+        #end if
+        self.mask = mask==False
+    #end def set
+
+    def __call__(self,T):
+        return (T[self.mask]==0).all()
+    #end def __call__
+#end class MaskFilter
+mask_filter = MaskFilter()
+            
+
+def optimal_tilematrix(axes,volfac,dn=1,tol=1e-3,filter=trivial_filter,mask=None,nc=5):
+    if mask is not None:
+        mask_filter.set(mask)
+        filter = mask_filter
+    #end if
+    dim = 3
+    if isinstance(axes,Structure):
+        axes = axes.axes
+    else:
+        axes = array(axes,dtype=float)
+    #end if
+    if not isinstance(volfac,int):
+        volfac = int(around(volfac))
+    #end if
+    volume = abs(det(axes))*volfac
+    axinv  = inv(axes)
+    cube   = volume**(1./3)*identity(dim)
+    Tref   = array(around(dot(cube,axinv)),dtype=int)
+    # calculate and store all tiling matrix variations
+    if dn not in opt_tm_matrices:
+        mats = []
+        rng = tuple(range(-dn,dn+1))
+        for n1 in rng:
+            for n2 in rng:
+                for n3 in rng:
+                    for n4 in rng:
+                        for n5 in rng:
+                            for n6 in rng:
+                                for n7 in rng:
+                                    for n8 in rng:
+                                        for n9 in rng:
+                                            mats.append((n1,n2,n3,n4,n5,n6,n7,n8,n9))
+                                        #end for
+                                    #end for
+                                #end for
+                            #end for
+                        #end for
+                    #end for
+                #end for
+            #end for
+        #end for
+        mats = array(mats,dtype=int)
+        mats.shape = (2*dn+1)**(dim*dim),dim,dim
+        opt_tm_matrices[dn] = mats
+    else:
+        mats = opt_tm_matrices[dn]
+    #end if
+    # calculate and store all wigner image indices
+    if nc not in opt_tm_wig_indices:
+        inds = []
+        rng = tuple(range(-nc,nc+1))
+        for k in rng:
+            for j in rng:
+                for i in rng:
+                    if i!=0 or j!=0 or k!=0:
+                        inds.append((i,j,k))
+                    #end if
+                #end for
+            #end for
+        #end for
+        inds = array(inds,dtype=int)
+        opt_tm_wig_indices[nc] = inds
+    else:
+        inds = opt_tm_wig_indices[nc]
+    #end if
+    # track counts of tiling matrices
+    ntilings        = len(mats)
+    nequiv_volume   = 0
+    nfilter         = 0
+    nequiv_inscribe = 0
+    nequiv_wigner   = 0
+    nequiv_cubicity = 0
+    nequiv_shape    = 0
+    # try a faster search for cells w/ target volume
+    det_inds_p = [
+        [(0,0),(1,1),(2,2)],
+        [(0,1),(1,2),(2,0)],
+        [(0,2),(1,0),(2,1)]
+        ]
+    det_inds_m = [
+        [(0,0),(1,2),(2,1)],
+        [(0,1),(1,0),(2,2)],
+        [(0,2),(1,1),(2,0)]
+        ]
+    volfacs = zeros((len(mats),),dtype=int)
+    for (i1,j1),(i2,j2),(i3,j3) in det_inds_p:
+        volfacs += (Tref[i1,j1]+mats[:,i1,j1])*(Tref[i2,j2]+mats[:,i2,j2])*(Tref[i3,j3]+mats[:,i3,j3])
+    #end for
+    for (i1,j1),(i2,j2),(i3,j3) in det_inds_m:
+        volfacs -= (Tref[i1,j1]+mats[:,i1,j1])*(Tref[i2,j2]+mats[:,i2,j2])*(Tref[i3,j3]+mats[:,i3,j3])
+    #end for
+    Tmats = mats[abs(volfacs)==volfac]
+    nequiv_volume = len(Tmats)    
+    # find the set of cells with maximal inscribing radius
+    inscribe_tilings = []
+    rmax = -1e99
+    for mat in Tmats:
+        T = Tref + mat
+        if filter(T):
+            nfilter+=1
+            Taxes = dot(T,axes)
+            rc1 = norm(cross(Taxes[0],Taxes[1]))
+            rc2 = norm(cross(Taxes[1],Taxes[2]))
+            rc3 = norm(cross(Taxes[2],Taxes[0]))
+            r   = 0.5*volume/max(rc1,rc2,rc3) # inscribing radius
+            if r>rmax or abs(r-rmax)<tol:
+                inscribe_tilings.append((r,T,Taxes))
+                rmax = r
+            #end if
+        #end if
+    #end for
+    # find the set of cells w/ maximal wigner radius out of the inscribing set
+    wigner_tilings = []
+    rwmax = -1e99
+    for r,T,Taxes in inscribe_tilings:
+        if abs(r-rmax)<tol:
+            nequiv_inscribe+=1
+            rw = 1e99
+            for ind in inds:
+                rw = min(rw,0.5*norm(dot(ind,Taxes)))
+            #end for
+            if rw>rwmax or abs(rw-rwmax)<tol:
+                wigner_tilings.append((rw,T,Taxes))
+                rwmax = rw
+            #end if
+        #end if
+    #end for
+    # find the set of cells w/ maximal cubicity
+    # (minimum cube_deviation)
+    cube_tilings = []            
+    cmin = 1e99
+    for rw,T,Ta in wigner_tilings:
+        if abs(rw-rwmax)<tol:
+            nequiv_wigner+=1
+            dc = volume**(1./3)*sqrt(2.)
+            d1 = abs(norm(Ta[0]+Ta[1])-dc)
+            d2 = abs(norm(Ta[1]+Ta[2])-dc)
+            d3 = abs(norm(Ta[2]+Ta[0])-dc)
+            d4 = abs(norm(Ta[0]-Ta[1])-dc)
+            d5 = abs(norm(Ta[1]-Ta[2])-dc)
+            d6 = abs(norm(Ta[2]-Ta[0])-dc)
+            cube_dev = (d1+d2+d3+d4+d5+d6)/(6*dc)
+            if cube_dev<cmin or abs(cube_dev-cmin)<tol:
+                cube_tilings.append((cube_dev,rw,T,Ta))
+                cmin = cube_dev
+            #end if
+        #end if
+    #end for
+    # prioritize selection by "shapeliness" of tiling matrix
+    #   prioritize positive diagonal elements
+    #   penalize off-diagonal elements
+    #   penalize negative off-diagonal elements
+    shapely_tilings = []
+    smax = -1e99
+    for cd,rw,T,Taxes in cube_tilings:
+        if abs(cd-cmin)<tol:
+            nequiv_cubicity+=1
+            d = diag(T)
+            o = (T-diag(d)).ravel()
+            s = sign(d).sum()-(abs(o)>0).sum()-(o<0).sum()
+            if s>smax or abs(s-smax)<tol:
+                shapely_tilings.append((s,rw,T,Taxes))
+                smax = s
+            #end if
+        #end if
+    #end for
+    # prioritize selection by symmetry of tiling matrix
+    ropt   = -1e99
+    Topt   = None
+    Taxopt = None
+    diagonal      = []
+    symmetric     = []
+    antisymmetric = []
+    other         = []
+    for s,rw,T,Taxes in shapely_tilings:
+        if abs(s-smax)<tol:
+            nequiv_shape+=1
+            Td = diag(diag(T))
+            if abs(Td-T).sum()==0:
+                diagonal.append((rw,T,Taxes))
+            elif abs(T.T-T).sum()==0:
+                symmetric.append((rw,T,Taxes))
+            elif abs(T.T+T-2*Td).sum()==0:
+                antisymmetric.append((rw,T,Taxes))
+            else:
+                other.append((rw,T,Taxes))
+            #end if
+        #end if
+    #end for
+    s = 1
+    if len(diagonal)>0:
+        cells = diagonal
+    elif len(symmetric)>0:
+        cells = symmetric
+    elif len(antisymmetric)>0:
+        cells = antisymmetric
+        s = -1
+    elif len(other)>0:
+        cells = other
+    #end if
+    skew_min = 1e99
+    if len(cells)>0:
+        for rw,T,Taxes in cells:
+            Td = diag(diag(T))
+            skew = abs(T.T-s*T-(1-s)*Td).sum()
+            if skew<skew_min:
+                ropt = rw
+                Topt = T
+                Taxopt = Taxes
+                skew_min = skew
+            #end if
+        #end for
+    #end if
+    if Taxopt is None:
+        error('optimal tilematrix for volfac={0} not found with tolerance {1}\ndifference range (dn): {2}\ntiling matrices searched: {3}\ncells with target volume: {4}\ncells that passed the filter: {5}\ncells with equivalent inscribing radius: {6}\ncells with equivalent wigner radius: {7}\ncells with equivalent cubicity: {8}\nmatrices with equivalent shapeliness: {9}\nplease try again with dn={10}'.format(volfac,tol,dn,ntilings,nequiv_volume,nfilter,nequiv_inscribe,nequiv_wigner,nequiv_cubicity,nequiv_shape,dn+1))
+    #end if
+    if det(Taxopt)<0:
+        Topt = -Topt
+    #end if
+    return Topt,ropt
+#end def optimal_tilematrix
+
+
+

 class Sobj(DevBase):
    None
@ -932,6 +1190,19 @@ class Structure(Sobj):
        return self.rinscribe()
    #end def rcell

+    # scale invariant measure of deviation from cube shape
+    #   based on deviation of face diagonals from cube
+    def cube_deviation(self):
+        a = self.axes
+        dc = self.volume()**(1./3)*sqrt(2.)
+        d1 = abs(norm(a[0]+a[1])-dc)
+        d2 = abs(norm(a[1]+a[2])-dc)
+        d3 = abs(norm(a[2]+a[0])-dc)
+        d4 = abs(norm(a[0]-a[1])-dc)
+        d5 = abs(norm(a[1]-a[2])-dc)
+        d6 = abs(norm(a[2]-a[0])-dc)
+        return (d1+d2+d3+d4+d5+d6)/(6*dc)
+    #end def cube_deviation
    
    # apply volume preserving shear-removing transformations to cell axes
    #   resulting unsheared cell has orthogonal axes
@ -2725,13 +2996,13 @@ class Structure(Sobj):
    #end def tile_points


-    def opt_tilematrix(self,volfac,dn=1,tol=1e-3):
-        return optimal_tilematrix(self,volfac,dn,tol)
+    def opt_tilematrix(self,*args,**kwargs):
+        return optimal_tilematrix(self.axes,*args,**kwargs)
    #end def opt_tilematrix


-    def tile_opt(self,volfac,dn=1,tol=1e-3):
-        Topt,ropt = self.opt_tilematrix(volfac,dn,tol)
+    def tile_opt(self,*args,**kwargs):
+        Topt,ropt = self.opt_tilematrix(*args,**kwargs)
        return self.tile(Topt)
    #end def tile_opt

@ -3798,6 +4069,8 @@ class Structure(Sobj):
                elif t0=='atom':
                    pos.append(tokens[1:4])
                    elem.append(tokens[4])
+                elif t0.startswith('initial'):
+                    None
                else:
                    #None
                    self.error('unrecogonized or not yet supported token in fhi-aims geometry file: {0}'.format(t0))
@ -4841,7 +5114,8 @@ class Jellium(Structure):
    prefactors.transfer_from({1:2*pi,2:4*pi,3:4./3*pi})

    def __init__(self,charge=None,background_charge=None,cell=None,volume=None,density=None,rs=None,dim=3,
-                 axes=None,kpoints=None,kweights=None,kgrid=None,kshift=None,units=None):
+                 axes=None,kpoints=None,kweights=None,kgrid=None,kshift=None,units=None,tiling=None):
+        del tiling
        if rs!=None:
            if not dim in self.prefactors:
                self.error('only 1,2, or 3 dimensional jellium is currently supported\n  you requested one with dimension {0}'.format(dim))
@ -4934,7 +5208,7 @@ def generate_structure(type='crystal',*args,**kwargs):
    elif type=='basic':
        s = Structure(*args,**kwargs)
    else:
-        Structure.class_error(str(type)+' is not a valid structure type\n  options are crystal, defect, atom, dimer, trimer, jellium, empty, or basic')
+        Structure.class_error(str(type)+' is not a valid structure type\noptions are crystal, defect, atom, dimer, trimer, jellium, empty, or basic')
    #end if
    return s
 #end def generate_structure
@ -5218,79 +5492,6 @@ def read_structure(filepath,elem=None,format=None):



-opt_tm_matrices = obj()
-
-def optimal_tilematrix(axes,volfac,dn=1,tol=1e-3):
-    dim = 3
-    if isinstance(axes,Structure):
-        axes = axes.axes
-    else:
-        axes = array(axes,dtype=float)
-    #end if
-    volume = abs(det(axes))*volfac
-    axinv  = inv(axes)
-    cube   = volume**(1./3)*identity(dim)
-    Tref   = array(around(dot(cube,axinv)),dtype=int)
-    rng = tuple(range(-dn,dn+1))
-    if dn not in opt_tm_matrices:
-        mats = []
-        for n1 in rng:
-            for n2 in rng:
-                for n3 in rng:
-                    for n4 in rng:
-                        for n5 in rng:
-                            for n6 in rng:
-                                for n7 in rng:
-                                    for n8 in rng:
-                                        for n9 in rng:
-                                            mats.append((n1,n2,n3,n4,n5,n6,n7,n8,n9))
-                                        #end for
-                                    #end for
-                                #end for
-                            #end for
-                        #end for
-                    #end for
-                #end for
-            #end for
-        #end for
-        mats = array(mats,dtype=int)
-        mats.shape = (2*dn+1)**(dim*dim),dim,dim
-        opt_tm_matrices[dn] = mats
-    else:
-        mats = opt_tm_matrices[dn]
-    #end if
-    ropt = -1e99
-    Topt = None
-    Taxopt = None
-    vol_diff_min = 1e99
-    for mat in mats:
-        T = Tref + mat
-        vol_diff = abs(abs(det(T))-volfac)
-        if vol_diff < vol_diff_min:
-            vol_diff_min = vol_diff
-        # end if
-        if vol_diff<tol:
-            Taxes = dot(T,axes)
-            rc1 = norm(cross(Taxes[0],Taxes[1]))
-            rc2 = norm(cross(Taxes[1],Taxes[2]))
-            rc3 = norm(cross(Taxes[2],Taxes[0]))
-            r   = 0.5*volume/max(rc1,rc2,rc3)
-            if r>ropt:
-                ropt = r
-                Topt = T
-                Taxopt = Taxes
-            #end if
-        #end if
-    #end for
-    if Taxopt is None:
-        error("optimal tilematrix for volfac=%4.2f not found with tolerance %5.4f\n minimum volume difference was %5.4f" % (volfac,tol,vol_diff_min) )
-    # end if
-    if det(Taxopt)<0:
-        Topt = -Topt
-    #end if
-    return Topt,ropt
-#end def optimal_tilematrix
-

 #if __name__=='__main__':
 #    from numpy.random import rand
@ -5329,15 +5530,19 @@ def optimal_tilematrix(axes,volfac,dn=1,tol=1e-3):


 if __name__=='__main__':
-    a = convert(5.639,'A','B')
+    large = generate_structure(
+        type      = 'crystal',
+        structure = 'diamond',
+        cell      = 'fcc',
+        atoms     = 'Ge',
+        constants = 5.639,
+        units     = 'A',
+        tiling    = (2,2,2),
+        kgrid     = (1,1,1),
+        kshift    = (0,0,0),
+        )
    
-    large = generate_structure('diamond','fcc','Ge',(2,2,2),a)
-    small = generate_structure('diamond','fcc','Ge',(1,1,1),a)
-
-    large.add_kmesh((1,1,1))
-
-    kmap = large.fold(small)
+    small = large.folded_structure

    print small.kpoints_unit()
-
 #end if
--- a/src/AFQMC/CMakeLists.txt
+++ b/src/AFQMC/CMakeLists.txt
@ -31,3 +31,9 @@ SET (AFQMC_SRCS
 INCLUDE_DIRECTORIES(/usr/gapps/qmc/libs/INTEL/eigen-eigen-10219c95fe65/)
 ADD_LIBRARY(afqmc ${AFQMC_SRCS})

+IF (BUILD_UNIT_TESTS)
+  INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/external_codes/catch)
+  SUBDIRS(Matrix/tests)
+  SUBDIRS(Numerics/tests)
+ENDIF()
+
--- a/src/AFQMC/Hamiltonians/SparseGeneralHamiltonian.cpp
+++ b/src/AFQMC/Hamiltonians/SparseGeneralHamiltonian.cpp
@ -205,16 +205,17 @@ namespace qmcplusplus
        return false;
      }

-      if(factorizedHamiltonian)
+      if(factorizedHamiltonian) {
        if(H1.size() != nOne || hij_core.size() != nOne_core) {
          app_error()<<"Error after readElementsFromFCIDUMP in readFCIDUMP, wrong number of elements.\n" <<std::endl;
          return false;
        }
-      else
+      } else {
        if(H1.size() != nOne || hij_core.size() != nOne_core  || V2.size()!=nTwo ||  Vijkl_core.size()!=nTwo_core ||  Vijkl_mixed.size()!=nTwo_mixed) {
          app_error()<<"Error after readElementsFromFCIDUMP in readFCIDUMP, wrong number of elements.\n" <<std::endl;
          return false;
        }
+      }

      Timer.stop("Generic");
      if(rnk==0) app_log()<<" -- Time to read ASCII integral file: " <<Timer.average("Generic") <<"\n";
@ -3936,8 +3937,8 @@ namespace qmcplusplus

      v.push_back(Vijkl);
      if( jilk != ijkl ) v.push_back(std::make_tuple(j,i,l,k,V));  
-      if( klij != ijkl && klij != jilk ) v.push_back(std::make_tuple(k,l,i,j,std::conj(V)));   
-      if( lkji != ijkl && lkji != jilk && lkji != klij ) v.push_back(std::make_tuple(l,k,j,i,std::conj(V)));   
+      if( klij != ijkl && klij != jilk ) v.push_back(std::make_tuple(k,l,i,j,myconj(V)));
+      if( lkji != ijkl && lkji != jilk && lkji != klij ) v.push_back(std::make_tuple(l,k,j,i,myconj(V)));
      // order, not needed but good measure 
      std::sort (v.begin(), v.end(),mySort);
    } else {
@ -3994,13 +3995,13 @@ namespace qmcplusplus

        // need to add klij as conj(V). How do I add it? 
        if(klij != lkji) {  // add once with factor of 2
-          v.push_back(std::make_tuple(k,l,i,j,std::conj(static_cast<RealType>(2.0)*V)));   
+          v.push_back(std::make_tuple(k,l,i,j,myconj(static_cast<RealType>(2.0)*V)));
          if(ijkl == lkji)  {
            std::cerr<<" Error in find_equivalent_OneBar_for_hamiltonian_generation: Not sure how you got here. (ijkl == lkji): " <<i <<" " <<j <<" " <<k <<" " <<l <<"  " <<V  <<std::endl;
            APP_ABORT("Error in find_equivalent_OneBar_for_hamiltonian_generation: Not sure how you got here. (ijkl == lkji) \n"); 
          }
        } else 
-          v.push_back(std::make_tuple(k,l,i,j,std::conj(V)));
+          v.push_back(std::make_tuple(k,l,i,j,myconj(V)));

      } else {
        // just checking
@ -5113,14 +5114,14 @@ namespace qmcplusplus
          } 
          case 1: 
          { 
-            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug ???!!! \n" <<std::endl; 
+            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug !!! \n" <<std::endl; 
            return false;  
            APP_ABORT("");
            break;
          } 
          case 2: 
          { 
-            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug ???!!! \n" <<std::endl; 
+            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug !!! \n" <<std::endl; 
            return false;  
            APP_ABORT("");
            break;
@ -5499,14 +5500,14 @@ namespace qmcplusplus
          } 
          case 1: 
          { 
-            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug ???!!! \n" <<std::endl; 
+            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug !!! \n" <<std::endl; 
            return false;  
            APP_ABORT("");
            break;
          } 
          case 2: 
          { 
-            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug ???!!! \n" <<std::endl; 
+            app_error()<<"Error in SparseGeneralHamiltonian::createHamiltonianForPureDeterminant(). Found sector 1 term in V2_2bar. Wy???? Bug Bug Bug !!! \n" <<std::endl; 
            return false;  
            APP_ABORT("");
            break;
@ -6345,6 +6346,7 @@ namespace qmcplusplus
      }
    }

+    return true;
  }

 /*
--- a/src/AFQMC/Matrix/SparseMatrix.h
+++ b/src/AFQMC/Matrix/SparseMatrix.h
@ -37,15 +37,15 @@ class SparseMatrix
  typedef typename std::vector<T>::const_iterator const_iterator;
  typedef SparseMatrix<T>  This_t;

-  SparseMatrix<T>():vals(),colms(),myrows(),rowIndex(),nr(0),nc(0),compressed(false),zero_based(true)
+  SparseMatrix<T>():vals(),colms(),myrows(),rowIndex(),nr(0),nc(0),compressed(false),zero_based(true),storage_format(0)
  {
  }

-  SparseMatrix<T>(int n):vals(),colms(),myrows(),rowIndex(),nr(n),nc(n),compressed(false),zero_based(true)
+  SparseMatrix<T>(int n):vals(),colms(),myrows(),rowIndex(),nr(n),nc(n),compressed(false),zero_based(true),storage_format(0)
  {
  }

-  SparseMatrix<T>(int n,int m):vals(),colms(),myrows(),rowIndex(),nr(n),nc(m),compressed(false),zero_based(true) 
+  SparseMatrix<T>(int n,int m):vals(),colms(),myrows(),rowIndex(),nr(n),nc(m),compressed(false),zero_based(true),storage_format(0)
  {
  }

@ -63,6 +63,7 @@ class SparseMatrix
    myrows=rhs.myrows;
    colms=rhs.colms;
    rowIndex=rhs.rowIndex;
+    storage_format=rhs.storage_format;
  }

  inline void reserve(int n)
@ -182,15 +183,23 @@ class SparseMatrix
  }  

  inline int find_element(int i, int j) {
-    return 0;
+    for (int k = rowIndex[i]; k < rowIndex[i+1]; k++) {
+      if (colms[k] == j) return k;
+    }
+    return -1;
  }

+  // DANGER: This returns a reference, which could allow changes to the stored value.
+  // If a zero element is changed, it will change zero everywhere in the matrix.
+  // For now this method is only used for testing so it should not be a problem.
  inline Type_t& operator()(int i, int j)
  {
 #ifdef ASSERT_SPARSEMATRIX
    assert(i>=0 && i<nr && j>=0 && j<nc && compressed); 
 #endif
-    return vals[find_element(i,j)]; 
+    int idx = find_element(i,j);
+    if (idx == -1) return zero;
+    return vals[idx];
  }

  inline Type_t operator()( int i, int j) const
@ -198,7 +207,9 @@ class SparseMatrix
 #ifdef ASSERT_SPARSEMATRIX
    assert(i>=0 && i<nr && j>=0 && j<nc && compressed); 
 #endif
-    return vals[find_element(i,j)]; 
+    int idx = find_element(i,j);
+    if (idx == -1) return 0;
+    return vals[idx];
  }

  inline void add(const int i, const int j, const T& v, bool dummy=false) 
@ -421,11 +432,7 @@ class SparseMatrix
    rowIndex.resize(nr+1);
    rowIndex[0]=0;
    for(int i=0; i<V.size(); i++) {
-      if( std::is_same<T,std::complex<double> >::value  ) {
-        vals[i] = std::complex<double>(std::get<1>(V[i]),0.0);
-      } else {
-       vals[i] = static_cast<T>(std::get<1>(V[i]));
-      }
+      vals[i] = static_cast<T>(std::get<1>(V[i]));
      myrows[i] = 0; 
      colms[i] = std::get<0>(V[i]); 
 #ifdef ASSERT_SPARSEMATRIX
@ -505,8 +512,10 @@ class SparseMatrix
        for(int i=old+1; i<=curr; i++) rowIndex[i] = n;
      }
    }
-    for(int i=myrows.back()+1; i<rowIndex.size(); i++)
-      rowIndex[i] = vals.size();
+    if (myrows.size() > 0) {
+      for(int i=myrows.back()+1; i<rowIndex.size(); i++)
+        rowIndex[i] = vals.size();
+    }
    compressed=true;
  }

@ -525,11 +534,7 @@ class SparseMatrix
    colms.resize(nnz);
    rowIndex.resize(nr+1);
    for(int i=0; i<V.size(); i++) {
-      if( std::is_same<T,std::complex<double> >::value  ) {
-        vals[i] = std::complex<double>(std::get<2>(V[i]),0.0);
-      } else {
-       vals[i] = static_cast<T>(std::get<2>(V[i]));
-      }
+      vals[i] = static_cast<T>(std::get<2>(V[i]));
      myrows[i] = std::get<0>(V[i]);
      colms[i] = std::get<1>(V[i]);
 #ifdef ASSERT_SPARSEMATRIX
@ -545,8 +550,10 @@ class SparseMatrix
        for(int i=old+1; i<=curr; i++) rowIndex[i] = n;
      }
    }
-    for(int i=myrows.back()+1; i<rowIndex.size(); i++)
-      rowIndex[i] = vals.size();
+    if (myrows.size() > 0) {
+      for(int i=myrows.back()+1; i<rowIndex.size(); i++)
+        rowIndex[i] = vals.size();
+    }
    compressed=true;
  }

@ -673,6 +680,7 @@ class SparseMatrix
  bool zero_based;
  int storage_format; // 0: CSR, 1: Compressed Matrix (ESSL) 
  int max_in_row;
+  Type_t zero; // zero for return value

  _mySort_snD_ my_sort;

--- a/src/AFQMC/Matrix/tests/CMakeLists.txt
+++ b/src/AFQMC/Matrix/tests/CMakeLists.txt
@ -0,0 +1,28 @@
+#//////////////////////////////////////////////////////////////////////////////////////
+#// This file is distributed under the University of Illinois/NCSA Open Source License.
+#// See LICENSE file in top directory for details.
+#//
+#// Copyright (c) 2017 Jeongnim Kim and QMCPACK developers.
+#//
+#// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+#//
+#// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+#//////////////////////////////////////////////////////////////////////////////////////
+
+MESSAGE("Adding this unit test")
+
+
+SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${QMCPACK_UNIT_TEST_DIR})
+
+SET(SRC_DIR afqmc_matrix)
+SET(UTEST_EXE test_${SRC_DIR})
+SET(UTEST_NAME unit_test_${SRC_DIR})
+
+
+ADD_EXECUTABLE(${UTEST_EXE} test_sparse_matrix.cpp)
+#TARGET_LINK_LIBRARIES(${UTEST_EXE} qmcutil ${QMC_UTIL_LIBS})
+
+#ADD_TEST(NAME ${UTEST_NAME} COMMAND "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}")
+ADD_UNIT_TEST(${UTEST_NAME} "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}")
+SET_TESTS_PROPERTIES(${UTEST_NAME} PROPERTIES LABELS "unit;afqmc")
+
--- a/src/AFQMC/Matrix/tests/sparse_matrix_helpers.h
+++ b/src/AFQMC/Matrix/tests/sparse_matrix_helpers.h
@ -0,0 +1,71 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2017 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by:  Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//
+// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AFQMC_SPARSE_MATRIX_HELPER_H
+#define AFQMC_SPARSE_MATRIX_HELPER_H
+
+#include "AFQMC/Matrix/SparseMatrix.h"
+
+#include <stdio.h>
+#include <string>
+#include <complex>
+
+using std::string;
+using std::complex;
+
+namespace qmcplusplus
+{
+
+template<typename T> void output_data(T *data, int size)
+{
+  std::cout << "[ ";
+  for (int i= 0; i < size; i++)
+  {
+    std::cout << data[i] << " ";
+  }
+  std::cout << "]" << std::endl;
+}
+
+template<typename T> void output_matrix(SparseMatrix<T> &M)
+{
+  std::cout << "Colms = ";
+  output_data(M.column_data(), M.size());
+  std::cout << "Row index = ";
+  output_data(M.row_index(), M.rows()+1);
+  std::cout << "Values = ";
+  output_data(M.values(), M.size());
+}
+
+template <typename T> double realPart(T &a) { REQUIRE(false); return 0.0; }
+
+template<> double realPart(const double &a)
+{
+  return a;
+}
+
+template<> double realPart(double &a)
+{
+  return a;
+}
+
+template<> double realPart(const complex<double> &a)
+{
+  return a.real();
+}
+
+template<> double realPart(complex<double> &a)
+{
+  return a.real();
+}
+
+}
+
+#endif
--- a/src/AFQMC/Matrix/tests/test_sparse_matrix.cpp
+++ b/src/AFQMC/Matrix/tests/test_sparse_matrix.cpp
@ -0,0 +1,197 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2017 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by:  Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//
+// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
+#include "Configuration.h"
+#include "AFQMC/Matrix/SparseMatrix.h"
+
+#include "AFQMC/Matrix/tests/sparse_matrix_helpers.h"
+#include <stdio.h>
+#include <string>
+#include <complex>
+
+
+using std::string;
+using std::complex;
+
+namespace qmcplusplus
+{
+
+template<typename T> void test_init_one_D()
+{
+  SparseMatrix<T> M(1,1);
+
+  // s1D is tuple<uint32_t, T>
+  std::vector< s1D<T> > I;
+  I.push_back(s1D<T>(0, 1.0));
+  M.initFroms1D(I,true);
+
+  int idx = M.find_element(0, 0);
+  REQUIRE(idx == 0);
+  const T &val = M(0,0);
+  REQUIRE(realPart(val) == Approx(1.0));
+}
+
+TEST_CASE("sparse_matrix_init_one_D", "[sparse_matrix]")
+{
+  test_init_one_D<double>();
+  test_init_one_D<complex<double>>();
+}
+
+template<typename T> void test_init_one_D_4()
+{
+  SparseMatrix<T> M(1,4);
+
+  // s1D is tuple<uint32_t, T>
+  std::vector< s1D<T> > I;
+  I.push_back(s1D<T>(1, 4.0));
+  I.push_back(s1D<T>(2, 2.0));
+  M.initFroms1D(I,true);
+
+  //output_matrix(M);
+
+  int idx = M.find_element(0, 0);
+  REQUIRE(idx == -1);
+  idx = M.find_element(0, 1);
+  REQUIRE(idx == 0);
+  idx = M.find_element(0, 2);
+  REQUIRE(idx == 1);
+  idx = M.find_element(0, 3);
+  REQUIRE(idx == -1);
+
+  const T &val0 = M(0,0);
+  REQUIRE(realPart(val0) == Approx(0.0));
+  const T &val1 = M(0,1);
+  REQUIRE(realPart(val1) == Approx(4.0));
+  const T &val2 = M(0,2);
+  REQUIRE(realPart(val2) == Approx(2.0));
+  const T &val3 = M(0,3);
+  REQUIRE(realPart(val3) == Approx(0.0));
+}
+
+TEST_CASE("sparse_matrix_init_one_D_4", "[sparse_matrix]")
+{
+  test_init_one_D_4<double>();
+  test_init_one_D_4<complex<double> >();
+}
+
+
+template<typename T> void test_init_two_D_4()
+{
+  SparseMatrix<T> M(1,4);
+
+  // s2D is tuple<uint32_t, T>
+  std::vector< s2D<T> > I;
+  I.push_back(s2D<T>(0, 0, 4.0));
+  M.initFroms2D(I,true);
+
+  //output_matrix(M);
+
+  int idx = M.find_element(0, 0);
+  REQUIRE(idx == 0);
+
+  T val0 = M(0,0);
+  REQUIRE(realPart(val0) == Approx(4.0));
+}
+
+TEST_CASE("sparse_matrix_init_two_D_4", "[sparse_matrix]")
+{
+  test_init_two_D_4<double>();
+  test_init_two_D_4<complex<double>>();
+}
+
+template<typename T> void test_matrix_init_two_D_22()
+{
+  SparseMatrix<T> M(2,2);
+
+  // s1D is tuple<uint32_t, uint32_t, T>
+  std::vector< s2D<T> > I;
+  I.push_back(s2D<T>(0, 1, 4.0));
+  I.push_back(s2D<T>(1, 1, 2.0));
+  M.initFroms2D(I,true);
+
+  //output_matrix(M);
+
+  int idx = M.find_element(0, 0);
+  REQUIRE(idx == -1);
+  idx = M.find_element(0, 1);
+  REQUIRE(idx == 0);
+  idx = M.find_element(1, 0);
+  REQUIRE(idx == -1);
+  idx = M.find_element(1, 1);
+  REQUIRE(idx == 1);
+
+  T &val0 = M(0,0);
+  REQUIRE(realPart(val0) == Approx(0.0));
+  T &val1 = M(0,1);
+  REQUIRE(realPart(val1) == Approx(4.0));
+  T &val2 = M(1,0);
+  REQUIRE(realPart(val2) == Approx(0.0));
+  T &val3 = M(1,1);
+  REQUIRE(realPart(val3) == Approx(2.0));
+}
+
+TEST_CASE("sparse_matrix_init_two_D_2x2", "[sparse_matrix]")
+{
+  test_matrix_init_two_D_22<double>();
+  test_matrix_init_two_D_22<complex<double>>();
+}
+
+
+template<typename T> void test_matrix_invariant()
+{
+  for (int m = 1; m < 4; m++) {
+    for (int n = 1; n < 4; n++) {
+      std::vector< s2D<T> > I;
+      T *A = new T[m*n];
+      for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+          // Should use random selection of zero elements
+          if ((i+j)%3 == 0)
+          //if (false) // all non-zero (dense)
+          {
+            A[i*n+j] = 0;
+          } else {
+            T val = 1.0*(i+j+1);
+            I.push_back(s2D<T>(i, j, val));
+            A[i*n+j] = val;
+          }
+        }
+      }
+      SparseMatrix<T> M(m,n);
+      M.initFroms2D(I,true);
+#if 0
+      printf("m = %d  n = %d\n",m,n);
+      output_data(A, m*n);
+      printf("CSR:\n");
+      output_matrix(M);
+      printf("\n");
+#endif
+
+      for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+          REQUIRE(M(i,j) == A[i*n+j]);
+        }
+      }
+    }
+  }
+}
+
+TEST_CASE("sparse_matrix_invariant", "[sparse_matrix]")
+{
+  test_matrix_invariant<double>();
+  test_matrix_invariant<complex<double>>();
+}
+
+}
+
--- a/src/AFQMC/Numerics/SparseMatrixOperations.cpp
+++ b/src/AFQMC/Numerics/SparseMatrixOperations.cpp
@ -4,7 +4,9 @@
 #include<cassert>

 #include "AFQMC/config.h"
+#ifdef PRINT_FREE_MEMORY
 #include "sys/sysinfo.h"
+#endif

 #if defined(HAVE_MKL)
 #include "mkl.h"
@ -103,8 +105,9 @@ void product_SpMatV(const int M, const int K,
    const int* rows = A.row_index();
    for(int nr=0; nr<M; nr++,C++,rows++) {
      *C*=beta;
-      for(int i=*rows; i<*(rows+1); i++,val++,cols++)
+      for(int i=*rows; i<*(rows+1); i++,val++,cols++) {
        *C += alpha * (*val) * ( *( B + (*cols) + disp) );
+      }
    }
  } else  {  // EESL: Compressed Matrix 

@ -165,7 +168,14 @@ void product_SpMatV(const int M, const int K,
  matdes[3] = 'C';
  mkl_dcsrmv( &trans, &M, &K, &alpha, matdes, val , col,  row ,  row+1, B, &beta, C );
 #else
-APP_ABORT("ERROR: product_SpMatV only implemented with MKL. \n");
+  const int* cols = col;
+  int disp = 0;
+  const int* rows = row;
+  for(int nr=0; nr<M; nr++,C++,rows++) {
+    *C*=beta;
+    for(int i=*rows; i<*(rows+1); i++,val++,cols++)
+      *C += alpha * (*val) * ( *( B + (*cols) + disp) );
+  }
 #endif
 }

@ -185,7 +195,14 @@ void product_SpMatV(const int M, const int K,
  matdes[3] = 'C';
  mkl_zcsrmv( &trans, &M, &K, &alpha, matdes, val , col,  row ,  row+1, B, &beta, C );
 #else
-APP_ABORT("ERROR: product_SpMatV only implemented with MKL. \n");
+  const int* cols = col;
+  int disp = 0;
+  const int* rows = row;
+  for(int nr=0; nr<M; nr++,C++,rows++) {
+    *C*=beta;
+    for(int i=*rows; i<*(rows+1); i++,val++,cols++)
+      *C += alpha * (*val) * ( *( B + (*cols) + disp) );
+  }
 #endif
 }

@ -205,7 +222,18 @@ void product_SpMatTV(const int M, const int K,
  matdes[3] = 'C';
  mkl_zcsrmv( &trans, &M, &K, &alpha, matdes, val , col,  row ,  row+1, B, &beta, C );
 #else
-APP_ABORT("ERROR: product_SpMatV only implemented with MKL. \n");
+  const int* cols = col;
+  int disp = 0;
+  const int* rows = row;
+  for(int nr=0; nr<K; nr++)
+  {
+    C[nr] *= beta;
+  }
+  for(int nr=0; nr<M; nr++,rows++,B++) {
+    for(int i=*rows; i<*(rows+1); i++,val++,cols++) {
+      *(C + (*cols) + disp) += alpha * (*val) *  (*B);
+    }
+  }
 #endif
 }

@ -227,17 +255,19 @@ void product_SpMatTV(const int M, const int K,

 #else

-APP_ABORT("ERROR: product_SpMatTV only implemented with MKL. \n");
  ComplexType zero = ComplexType(0,0);
  const ComplexType* val = A.values();
  const int* cols = A.column_data();
  int disp = (A.zero_base())?0:-1;
  if( A.format() == 0) {  // CSR
    const int* rows = A.row_index();
-    for(int nr=0; nr<M; nr++,C++,rows++) {
-      *C*=beta;
-      for(int i=*rows; i<*(rows+1); i++,val++,cols++)
-        *C += alpha * (*val) * ( *( B + (*cols) + disp) );
+    for(int nr=0; nr<K; nr++) {
+      C[nr] *= beta;
+    }
+    for(int nr=0; nr<M; nr++,rows++,B++) {
+      for(int i=*rows; i<*(rows+1); i++,val++,cols++) {
+        *(C + (*cols) + disp) += alpha * (*val) *  (*B);
+      }
    }
  } else  {  // EESL: Compressed Matrix 

@ -263,7 +293,27 @@ void product_SpMatM(const int M, const int N, const int K,

 #else

-APP_ABORT("ERROR: product_SpMatM only implemented with MKL. \n");
+  ComplexType zero = ComplexType(0,0);
+  const ComplexType* val = A.values();
+  const int* cols = A.column_data();
+  int disp = (A.zero_base())?0:-1;
+  if( A.format() == 0) {  // CSR
+    const int* rows = A.row_index();
+    ComplexType *baseC = C;
+    const ComplexType *baseB = B;
+    for(int nr=0; nr<M; nr++,rows++,C+=ldc) {
+      for(int nn=0; nn<N; nn++) {
+        C[nn] *= beta;
+      }
+      for(int i=*rows; i<*(rows+1); i++,val++,cols++) {
+        for(int nn=0; nn<N; nn++) {
+          C[nn] += alpha*(*val) * ( *( B + nn + (*cols)*ldb + disp) );
+        }
+      }
+    }
+  } else  {  // EESL: Compressed Matrix
+
+  }

 #endif
 }
@ -285,7 +335,24 @@ void product_SpMatM(const int M, const int N, const int K,

 #else

-APP_ABORT("ERROR: product_SpMatM only implemented with MKL. \n");
+  const RealType* val = A.values();
+  const int* cols = A.column_data();
+  int disp = (A.zero_base())?0:-1;
+  if( A.format() == 0) {  // CSR
+    const int* rows = A.row_index();
+    for(int nr=0; nr<M; nr++,rows++,C+=ldc) {
+      for(int nn=0; nn<N; nn++) {
+        C[nn] *= beta;
+      }
+      for(int i=*rows; i<*(rows+1); i++,val++,cols++) {
+        for(int nn=0; nn<N; nn++) {
+          C[nn] += alpha*(*val) * ( *( B + nn + (*cols)*ldb + disp) );
+        }
+      }
+    }
+  } else  {  // EESL: Compressed Matrix
+
+  }

 #endif
 }
@ -307,7 +374,24 @@ void product_SpMatM(const int M, const int N, const int K,

 #else

-APP_ABORT("ERROR: product_SpMatM only implemented with MKL. \n");
+  const float* val = A.values();
+  const int* cols = A.column_data();
+  int disp = (A.zero_base())?0:-1;
+  if( A.format() == 0) {  // CSR
+    const int* rows = A.row_index();
+    for(int nr=0; nr<M; nr++,rows++,C+=ldc) {
+      for(int nn=0; nn<N; nn++) {
+        C[nn] *= beta;
+      }
+      for(int i=*rows; i<*(rows+1); i++,val++,cols++) {
+        for(int nn=0; nn<N; nn++) {
+          C[nn] += alpha*(*val) * ( *( B + nn + (*cols)*ldb + disp) );
+        }
+      }
+    }
+  } else  {  // EESL: Compressed Matrix
+
+  }

 #endif
 }
@ -468,10 +552,12 @@ bool sparseEigenSystem(RealSpMat &A, int& m0, RealType *eigval, RealType* eigVec
  std::cout<<"Problem size: " <<N <<std::endl;
  std::cout<<"Available memory: ";

+#ifdef PRINT_FREE_MEMORY
  struct sysinfo si;
  sysinfo(&si);
  si.freeram+=si.bufferram;
  std::cout<<int(si.freeram>>20) <<std::endl;
+#endif

  fpm[0] = 1;
  fpm[4] = 1;
@ -579,7 +665,7 @@ bool sparseEigenSystem(ComplexSpMat &A, int& m0, RealType *eigval, ComplexType*
  return true;

 #else
-APP_ABORT("Error: sparseEigenSystem only implemented with MKL library. n");
+//APP_ABORT("Error: sparseEigenSystem only implemented with MKL library. n");
  return false;
 #endif

@ -587,9 +673,15 @@ APP_ABORT("Error: sparseEigenSystem only implemented with MKL library. n");


 template
-void product_SpMatV<ComplexSpMat>(const int nrows, const ComplexSpMat& A, const ComplexType* B, ComplexType* C);
+void product_SpMatV<ComplexSpMat>(int nrows,
+                                  const ComplexSpMat &,
+                                  const ComplexType* B,
+                                  ComplexType* C);
 template
-void product_SpMatV<ComplexSMSpMat>(const int nrows, const ComplexSMSpMat& A, const ComplexType* B, ComplexType* C);
+void product_SpMatV<ComplexSMSpMat>(int nrows,
+                                    const ComplexSMSpMat& A,
+                                    const ComplexType* B,
+                                    ComplexType* C);

 template
 void product_SpMatV<ComplexSpMat>(const int M, const int K,
--- a/src/AFQMC/Numerics/SparseMatrixOperations.h
+++ b/src/AFQMC/Numerics/SparseMatrixOperations.h
@ -27,8 +27,8 @@ void product_SD(const IndexType K,

 template<class T>
 void product_SpMatV(int nrows,
-             T& A, 
-             ComplexType* B, 
+             const T& A,
+             const ComplexType* B,
             ComplexType* C );

 template<class T>
--- a/src/AFQMC/Numerics/tests/CMakeLists.txt
+++ b/src/AFQMC/Numerics/tests/CMakeLists.txt
@ -0,0 +1,28 @@
+#//////////////////////////////////////////////////////////////////////////////////////
+#// This file is distributed under the University of Illinois/NCSA Open Source License.
+#// See LICENSE file in top directory for details.
+#//
+#// Copyright (c) 2017 Jeongnim Kim and QMCPACK developers.
+#//
+#// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+#//
+#// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+#//////////////////////////////////////////////////////////////////////////////////////
+
+MESSAGE("Adding this unit test")
+
+
+SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${QMCPACK_UNIT_TEST_DIR})
+
+SET(SRC_DIR afqmc_numerics)
+SET(UTEST_EXE test_${SRC_DIR})
+SET(UTEST_NAME unit_test_${SRC_DIR})
+
+
+ADD_EXECUTABLE(${UTEST_EXE} test_sparse_ops.cpp)
+TARGET_LINK_LIBRARIES(${UTEST_EXE} ${QMC_UTIL_LIBS} afqmc)
+
+#ADD_TEST(NAME ${UTEST_NAME} COMMAND "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}")
+ADD_UNIT_TEST(${UTEST_NAME} "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}")
+SET_TESTS_PROPERTIES(${UTEST_NAME} PROPERTIES LABELS "unit;afqmc")
+
--- a/src/AFQMC/Numerics/tests/gensparse.py
+++ b/src/AFQMC/Numerics/tests/gensparse.py
@ -0,0 +1,146 @@
+
+# Generate sparse matrix multiply test cases
+
+import numpy as np
+import scipy.sparse as sps
+
+
+def flatten(a):
+  '''Flatten a nested list'''
+  return [item for d in a for item in d]
+
+def string_elements(a):
+  '''Convert elements of a list to strings'''
+  return [str(item) for item in a]
+
+def matrix_vector_mult():
+  A = np.eye(3)
+  X = np.ones((3,1))
+  Y = np.ones((3,1))
+  alpha = 1.0
+  beta = 2.0
+
+  output = generate_case(A, X, Y, alpha, beta, type='double')
+  output += generate_case(A, X, Y, alpha, beta, type='complex<double>')
+
+  A = np.eye(3)
+  A[1,2] = 2.0
+  X = np.arange(0,6).reshape(3,2)
+  Y = np.arange(0,6).reshape(3,2)
+  alpha = 1.0
+  beta = 2.0
+
+  output += '\n'
+  output += generate_case(A, X, Y, alpha, beta, type='double')
+  output += generate_case(A, X, Y, alpha, beta, type='complex<double>')
+
+  A = np.eye(3)
+  A[1,1] = 0.0
+  A[1,2] = 1.0
+  X = np.arange(0,9).reshape(3,3)
+  Y = np.arange(0,9).reshape(3,3)
+  alpha = 1.0
+  beta = 2.0
+
+  output += '\n'
+  output += generate_case(A, X, Y, alpha, beta, type='double')
+  output += generate_case(A, X, Y, alpha, beta, type='complex<double>')
+
+  A = np.eye(2)
+  A[0,1] = 1.0
+  X = np.arange(0,6).reshape(2,3)
+  Y = np.arange(0,6).reshape(2,3)
+  alpha = 1.0
+  beta = 2.0
+
+  output += '\n'
+  output += generate_case(A, X, Y, alpha, beta, type='double')
+  output += generate_case(A, X, Y, alpha, beta, type='complex<double>')
+
+  with open('sparse_mult_cases.cpp', 'w') as f:
+    f.write(output)
+
+
+case_id = 1
+def generate_case(A, X, Y, alpha, beta, type='double'):
+  global case_id
+
+  b = alpha*(A.dot(X)) + beta*Y
+
+  init_list = []
+  for k,v in sps.dok_matrix(A).iteritems():
+    init_list.append("s2Dd({i},{j},{v})".format(i=k[0],j=k[1],v=v))
+
+  x_init_list = flatten(X.tolist())
+  y_init_list = flatten(Y.tolist())
+  ans_init_list = flatten(b.tolist())
+
+  out = '''
+TEST_CASE("sparse_matrix_{test_case}", "[sparse_matrix]")
+{{
+  typedef s2D<{type}> s2Dd;
+
+  const int M = {M};
+  const int N = {N};
+  const int K = {K};
+
+  std::vector<s2Dd> I = {{{init_list}}};
+  SparseMatrix<{type}> A(M,K);
+  A.initFroms2D(I,false);
+
+  {type} B[K*N] = {{{x_init_list}}};
+  {type} C[K*N] = {{{y_init_list}}};
+  {type} alpha = {alpha};
+  {type} beta = {beta};
+
+  {type} finalC[K*N] = {{{ans_init_list}}};
+
+  int ldb = N;
+  int ldc = N;
+  SparseMatrixOperators::product_SpMatM(M, N, K, alpha, A, B, ldb, beta, C, ldc);
+
+  for (int i = 0; i < M; i++)
+  {{
+    for (int j = 0; j < N; j++)
+    {{
+      //printf("%d %d %g %g\\n",i,j,C[i*N + j],finalC[i*N + j]);
+      REQUIRE(realPart(C[i*N + j]) == Approx(realPart(finalC[i*N + j])));
+    }}
+  }}
+}}
+  '''.format(M=A.shape[0],
+             N=Y.shape[1],
+             K=A.shape[1],
+             init_list = ',\n    '.join(init_list),
+             test_case = str(case_id),
+             x_init_list = ','.join(string_elements(x_init_list)),
+             y_init_list = ','.join(string_elements(y_init_list)),
+             alpha = 1.0,
+             beta = 2.0,
+             ans_init_list = ','.join(string_elements(ans_init_list)),
+             type = type
+           )
+
+  case_id += 1
+
+  return out
+
+
+if False:
+  # Some experiments with CSR matrix format in scipy.sparse
+  # Could potentially generate tests for generation of CSR matrices
+  z = np.zeros((1,3))
+  print z
+  z[0,0] = 1.0
+  z[0,1] = 2.0
+  z[0,2] = 3.0
+
+  M = sps.csr_matrix(z)
+
+  print 'nnz = ', M.nnz
+  print 'colums = ',M.indices
+  print 'indptr = ',M.indptr
+  print 'values = ',M.data
+
+matrix_vector_mult()
+
--- a/src/AFQMC/Numerics/tests/sparse_mult_cases.cpp
+++ b/src/AFQMC/Numerics/tests/sparse_mult_cases.cpp
@ -0,0 +1,2 @@
+
+// Run gensparse.py to create this file with content
--- a/src/AFQMC/Numerics/tests/test_sparse_ops.cpp
+++ b/src/AFQMC/Numerics/tests/test_sparse_ops.cpp
@ -0,0 +1,390 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2017 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by:  Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//
+// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
+#include "Configuration.h"
+
+// Always test the fallback code, regardless of MKL definition
+#undef HAVE_MKL
+
+// Avoid the need to link with other libraries just to get APP_ABORT
+#undef APP_ABORT
+#define APP_ABORT(x) std::cout << x; exit(0);
+
+#include "AFQMC/Matrix/SparseMatrix.h"
+#include "AFQMC/Matrix/tests/sparse_matrix_helpers.h"
+
+// Include the templates directly so all the needed types get instantiated
+//  and so the undef of HAVE_MKL has an effect
+#include "AFQMC/Numerics/SparseMatrixOperations.cpp"
+
+#include <stdio.h>
+#include <string>
+#include <complex>
+
+using std::string;
+using std::complex;
+using std::cout;
+
+
+namespace qmcplusplus
+{
+template<typename T> void test_matrix_mult_1_1()
+{
+  SparseMatrix<T> A(1,1);
+
+  std::vector<s1D<T>> I;
+  I.push_back(s1D<T>(0, 1.0));
+  A.initFroms1D(I,true);
+
+  T *B = new T[1];
+  T *C = new T[1];
+  B[0] = 1.0;
+  C[0] = 0.0;
+  int nrows = 1;
+  SparseMatrixOperators::product_SpMatV(nrows, A, B, C);
+
+  REQUIRE(realPart(C[0]) == Approx(1.0));
+}
+
+TEST_CASE("sparse_matrix_mult", "[sparse_matrix]")
+{
+  test_matrix_mult_1_1<complex<double>>();
+}
+
+TEST_CASE("sparse_matrix_mult_2x2", "[sparse_matrix]")
+{
+
+  SparseMatrix<complex<double>> A(2,2);
+
+  std::vector< s2D<complex<double>> > I;
+  I.push_back(s2D<complex<double>>(0,0, 2.0));
+  I.push_back(s2D<complex<double>>(0,1, 4.0));
+  // [[2.0 4.0]
+  //  [0.0 0.0]]
+  A.initFroms2D(I,true);
+
+
+  complex<double> *B = new complex<double>[2];
+  complex<double> *C = new complex<double>[2];
+  B[0] = 4.0;
+  B[1] = 3.0;
+  C[0] = 0.0;
+  C[1] = 0.0;
+  int nrows = 1;
+  SparseMatrixOperators::product_SpMatV(nrows, A, B, C);
+
+  REQUIRE(C[0].real() == Approx(20.0));
+  REQUIRE(C[0].imag() == Approx(0.0));
+
+  REQUIRE(C[1].real() == Approx(0.0));
+  REQUIRE(C[1].imag() == Approx(0.0));
+}
+
+TEST_CASE("sparse_matrix_axpy_1x1", "[sparse_matrix]")
+{
+
+  SparseMatrix<complex<double>> A(1,1);
+
+  std::vector< s1D<complex<double>> > I;
+  I.push_back(s1D<complex<double>>(0, 1.0));
+  A.initFroms1D(I,true);
+
+  //output_matrix(A);
+
+  complex<double> *B = new complex<double>[1];
+  complex<double> *C = new complex<double>[1];
+  B[0] = 1.0;
+  C[0] = 3.0;
+  int nrows = 1;
+  double alpha = 2.0;
+  double beta = 2.0;
+  SparseMatrixOperators::product_SpMatV(nrows, nrows, alpha, A.values(), A.column_data(), A.row_index(), B, beta, C);
+
+  REQUIRE(C[0].real() == Approx(8.0));
+  REQUIRE(C[0].imag() == Approx(0.0));
+}
+
+TEST_CASE("sparse_matrix_zaxpy_2x2", "[sparse_matrix]")
+{
+
+  SparseMatrix<complex<double>> A(2,2);
+
+  std::vector< s2D<complex<double>> > I;
+  I.push_back(s2D<complex<double>>(0,0, 2.0));
+  I.push_back(s2D<complex<double>>(0,1, 4.0));
+  // [[2.0 4.0]
+  //  [0.0 0.0]]
+  A.initFroms2D(I,true);
+
+  //output_matrix(A);
+
+  complex<double> *B = new complex<double>[2];
+  complex<double> *C = new complex<double>[2];
+  B[0] = 4.0;
+  B[1] = 3.0;
+  C[0] = 2.0;
+  C[1] = 1.3;
+  int nrows = 2;
+  double alpha = 2.0;
+  double beta = 2.0;
+  SparseMatrixOperators::product_SpMatV(nrows, nrows, alpha, A.values(), A.column_data(), A.row_index(), B, beta, C);
+
+  REQUIRE(C[0].real() == Approx(44.0));
+  REQUIRE(C[0].imag() == Approx(0.0));
+
+  REQUIRE(C[1].real() == Approx(2.6));
+  REQUIRE(C[1].imag() == Approx(0.0));
+}
+
+TEST_CASE("sparse_matrix_daxpy_2x2", "[sparse_matrix]")
+{
+
+  SparseMatrix<double> A(2,2);
+
+  std::vector< s2D<double> > I;
+  I.push_back(s2D<double>(0,0, 2.0));
+  I.push_back(s2D<double>(0,1, 4.0));
+  // [[2.0 4.0]
+  //  [0.0 0.0]]
+  A.initFroms2D(I,true);
+
+  //output_matrix(A);
+
+  double *B = new double[2];
+  double *C = new double[2];
+  B[0] = 4.0;
+  B[1] = 3.0;
+  C[0] = 2.0;
+  C[1] = 1.3;
+  int nrows = 2;
+  double alpha = 2.0;
+  double beta = 2.0;
+  SparseMatrixOperators::product_SpMatV(nrows, nrows, alpha, A.values(), A.column_data(), A.row_index(), B, beta, C);
+
+  REQUIRE(C[0] == Approx(44.0));
+  REQUIRE(C[1] == Approx(2.6));
+}
+
+TEST_CASE("sparse_matrix_zaxpy_T_2x2", "[sparse_matrix]")
+{
+
+  SparseMatrix<complex<double>> A(2,2);
+
+  std::vector< s2D<complex<double>> > I;
+  I.push_back(s2D<complex<double>>(0,0, 2.0));
+  I.push_back(s2D<complex<double>>(0,1, 4.0));
+  // [[2.0 4.0]
+  //  [0.0 0.0]]
+  A.initFroms2D(I,true);
+
+  //output_matrix(A);
+
+  complex<double> *B = new complex<double>[2];
+  complex<double> *C = new complex<double>[2];
+  B[0] = 4.0;
+  B[1] = 3.0;
+  C[0] = 2.0;
+  C[1] = 1.3;
+  int nrows = 2;
+  double alpha = 2.0;
+  double beta = 2.0;
+  SparseMatrixOperators::product_SpMatTV(nrows, nrows, alpha, A, B, beta, C);
+
+  for (int i = 0; i < nrows; i++) {
+    cout << C[i] << std::endl;
+  }
+
+  REQUIRE(C[0].real() == Approx(20.0));
+  REQUIRE(C[0].imag() == Approx(0.0));
+
+  REQUIRE(C[1].real() == Approx(34.6));
+  REQUIRE(C[1].imag() == Approx(0.0));
+
+  C[0] = 2.0;
+  C[1] = 1.3;
+  SparseMatrixOperators::product_SpMatTV(nrows, nrows, alpha, A.values(), A.column_data(), A.row_index(), B, beta, C);
+
+  REQUIRE(C[0].real() == Approx(20.0));
+  REQUIRE(C[0].imag() == Approx(0.0));
+
+  REQUIRE(C[1].real() == Approx(34.6));
+  REQUIRE(C[1].imag() == Approx(0.0));
+
+}
+
+TEST_CASE("sparse_matrix_mm_2x2", "[sparse_matrix]")
+{
+
+  SparseMatrix<complex<double>> A(2,2);
+
+  std::vector< s2D<complex<double>> > I;
+  I.push_back(s2D<complex<double>>(0,0, 2.0));
+  I.push_back(s2D<complex<double>>(0,1, 4.0));
+  // [[2.0 4.0]
+  //  [0.0 0.0]]
+  A.initFroms2D(I,true);
+
+  //output_matrix(A);
+
+  complex<double> *B = new complex<double>[4];
+  complex<double> *C = new complex<double>[4];
+  B[0] = 4.0;
+  B[1] = 3.0;
+  B[2] = 1.0;
+  B[3] = 5.0;
+
+  C[0] = 2.0;
+  C[1] = 1.3;
+  C[2] = 1.3;
+  C[3] = 1.3;
+  int nrows = 2;
+  double alpha = 2.0;
+  double beta = 2.0;
+  SparseMatrixOperators::product_SpMatM(nrows, nrows, nrows, alpha, A, B, nrows, beta, C, nrows);
+
+  //std::cout << C[0] << std::endl;
+  //std::cout << C[1] << std::endl;
+  //std::cout << C[2] << std::endl;
+  //std::cout << C[3] << std::endl;
+
+  REQUIRE(C[0].real() == Approx(28.0));
+  REQUIRE(C[0].imag() == Approx(0.0));
+
+  REQUIRE(C[1].real() == Approx(54.6));
+  REQUIRE(C[1].imag() == Approx(0.0));
+
+  REQUIRE(C[2].real() == Approx(2.6));
+  REQUIRE(C[2].imag() == Approx(0.0));
+
+  REQUIRE(C[3].real() == Approx(2.6));
+  REQUIRE(C[3].imag() == Approx(0.0));
+}
+
+TEST_CASE("sparse_matrix_real_mm_2x2", "[sparse_matrix]")
+{
+
+  const int N = 2; // nrows of A
+  const int M = 3; // ncols of C
+  const int K = 4; // ncols of A
+  // A is MxK
+  // B is KxN
+  // C is MxN
+  SparseMatrix<double> A(M,K);
+
+  std::vector< s2D<double> > I;
+  I.push_back(s2D<double>(0,0, 2.0));
+  I.push_back(s2D<double>(0,1, 4.0));
+  I.push_back(s2D<double>(1,2, 5.0));
+  I.push_back(s2D<double>(2,1, 1.0));
+  // [[2.0 4.0 0.0 0.0]
+  //  [0.0 0.0 5.0 0.0]
+  //  [1.0 0.0 0.0 0.0]]
+  A.initFroms2D(I,true);
+
+  //output_matrix(A);
+
+  double *B = new double[K*N];
+  double *C = new double[M*N];
+  B[0] = 4.0;
+  B[1] = 3.0;
+  B[2] = 1.0;
+  B[3] = 5.0;
+
+  C[0] = 2.0;
+  C[1] = 1.3;
+  C[2] = 1.3;
+  C[3] = 1.3;
+  double alpha = 2.0;
+  double beta = 2.0;
+  int ldb = N;
+  int ldc = N;
+  SparseMatrixOperators::product_SpMatM(M, N, K, alpha, A, B, ldb, beta, C, ldc);
+
+  //for (int i = 0; i < M*N; i++)
+  //{
+  //  std::cout << C[i] << std::endl;
+  //}
+
+  REQUIRE(C[0] == Approx(28.0));
+  REQUIRE(C[1] == Approx(54.6));
+  REQUIRE(C[2] == Approx(2.6));
+  REQUIRE(C[3] == Approx(2.6));
+  REQUIRE(C[4] == Approx(2.0));
+  REQUIRE(C[5] == Approx(10.0));
+
+}
+
+
+TEST_CASE("sparse_matrix_float_mm_2x2", "[sparse_matrix]")
+{
+
+  const int N = 2; // nrows of A
+  const int M = 3; // ncols of C
+  const int K = 4; // ncols of A
+  // A is MxK
+  // B is KxN
+  // C is MxN
+  SparseMatrix<float> A(M,K);
+
+  std::vector< s2D<double> > I;
+  I.push_back(s2D<double>(0,0, 2.0));
+  I.push_back(s2D<double>(0,1, 4.0));
+  I.push_back(s2D<double>(0,3, 1.0));
+  I.push_back(s2D<double>(1,2, 5.0));
+  I.push_back(s2D<double>(2,1, 1.0));
+  // [[2.0 4.0 0.0 1.0]
+  //  [0.0 0.0 5.0 0.0]
+  //  [1.0 0.0 0.0 0.0]]
+  A.initFroms2D(I,true);
+
+  //output_matrix(A);
+
+  float *B = new float[K*N];
+  float *C = new float[M*N];
+  for (int i = 0; i < K*N; i++) B[i] = 0;
+  for (int i = 0; i < M*N; i++) C[i] = 0;
+  B[0] = 4.0;
+  B[1] = 3.0;
+  B[2] = 1.0;
+  B[3] = 5.0;
+  B[6] = 1.0;
+
+  C[0] = 2.0;
+  C[1] = 1.3;
+  C[2] = 1.3;
+  C[3] = 1.3;
+  float alpha = 2.0;
+  float beta = 2.0;
+  int ldb = N;
+  int ldc = N;
+  SparseMatrixOperators::product_SpMatM(M, N, K, alpha, A, B, ldb, beta, C, ldc);
+
+  //for (int i = 0; i < M*N; i++)
+  //{
+  //  std::cout << C[i] << std::endl;
+  //}
+
+  REQUIRE(C[0] == Approx(30.0));
+  REQUIRE(C[1] == Approx(54.6));
+  REQUIRE(C[2] == Approx(2.6));
+  REQUIRE(C[3] == Approx(2.6));
+  REQUIRE(C[4] == Approx(2.0));
+  REQUIRE(C[5] == Approx(10.0));
+
+}
+
+#include "sparse_mult_cases.cpp"
+
+}
+
--- a/src/AFQMC/Utilities/tuple_iterator.hpp
+++ b/src/AFQMC/Utilities/tuple_iterator.hpp
@ -80,6 +80,9 @@ public:
   tuple_iterator(FirstIter first_, SecondIter second_, ThirdIter third_) :
       first(first_), second(second_),third(third_){}

+    // Need default constructor for inplace_merge in some versions of the C++ std library
+    tuple_iterator() {}
+
    using difference_type = D;
    using reference = R;

--- a/src/AFQMC/Walkers/DistWalkerHandler.cpp
+++ b/src/AFQMC/Walkers/DistWalkerHandler.cpp
@ -347,6 +347,7 @@ bool DistWalkerHandler::setup(int cr, int nc, int tgn, MPI_Comm heads_comm, MPI_
 bool DistWalkerHandler::clean()
 {
  walkers.clear();
+  return true;
 }

 // called at the beginning of each executable section
@ -627,13 +628,14 @@ void DistWalkerHandler::popControl()
        *(it+data_displ[INFO]) = minus;
      } else {
        ComplexType w0 = *(it+data_displ[WEIGHT]); 
-        if( std::abs(w0) < std::abs(min_weight)) 
+        if( std::abs(w0) < std::abs(min_weight)) {
          if( (int)(distribution(generator) + std::abs(w0)/reset_weight) == 0 ) { 
            *(it+data_displ[INFO]) = minus; 
            empty_spots.push_back(cnt);
          } else {
            *(it+data_displ[WEIGHT]) = reset_weight;
          } 
+        }
      }
    }

--- a/src/AFQMC/Walkers/LocalWalkerHandler.cpp
+++ b/src/AFQMC/Walkers/LocalWalkerHandler.cpp
@ -280,6 +280,7 @@ bool LocalWalkerHandler::clean()
 {
  for(int i=0; i<walkers.size(); i++) 
    walkers[i].clear();
+  return true;
 }

 // called at the beginning of each executable section
--- a/src/Particle/ParticleSet.cpp
+++ b/src/Particle/ParticleSet.cpp
@ -333,10 +333,19 @@ bool ParticleSet::get(std::ostream& os) const
  for (int i=0; i<SubPtcl.size(); i++)
    os << SubPtcl[i] << " ";
  os <<"\n\n    " << LocalNum << "\n\n";
-  for (int i=0; i<LocalNum; i++)
+  const int maxParticlesToPrint = 10;
+  int numToPrint = std::min(LocalNum, maxParticlesToPrint);
+
+  for (int i=0; i<numToPrint; i++)
  {
    os << "    " << mySpecies.speciesName[GroupID[i]]  << R[i] << std::endl;
  }
+
+  if (numToPrint < LocalNum)
+  {
+    os << "    (... and " << (LocalNum-numToPrint) << " more particle positions ...)" << std::endl;
+  }
+
  return true;
 }

--- a/src/Particle/tests/CMakeLists.txt
+++ b/src/Particle/tests/CMakeLists.txt
@ -16,7 +16,7 @@ SET(UTEST_EXE test_${SRC_DIR})
 SET(UTEST_NAME unit_test_${SRC_DIR})


-ADD_EXECUTABLE(${UTEST_EXE} test_particle.cpp test_distance_table.cpp)
+ADD_EXECUTABLE(${UTEST_EXE} test_particle.cpp test_distance_table.cpp test_walker.cpp)
 TARGET_LINK_LIBRARIES(${UTEST_EXE} qmcbase qmcutil ${QMC_UTIL_LIBS} ${MPI_LIBRARY})

 #ADD_TEST(NAME ${UTEST_NAME} COMMAND "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}")
--- a/src/Particle/tests/test_walker.cpp
+++ b/src/Particle/tests/test_walker.cpp
@ -0,0 +1,108 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by:  Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//
+// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "catch.hpp"
+
+#include "Configuration.h"
+#include "Particle/MCWalkerConfiguration.h"
+#include "Particle/HDFWalkerOutput.h"
+#include "Particle/HDFWalkerInput_0_4.h"
+
+
+
+#include <stdio.h>
+#include <string>
+
+using std::string;
+
+namespace qmcplusplus
+{
+
+
+typedef Walker<QMCTraits, PtclOnLatticeTraits> Walker_t;
+
+TEST_CASE("walker", "[particle]")
+{
+  OHMMS::Controller->initialize(0, NULL);
+  OhmmsInfo("testlogfile");
+
+  Walker_t w(1);
+  REQUIRE(w.R.size() == 1);
+  w.R[0] = 1.0;
+
+  REQUIRE(w.R[0][0] == Approx(1.0));
+}
+
+
+TEST_CASE("walker HDF read and write", "[particle]")
+{
+  OHMMS::Controller->initialize(0, NULL);
+  Communicate *c = OHMMS::Controller;
+  OhmmsInfo("testlogfile");
+
+  Walker_t w1(1);
+  w1.R[0] = 1.0;
+
+  Walker_t w2(1);
+  w2.R[0] = 0.5;
+
+  MCWalkerConfiguration W;
+
+
+  W.setName("electrons");
+
+  W.create(1);
+  W.R[0][0] = 0.0;
+  W.R[0][1] = 1.0;
+  W.R[0][2] = 2.0;
+
+  // This method sets ownership to false so class does not attempt to
+  // free the walker elements.
+  W.copyWalkerRefs(&w1,&w2);
+
+  REQUIRE(W.getActiveWalkers() == 2);
+
+  std::vector<int> walker_offset(c->size()+1);
+
+  walker_offset[0] = 0;
+  int offset = 2;
+  for (int i = 0; i < c->size(); i++)
+  {
+    walker_offset[i+1] = offset;
+    offset += 2;
+  }
+
+  W.setWalkerOffsets(walker_offset);
+
+  c->setName("walker_test");
+  HDFWalkerOutput hout(W, "this string apparently does nothing", c);
+  hout.dump(W, 0);
+
+  c->barrier();
+
+  MCWalkerConfiguration W2;
+  W2.setName("electrons");
+  W2.create(1);
+
+  HDFVersion version(0,4);
+  HDFWalkerInput_0_4 hinp(W2, c, version);
+  bool okay = hinp.read_hdf5("walker_test");
+  REQUIRE(okay);
+
+  REQUIRE(W2.getActiveWalkers() == 2);
+  for (int i = 0; i < 3; i++)
+  {
+    REQUIRE(W2[0]->R[0][i] == w1.R[0][i]);
+    REQUIRE(W2[1]->R[0][i] == w2.R[0][i]);
+  }
+}
+
+}
--- a/src/QMCApp/QMCMain.cpp
+++ b/src/QMCApp/QMCMain.cpp
@ -131,6 +131,8 @@ bool QMCMain::execute()

 #ifdef BUILD_AFQMC
  if(simulationType == "afqmc") {
+    NewTimer *t2 = TimerManager.createTimer("Total", timer_level_coarse);
+    ScopedTimer t2_scope(t2);
    app_log() << std::endl << "/*************************************************\n"
                      << " ********  This is an AFQMC calculation   ********\n"
                      << " *************************************************" <<std::endl;
@ -183,12 +185,10 @@ bool QMCMain::execute()
  }
 #endif

-  NewTimer *t2 = new NewTimer("Total", timer_level_coarse);
-  TimerManager.addTimer(t2);
+  NewTimer *t2 = TimerManager.createTimer("Total", timer_level_coarse);
  t2->start();

-  NewTimer *t3 = new NewTimer("Startup", timer_level_coarse);
-  TimerManager.addTimer(t3);
+  NewTimer *t3 = TimerManager.createTimer("Startup", timer_level_coarse);
  t3->start();

  //validate the input file
@ -625,8 +625,7 @@ bool QMCMain::runQMC(xmlNodePtr cur)
    qmcDriver->process(cur);
    OhmmsInfo::flush();
    Timer qmcTimer;
-    NewTimer *t1 = new NewTimer(qmcDriver->getEngineName(), timer_level_coarse);
-    TimerManager.addTimer(t1);
+    NewTimer *t1 = TimerManager.createTimer(qmcDriver->getEngineName(), timer_level_coarse);
    t1->start();
    qmcDriver->run();
    t1->stop();
--- a/src/QMCDrivers/WalkerControlBase.cpp
+++ b/src/QMCDrivers/WalkerControlBase.cpp
@ -252,6 +252,12 @@ int WalkerControlBase::branch(int iter, MCWalkerConfiguration& W, RealType trigg
  }
  //set the global number of walkers
  W.setGlobalNumWalkers(nw_tot);
+  // Update offsets in non-MPI case, needed to ensure checkpoint outputs the correct
+  // number of configurations.
+  if (W.WalkerOffsets.size() == 2)
+  {
+    W.WalkerOffsets[1] = nw_tot;
+  }
  return nw_tot;
 }

--- a/src/QMCWaveFunctions/TrialWaveFunction.cpp
+++ b/src/QMCWaveFunctions/TrialWaveFunction.cpp
@ -98,33 +98,18 @@ TrialWaveFunction::addOrbital(OrbitalBase* aterm, const std::string& aname, bool
    FermionWF=dynamic_cast<FermionBase*>(aterm);
  }

-//#if defined(QMC_CUDA)
-  char name1[64],name2[64],name3[64],name4[64], name5[64], name6[64];
-  sprintf(name1,"WaveFunction::%s_V",aname.c_str());
-  sprintf(name2,"WaveFunction::%s_VGL",aname.c_str());
-  sprintf(name3,"WaveFunction::%s_accept",aname.c_str());
-  sprintf(name4,"WaveFunction::%s_NLratio",aname.c_str());
-  sprintf(name5,"WaveFunction::%s_recompute",aname.c_str());
-  sprintf(name6,"WaveFunction::%s_derivs",aname.c_str());
-  NewTimer *vtimer=new NewTimer(name1);
-  NewTimer *vgltimer=new NewTimer(name2);
-  NewTimer *accepttimer=new NewTimer(name3);
-  NewTimer *NLtimer=new NewTimer(name4);
-  NewTimer *recomputetimer=new NewTimer(name5);
-  NewTimer *derivstimer=new NewTimer(name6);
-  myTimers.push_back(vtimer);
-  myTimers.push_back(vgltimer);
-  myTimers.push_back(accepttimer);
-  myTimers.push_back(NLtimer);
-  myTimers.push_back(recomputetimer);
-  myTimers.push_back(derivstimer);
-  TimerManager.addTimer(vtimer);
-  TimerManager.addTimer(vgltimer);
-  TimerManager.addTimer(accepttimer);
-  TimerManager.addTimer(NLtimer);
-  TimerManager.addTimer(recomputetimer);
-  TimerManager.addTimer(derivstimer);
-//#endif
+  std::vector<std::string> suffixes(6);
+  suffixes[0] = "_V";
+  suffixes[1] = "_VGL";
+  suffixes[2] = "_accept";
+  suffixes[3] = "_NLratio";
+  suffixes[4] = "_recompute";
+  suffixes[5] = "_derivs";
+  for (int i = 0; i < suffixes.size(); i++)
+  {
+    std::string name = "WaveFunction::" + aname + suffixes[i];
+    myTimers.push_back(TimerManager.createTimer(name));
+  }
 }


--- a/src/Utilities/NewTimer.cpp
+++ b/src/Utilities/NewTimer.cpp
@ -70,6 +70,14 @@ void TimerManagerClass::addTimer(NewTimer* t)
  }
 }

+NewTimer *TimerManagerClass::createTimer(const std::string& myname, timer_levels mytimer)
+{
+  NewTimer *t = new NewTimer(myname, mytimer);
+  addTimer(t);
+  return t;
+}
+
+
 void TimerManagerClass::reset()
 {
  for (int i=0; i<TimerList.size(); i++)
--- a/src/Utilities/NewTimer.h
+++ b/src/Utilities/NewTimer.h
@ -140,6 +140,7 @@ public:
  TimerManagerClass():timer_threshold(timer_level_coarse),max_timer_id(1),
    max_timers_exceeded(false) {}
  void addTimer (NewTimer* t);
+  NewTimer *createTimer(const std::string& myname, timer_levels mytimer = timer_level_fine);

  void push_timer(NewTimer *t)
  {
@ -398,6 +399,23 @@ public:
 #endif
 };

+// Wrapper for timer that starts on construction and stops on destruction
+class ScopedTimer
+{
+public:
+  ScopedTimer(NewTimer *t) : timer(t)
+  {
+    if (timer) timer->start();
+  }
+
+  ~ScopedTimer()
+  {
+    if (timer) timer->stop();
+  }
+private:
+  NewTimer *timer;
+};
+
 struct TimerComparator
 {
  inline bool operator()(const NewTimer *a, const NewTimer *b)
@ -406,6 +424,7 @@ struct TimerComparator
  }
 };

+
 }

 #endif
--- a/src/Utilities/tests/test_timer.cpp
+++ b/src/Utilities/tests/test_timer.cpp
@ -48,18 +48,30 @@ TEST_CASE("test_timer_stack", "[utilities]")
  // Use a local version rather than the global TimerManager, otherwise
  //  changes will persist from test to test.
  TimerManagerClass tm;
-  NewTimer t1("timer1", timer_level_coarse);
-  tm.addTimer(&t1);
+  NewTimer *t1 = tm.createTimer("timer1", timer_level_coarse);
 #if ENABLE_TIMERS
 #ifdef USE_STACK_TIMERS
-  t1.start();
-  REQUIRE(tm.current_timer() == &t1);
-  t1.stop();
+  t1->start();
+  REQUIRE(tm.current_timer() == t1);
+  t1->stop();
  REQUIRE(tm.current_timer() == NULL);
 #endif
 #endif
 }

+TEST_CASE("test_timer_scoped", "[utilities]")
+{
+  TimerManagerClass tm;
+  NewTimer *t1 = tm.createTimer("timer1", timer_level_coarse);
+  {
+    ScopedTimer st(t1);
+  }
+#if ENABLE_TIMERS
+  REQUIRE(t1->get_total() == Approx(1.0));
+  REQUIRE(t1->get_num_calls() == 1);
+#endif
+}
+
 TEST_CASE("test_timer_flat_profile", "[utilities]")
 {
  TimerManagerClass tm;
--- a/tests/test_automation/nightly_ornl_oxygen.sh
+++ b/tests/test_automation/nightly_ornl_oxygen.sh
@ -59,7 +59,7 @@ case $sys in
 	export PATH=$HOME/apps/openmpi-2.0.2/bin/:$PATH
 	export LD_LIBRARY_PATH=$HOME/apps/openmpi-2.0.2/lib/:$LD_LIBRARY_PATH
 	export QMCPACK_TEST_SUBMIT_NAME=GCC-Release
-	ctest -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV -E 'long' --timeout 1800
+	ctest -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DBUILD_AFQMC=1 -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV -E 'long' --timeout 1800
 #	module unload mpi
 	export PATH=$OLD_PATH
 	export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH
@ -98,7 +98,7 @@ case $sys in
 	export OLD_MKLROOT=$MKLROOT
 	source /opt/intel2017/bin/compilervars.sh intel64
 	export QMCPACK_TEST_SUBMIT_NAME=Intel2017-Release
-	ctest -DCMAKE_C_COMPILER=mpiicc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV -E 'long' --timeout 1800
+	ctest -DCMAKE_C_COMPILER=mpiicc -DCMAKE_CXX_COMPILER=mpiicpc -DBUILD_AFQMC=1 -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV -E 'long' --timeout 1800
 	export PATH=$OLD_PATH
 	export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH
 	export MANPATH=$OLD_MANPATH
--- a/tests/test_automation/weekly_ornl_oxygen.sh
+++ b/tests/test_automation/weekly_ornl_oxygen.sh
@ -60,7 +60,7 @@ case $sys in
 	export PATH=$HOME/apps/openmpi-2.0.2/bin/:$PATH
 	export LD_LIBRARY_PATH=$HOME/apps/openmpi-2.0.2/lib/:$LD_LIBRARY_PATH
 	export QMCPACK_TEST_SUBMIT_NAME=GCC-Release
-	ctest -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV --timeout 7200
+	ctest -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DBUILD_AFQMC=1 -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV --timeout 7200
 #	module unload mpi
 	export PATH=$OLD_PATH
 	export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH
@ -99,7 +99,7 @@ case $sys in
 	export OLD_MKLROOT=$MKLROOT
 	source /opt/intel2017/bin/compilervars.sh intel64
 	export QMCPACK_TEST_SUBMIT_NAME=Intel2017-Release
-	ctest -DCMAKE_C_COMPILER=mpiicc -DCMAKE_CXX_COMPILER=mpiicpc -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV --timeout 7200
+	ctest -DCMAKE_C_COMPILER=mpiicc -DCMAKE_CXX_COMPILER=mpiicpc -DBUILD_AFQMC=1 -DQMC_DATA=${QMC_DATA} -DENABLE_TIMERS=1 -S $PWD/../qmcpack/CMake/ctest_script.cmake,release -VV --timeout 7200
 	export PATH=$OLD_PATH
 	export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH
 	export MANPATH=$OLD_MANPATH
				`@ -0,0 +1,2 @@`

				`// Run gensparse.py to create this file with content`