Merge branch 'develop' into nx_batched_drivers

2022-09-22 15:02:16 -04:00 · 2022-09-22 15:02:16 -04:00 · c31a404b42
parent 9765e74396 5f5b608847
commit c31a404b42
3 changed files with 31 additions and 36 deletions
--- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp
+++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp
@ -1126,8 +1126,8 @@ public:
  void vHS(MatA& Xw, MatB&& v, double a = 1., double c = 0.)
  {
    int nkpts = nopk.size();
-    int nwalk = Xw.size(1);
-    assert(v.size(0) == nwalk);
+    int nwalk = std::get<1>(Xw.sizes());
+    assert(v.size() == nwalk);
    int nspin     = (walker_type == COLLINEAR ? 2 : 1);
    int nmo_tot   = std::accumulate(nopk.begin(), nopk.end(), 0);
    int nmo_max   = *std::max_element(nopk.begin(), nopk.end());
@ -1139,7 +1139,7 @@ public:
    SPComplexType halfa(0.5 * a, 0.0);
    size_t local_memory_needs = nmo_max * nmo_max * nwalk;
    if (TMats.num_elements() < local_memory_needs)
-      TMats.reextent({local_memory_needs, 1});
+      TMats.reextent({static_cast<ptrdiff_t>(local_memory_needs), 1});

    using vType = typename std::decay<MatB>::type::element;
    boost::multi::array_ref<vType, 3> v3D(to_address(v.origin()), {nwalk, nmo_tot, nmo_tot});
--- a/src/AFQMC/Wavefunctions/NOMSD.icc
+++ b/src/AFQMC/Wavefunctions/NOMSD.icc
@ -434,16 +434,6 @@ void NOMSD<devPsiT>::MixedDensityMatrix_for_E_from_SM(const MatSM& SM,
  TG.local_barrier();
 }

-template<class T, class... As>
-auto Sizes(boost::multi::array<T, 2, As...> const& m)
->decltype(m.sizes()) {
-	return m.sizes(); }
-
-template<class Matrix>
-auto Sizes(Matrix const& m)
->decltype(std::array<long, 2>{static_cast<long>(std::get<0>(m.sizes())), static_cast<long>(std::get<1>(m.sizes()))}) {
-	return std::array<long, 2>{static_cast<long>(std::get<0>(m.sizes())), static_cast<long>(std::get<1>(m.sizes()))}; }
-
 /* 
   * Computes the density matrix for a given reference. 
   * G and Ov are expected to be in shared memory.
@ -480,9 +470,9 @@ void NOMSD<devPsiT>::DensityMatrix_shared(const WlkSet& wset,
  if (walker_type != COLLINEAR)
  {
    if (herm) {
-      assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
    } else {
-      assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
    }

    auto Gdims = dm_dims(not compact, Alpha);
@ -507,14 +497,14 @@ void NOMSD<devPsiT>::DensityMatrix_shared(const WlkSet& wset,
  else
  {
    if (herm) {
-      assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
    } else {
-      assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
    }
    if (herm) {
-      assert(std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).second);
+      assert(std::get<0>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<1>(RefB.sizes()) == dm_dims(false, Beta).second);
    } else {
-      assert(std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).second);
+      assert(std::get<1>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<0>(RefB.sizes()) == dm_dims(false, Beta).second);
    }
    StaticVector ovlp2(iextensions<1u>{2 * nw}, buffer_manager.get_generator().template get_allocator<ComplexType>());
    fill_n(ovlp2.origin(), 2 * nw, ComplexType(0.0));
@ -618,9 +608,9 @@ void NOMSD<devPsiT>::DensityMatrix_batched(const WlkSet& wset,
  if (walker_type != COLLINEAR)
  {
    if (herm) {
-      assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
    } else {
-      assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
    }
    Static3Tensor G3D_({nbatch__, GAdims.first, GAdims.second},
                       buffer_manager.get_generator().template get_allocator<ComplexType>());
@ -658,14 +648,14 @@ void NOMSD<devPsiT>::DensityMatrix_batched(const WlkSet& wset,
  else
  {
    if (herm) {
-      assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
    } else {
-      assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
+      assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
    }
    if (herm) {
-      assert(std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).second);
+      assert(std::get<0>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<1>(RefB.sizes()) == dm_dims(false, Beta).second);
    } else {
-      assert(std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).second);
+      assert(std::get<1>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<0>(RefB.sizes()) == dm_dims(false, Beta).second);
    }

    auto GBdims = dm_dims(not compact, Beta);
--- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp
@ -60,18 +60,19 @@ bool SplineR2R<ST>::write_splines(hdf_archive& h5f)
  spl_coefs has a complicated layout depending on dimensionality of splines. 
  Luckily, for our purposes, we can think of spl_coefs as pointing to a 
  matrix of size BasisSetSize x (OrbitalSetSize + padding), with the spline 
-  index adjacent in memory. NB: The orbital index is SIMD aligned and
-  therefore may include padding.
+  index adjacent in memory. The orbital index is SIMD aligned and therefore 
+  may include padding.
    
-  In other words, due to SIMD alignment, Nsplines may be larger than the 
-  actual number of splined orbitals, which means that in practice rot_mat 
-  may be smaller than the number of 'columns' in the coefs array. 
-  Therefore, we put rot_mat inside "tmpU". The padding of the splines 
-  is at the end, so if we put rot_mat at top left corner of tmpU, then 
-  we can apply tmpU to the coefs safely regardless of padding.   
+  As a result, due to SIMD alignment, Nsplines may be larger than the 
+  actual number of splined orbitals. This means that in practice rot_mat 
+  may be smaller than the number of 'columns' in the coefs array! 
+  To fix this problem, we put rot_mat inside "tmpU", which is guaranteed to have
+  the 'right' size to match spl_coefs. The padding of the splines is at the end, 
+  so if we put rot_mat at top left corner of tmpU, then we can apply tmpU to the 
+  coefs safely regardless of padding.   
  
-  Typically, BasisSetSize >> OrbitalSetSize, so the spl_coefs "matrix"
-  is very tall and skinny.
+  NB: For splines (typically) BasisSetSize >> OrbitalSetSize, so the spl_coefs 
+  "matrix" is very tall and skinny.
 */
 template<typename ST>
 void SplineR2R<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_copy)
@ -99,6 +100,7 @@ void SplineR2R<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co
  }

  // Apply rotation the dumb way b/c I can't get BLAS::gemm to work...
+  std::vector<RealType> new_coefs(coefs_tot_size, 0);
  for (auto i = 0; i < BasisSetSize; i++)
  {
    for (auto j = 0; j < Nsplines; j++)
@ -110,10 +112,13 @@ void SplineR2R<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co
        const auto index = i * Nsplines + k;
        newval += *(spl_coefs + index) * tmpU[k][j];
      }
-      *(spl_coefs + cur_elem) = newval;
+      new_coefs[cur_elem] = newval;
    }
  }

+  // Update the coefs
+  std::copy(new_coefs.begin(), new_coefs.end(), spl_coefs);
+
  /*
    // Here is my attempt to use gemm but it doesn't work...
    int smaller_BasisSetSize   = static_cast<int>(BasisSetSize);