Merge branch 'develop' into nx_batched_drivers

This commit is contained in:
Paul R. C. Kent 2022-09-22 15:02:16 -04:00 committed by GitHub
commit c31a404b42
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 36 deletions

View File

@@ -1126,8 +1126,8 @@ public:
void vHS(MatA& Xw, MatB&& v, double a = 1., double c = 0.)
{
int nkpts = nopk.size();
int nwalk = Xw.size(1);
assert(v.size(0) == nwalk);
int nwalk = std::get<1>(Xw.sizes());
assert(v.size() == nwalk);
int nspin = (walker_type == COLLINEAR ? 2 : 1);
int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0);
int nmo_max = *std::max_element(nopk.begin(), nopk.end());
@@ -1139,7 +1139,7 @@ public:
SPComplexType halfa(0.5 * a, 0.0);
size_t local_memory_needs = nmo_max * nmo_max * nwalk;
if (TMats.num_elements() < local_memory_needs)
TMats.reextent({local_memory_needs, 1});
TMats.reextent({static_cast<ptrdiff_t>(local_memory_needs), 1});
using vType = typename std::decay<MatB>::type::element;
boost::multi::array_ref<vType, 3> v3D(to_address(v.origin()), {nwalk, nmo_tot, nmo_tot});

View File

@@ -434,16 +434,6 @@ void NOMSD<devPsiT>::MixedDensityMatrix_for_E_from_SM(const MatSM& SM,
TG.local_barrier();
}
template<class T, class... As>
auto Sizes(boost::multi::array<T, 2, As...> const& m)
->decltype(m.sizes()) {
return m.sizes(); }
template<class Matrix>
auto Sizes(Matrix const& m)
->decltype(std::array<long, 2>{static_cast<long>(std::get<0>(m.sizes())), static_cast<long>(std::get<1>(m.sizes()))}) {
return std::array<long, 2>{static_cast<long>(std::get<0>(m.sizes())), static_cast<long>(std::get<1>(m.sizes()))}; }
/*
* Computes the density matrix for a given reference.
* G and Ov are expected to be in shared memory.
@@ -480,9 +470,9 @@ void NOMSD<devPsiT>::DensityMatrix_shared(const WlkSet& wset,
if (walker_type != COLLINEAR)
{
if (herm) {
assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
} else {
assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
}
auto Gdims = dm_dims(not compact, Alpha);
@@ -507,14 +497,14 @@ void NOMSD<devPsiT>::DensityMatrix_shared(const WlkSet& wset,
else
{
if (herm) {
assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
} else {
assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
}
if (herm) {
assert(std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).second);
assert(std::get<0>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<1>(RefB.sizes()) == dm_dims(false, Beta).second);
} else {
assert(std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).second);
assert(std::get<1>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<0>(RefB.sizes()) == dm_dims(false, Beta).second);
}
StaticVector ovlp2(iextensions<1u>{2 * nw}, buffer_manager.get_generator().template get_allocator<ComplexType>());
fill_n(ovlp2.origin(), 2 * nw, ComplexType(0.0));
@@ -618,9 +608,9 @@ void NOMSD<devPsiT>::DensityMatrix_batched(const WlkSet& wset,
if (walker_type != COLLINEAR)
{
if (herm) {
assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
} else {
assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
}
Static3Tensor G3D_({nbatch__, GAdims.first, GAdims.second},
buffer_manager.get_generator().template get_allocator<ComplexType>());
@@ -658,14 +648,14 @@ void NOMSD<devPsiT>::DensityMatrix_batched(const WlkSet& wset,
else
{
if (herm) {
assert(std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).second);
} else {
assert(std::get<1>(Sizes(RefA)) == dm_dims(false, Alpha).first && std::get<0>(Sizes(RefA)) == dm_dims(false, Alpha).second);
assert(std::get<1>(RefA.sizes()) == dm_dims(false, Alpha).first && std::get<0>(RefA.sizes()) == dm_dims(false, Alpha).second);
}
if (herm) {
assert(std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).second);
assert(std::get<0>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<1>(RefB.sizes()) == dm_dims(false, Beta).second);
} else {
assert(std::get<1>(Sizes(RefB)) == dm_dims(false, Beta).first && std::get<0>(Sizes(RefB)) == dm_dims(false, Beta).second);
assert(std::get<1>(RefB.sizes()) == dm_dims(false, Beta).first && std::get<0>(RefB.sizes()) == dm_dims(false, Beta).second);
}
auto GBdims = dm_dims(not compact, Beta);

View File

@@ -60,18 +60,19 @@ bool SplineR2R<ST>::write_splines(hdf_archive& h5f)
spl_coefs has a complicated layout depending on dimensionality of splines.
Luckily, for our purposes, we can think of spl_coefs as pointing to a
matrix of size BasisSetSize x (OrbitalSetSize + padding), with the spline
index adjacent in memory. NB: The orbital index is SIMD aligned and
therefore may include padding.
index adjacent in memory. The orbital index is SIMD aligned and therefore
may include padding.
In other words, due to SIMD alignment, Nsplines may be larger than the
actual number of splined orbitals, which means that in practice rot_mat
may be smaller than the number of 'columns' in the coefs array.
Therefore, we put rot_mat inside "tmpU". The padding of the splines
is at the end, so if we put rot_mat at top left corner of tmpU, then
we can apply tmpU to the coefs safely regardless of padding.
As a result, due to SIMD alignment, Nsplines may be larger than the
actual number of splined orbitals. This means that in practice rot_mat
may be smaller than the number of 'columns' in the coefs array!
To fix this problem, we put rot_mat inside "tmpU", which is guaranteed to have
the 'right' size to match spl_coefs. The padding of the splines is at the end,
so if we put rot_mat at top left corner of tmpU, then we can apply tmpU to the
coefs safely regardless of padding.
Typically, BasisSetSize >> OrbitalSetSize, so the spl_coefs "matrix"
is very tall and skinny.
NB: For splines (typically) BasisSetSize >> OrbitalSetSize, so the spl_coefs
"matrix" is very tall and skinny.
*/
template<typename ST>
void SplineR2R<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_copy)
@@ -99,6 +100,7 @@ void SplineR2R<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co
}
// Apply rotation the dumb way b/c I can't get BLAS::gemm to work...
std::vector<RealType> new_coefs(coefs_tot_size, 0);
for (auto i = 0; i < BasisSetSize; i++)
{
for (auto j = 0; j < Nsplines; j++)
@@ -110,10 +112,13 @@ void SplineR2R<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co
const auto index = i * Nsplines + k;
newval += *(spl_coefs + index) * tmpU[k][j];
}
*(spl_coefs + cur_elem) = newval;
new_coefs[cur_elem] = newval;
}
}
// Update the coefs
std::copy(new_coefs.begin(), new_coefs.end(), spl_coefs);
/*
// Here is my attempt to use gemm but it doesn't work...
int smaller_BasisSetSize = static_cast<int>(BasisSetSize);