fixed issues on gpu with recent noncollinear changes, all unit tests pass non-collinear KP input

This commit is contained in:
mmorale3 2020-09-16 09:39:44 -07:00
parent 41afd00404
commit cc45a76609
16 changed files with 374 additions and 158 deletions

View File

@ -49,7 +49,8 @@ public:
bool operator!=(allocator const& o) const{return mp_ != o.mp_;}
using void_pointer = typename std::pointer_traits<decltype(std::declval<memory_type*>()->allocate(0, 0))>::template rebind<void>;
pointer allocate(size_type n){
return static_cast<pointer>(static_cast<void_pointer>(mp_->allocate(n*sizeof(value_type), alignof(T))));
return static_cast<pointer>(static_cast<void_pointer>(mp_->allocate(n*sizeof(value_type), 16)));
//return static_cast<pointer>(static_cast<void_pointer>(mp_->allocate(n*sizeof(value_type), alignof(T))));
}
void deallocate(pointer p, size_type n){
mp_->deallocate(p, n*sizeof(value_type));

View File

@ -82,6 +82,8 @@ class KP3IndexFactorization_batched
using CMatrix_cref = boost::multi::array_ref<ComplexType const, 2, const_pointer>;
using CVector_ref = ComplexVector_ref<pointer>;
using CMatrix_ref = ComplexMatrix_ref<pointer>;
using C3Tensor_ref = Complex3Tensor_ref<pointer>;
using C4Tensor_ref = ComplexArray_ref<4, pointer>;
using C3Tensor_cref = boost::multi::array_ref<ComplexType const, 3, const_pointer>;
using SpMatrix_cref = boost::multi::array_ref<SPComplexType const, 2, sp_pointer>;
@ -338,50 +340,96 @@ public:
{
int nkpts = nopk.size();
int NMO = std::accumulate(nopk.begin(), nopk.end(), 0);
// in non-collinear case with SO, keep SO matrix here and add it
// for now, stay collinear
int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
CVector vMF_(vMF);
CVector P1D(iextensions<1u>{NMO * NMO});
fill_n(P1D.origin(), P1D.num_elements(), ComplexType(0));
vHS(vMF_, P1D);
CVector P0D(iextensions<1u>{NMO * NMO});
fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0));
vHS(vMF_, P0D);
if (TG_.TG().size() > 1)
TG_.TG().all_reduce_in_place_n(to_address(P1D.origin()), P1D.num_elements(), std::plus<>());
TG_.TG().all_reduce_in_place_n(to_address(P0D.origin()), P0D.num_elements(), std::plus<>());
boost::multi::array<ComplexType, 2> P1({NMO, NMO});
copy_n(P1D.origin(), NMO * NMO, P1.origin());
boost::multi::array<ComplexType, 2> P0({NMO, NMO});
copy_n(P0D.origin(), NMO * NMO, P0.origin());
// add H1 + vn0 and symmetrize
using ma::conj;
boost::multi::array<ComplexType, 2> P1({npol*NMO, npol*NMO});
std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0));
// add spin-dependent H1
for (int K = 0, nk0 = 0; K < nkpts; ++K)
{
for (int i = 0, I = nk0; i < nopk[K]; i++, I++)
{
P1[I][I] += H1[K][i][i] + vn0[K][i][i];
for(int p=0; p<npol; ++p)
P1[p*NMO+I][p*NMO+I] += H1[K][p*nopk[K]+i][p*nopk[K]+i];
for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++)
{
P1[I][J] += H1[K][i][j] + vn0[K][i][j];
P1[J][I] += H1[K][j][i] + vn0[K][j][i];
// This is really cutoff dependent!!!
#if MIXED_PRECISION
if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-5)
for(int p=0; p<npol; ++p)
{
#else
if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6)
{
#endif
app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n";
app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << " " << H1[K][i][j] << " "
<< H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl;
//APP_ABORT("Error in getOneBodyPropagatorMatrix. H1 is not hermitian. \n");
P1[p*NMO+I][p*NMO+J] += H1[K][p*nopk[K]+i][p*nopk[K]+j];
P1[p*NMO+J][p*NMO+I] += H1[K][p*nopk[K]+j][p*nopk[K]+i];
}
}
if(walker_type == NONCOLLINEAR) {
// offdiagonal piece
for (int j = 0, J = nk0; j < nopk[K]; j++, J++)
{
P1[I][NMO+J] += H1[K][i][nopk[K]+j];
P1[NMO+J][I] += H1[K][nopk[K]+j][i];
}
P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I]));
P1[J][I] = ma::conj(P1[I][J]);
}
}
nk0 += nopk[K];
}
// add P0 (diagonal in spin)
for(int p=0; p<npol; ++p)
for(int I=0; I<NMO; I++)
for(int J=0; J<NMO; J++)
P1[p*NMO+I][p*NMO+J] += P0[I][J];
// add vn0 (diagonal in spin)
for (int K = 0, nk0 = 0; K < nkpts; ++K)
{
for (int i = 0, I = nk0; i < nopk[K]; i++, I++)
{
for(int p=0; p<npol; ++p)
P1[p*NMO+I][p*NMO+I] += vn0[K][i][i];
for (int j = i + 1, J = I + 1; j < nopk[K]; j++, J++)
{
for(int p=0; p<npol; ++p)
{
P1[p*NMO+I][p*NMO+J] += vn0[K][i][j];
P1[p*NMO+J][p*NMO+I] += vn0[K][j][i];
}
}
}
nk0 += nopk[K];
}
using ma::conj;
// symmetrize
for(int I=0; I<npol*NMO; I++)
{
for(int J=I+1; J<npol*NMO; J++)
{
// This is really cutoff dependent!!!
#if MIXED_PRECISION
if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-5)
{
#else
if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6)
{
#endif
app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n";
app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] <<std::endl;
//<< H1[K][i][j] << " "
//<< H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl;
}
P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I]));
P1[J][I] = ma::conj(P1[I][J]);
}
}
return P1;
}
@ -438,6 +486,7 @@ public:
int nwalk = Gc.size(1);
int nspin = (walker_type == COLLINEAR ? 2 : 1);
int npol = (walker_type == NONCOLLINEAR ? 2 : 1);
int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0);
int nmo_max = *std::max_element(nopk.begin(), nopk.end());
int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0);
@ -495,13 +544,13 @@ public:
for (int n = 0; n < nwalk; n++)
fill_n(E[n].origin(), 3, ComplexType(0.));
assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot);
C3Tensor_cref G3Da(make_device_ptr(Gc.origin()), {nocca_tot, nmo_tot, nwalk});
assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot);
C3Tensor_cref G3Da(make_device_ptr(Gc.origin()), {nocca_tot*npol, nmo_tot, nwalk});
C3Tensor_cref G3Db(make_device_ptr(Gc.origin()) + G3Da.num_elements() * (nspin - 1), {noccb_tot, nmo_tot, nwalk});
// later on, rewrite routine to loop over spins, to avoid storage of both spin
// components simultaneously
Static4Tensor GKK({nspin, nkpts, nkpts, nwalk * nmo_max * nocc_max},
Static4Tensor GKK({nspin, nkpts, nkpts, nwalk * npol * nmo_max * nocc_max},
device_buffer_allocator->template get_allocator<SPComplexType>());
GKaKjw_to_GKKwaj(G3Da, GKK[0], nelpk[nd].sliced(0, nkpts), dev_nelpk[nd], dev_a0pk[nd]);
if (walker_type == COLLINEAR)
@ -519,10 +568,11 @@ public:
for (int K = 0; K < nkpts; ++K)
{
#if defined(MIXED_PRECISION)
CMatrix_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, nmo_max});
C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max});
for (int a = 0; a < nelpk[nd][K]; ++a)
ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a].sliced(0, nopk[K]),
ComplexType(1.), E({0, nwalk}, 0));
for (int pol = 0; pol < npol; ++pol)
ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+p].sliced(nk, nk + nopk[K])),
haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0));
na += nelpk[nd][K];
if (walker_type == COLLINEAR)
{
@ -538,8 +588,8 @@ public:
nk = nopk[K];
{
na = nelpk[nd][K];
CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * nmo_max});
SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * nmo_max});
CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * npol * nmo_max});
SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * npol * nmo_max});
ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E({0, nwalk}, 0));
}
if (walker_type == COLLINEAR)
@ -582,7 +632,7 @@ public:
// I WANT C++17!!!!!!
long mem_ank(0);
if (needs_copy)
mem_ank = nkpts * nocc_max * nchol_max * nmo_max;
mem_ank = nkpts * nocc_max * nchol_max * npol * nmo_max;
StaticVector LBuff(iextensions<1u>{2 * mem_ank},
device_buffer_allocator->template get_allocator<SPComplexType>());
sp_pointer LQptr(nullptr), LQmptr(nullptr);
@ -660,8 +710,8 @@ public:
if (batch_cnt >= batch_size)
{
gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0),
Aarray.data(), nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(),
gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0),
Aarray.data(), npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(),
nocc_max * nchol_max, Aarray.size());
copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin());
@ -691,8 +741,8 @@ public:
if (batch_cnt > 0)
{
gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), Aarray.data(),
nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max,
gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), Aarray.data(),
npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max,
Aarray.size());
copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin());
@ -1322,6 +1372,7 @@ public:
assert(v.size(0) == 2 * local_nCV);
assert(v.size(1) == nwalk);
int nspin = (walker_type == COLLINEAR ? 2 : 1);
int npol = (walker_type == NONCOLLINEAR ? 2 : 1);
int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0);
int nmo_max = *std::max_element(nopk.begin(), nopk.end());
int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0);
@ -1340,11 +1391,11 @@ public:
SPComplexType minusimhalfa(0.0, -0.5 * a * scl);
SPComplexType imhalfa(0.0, 0.5 * a * scl);
assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot);
assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot);
// MAM: use reshape when available, then no need to deal with types
using GType = typename std::decay<MatA>::type::element;
boost::multi::array_ref<GType const, 3, decltype(make_device_ptr(G.origin()))> G3Da(make_device_ptr(G.origin()),
{nocca_tot, nmo_tot, nwalk});
{nocca_tot*npol, nmo_tot, nwalk});
boost::multi::array_ref<GType const, 3, decltype(make_device_ptr(G.origin()))> G3Db(make_device_ptr(G.origin()) +
G3Da.num_elements() *
(nspin - 1),
@ -1358,7 +1409,7 @@ public:
size_t cnt(0);
Static3Tensor v1({nkpts + number_of_symmetric_Q, nchol_max, nwalk},
device_buffer_allocator->template get_allocator<SPComplexType>());
Static3Tensor GQ({nkpts, nkpts * nocc_max * nmo_max, nwalk},
Static3Tensor GQ({nkpts, nkpts * nocc_max * npol * nmo_max, nwalk},
device_buffer_allocator->template get_allocator<SPComplexType>());
fill_n(v1.origin(), v1.num_elements(), SPComplexType(0.0));
fill_n(GQ.origin(), GQ.num_elements(), SPComplexType(0.0));
@ -1370,7 +1421,7 @@ public:
dev_a0pk[nd].sliced(nkpts, 2 * nkpts));
// can use productStridedBatched if LQKakn is changed to a 3Tensor array
int Kak = nkpts * nocc_max * nmo_max;
int Kak = nkpts * nocc_max * npol * nmo_max;
std::vector<sp_pointer> Aarray;
std::vector<sp_pointer> Barray;
std::vector<sp_pointer> Carray;
@ -1530,30 +1581,32 @@ private:
template<class MatA, class MatB, class IVec, class IVec2>
void GKaKjw_to_GKKwaj(MatA const& GKaKj, MatB&& GKKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0)
{
int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
int nmo_max = *std::max_element(nopk.begin(), nopk.end());
// int nocc_max = *std::max_element(nocc.begin(),nocc.end());
int nmo_tot = GKaKj.size(1);
int nwalk = GKaKj.size(2);
int nkpts = nopk.size();
assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max);
assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max);
using ma::KaKjw_to_KKwaj;
KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
dev_a0.origin(), GKaKj.origin(), GKKaj.origin());
}
template<class MatA, class MatB, class IVec, class IVec2>
void GKaKjw_to_GQKajw(MatA const& GKaKj, MatB&& GQKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0)
{
int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
int nmo_max = *std::max_element(nopk.begin(), nopk.end());
// int nocc_max = *std::max_element(nocc.begin(),nocc.end());
int nmo_tot = GKaKj.size(1);
int nwalk = GKaKj.size(2);
int nkpts = nopk.size();
assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max);
assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max);
using ma::KaKjw_to_QKajw;
KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(),
dev_a0.origin(), dev_QKToK2.origin(), GKaKj.origin(), GQKaj.origin());
}

View File

@ -1143,7 +1143,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched(
for (int nd = 0; nd < ndet; nd++)
{
for (int Q = 0; Q < number_of_symmetric_Q; Q++)
LQKbnl.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator<SPComplexType>{distNode}));
LQKbnl.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator<SPComplexType>{distNode}));
if (type == COLLINEAR)
{
for (int Q = 0; Q < number_of_symmetric_Q; Q++)
@ -1161,7 +1161,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched(
{
for (int Q = 0; Q < number_of_symmetric_Q; Q++)
{
LQKbln.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator<SPComplexType>{distNode}));
LQKbln.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator<SPComplexType>{distNode}));
}
if (type == COLLINEAR)
{

View File

@ -24,11 +24,12 @@
namespace kernels
{
// very sloppy, needs improvement!!!!
// A[nocc_tot][nmo_tot][nwalk]
// B[Ka][Kj][nwalk][nocc_max][nmo_max]
// A[nocc_tot][s][nmo_tot][nwalk]
// B[Ka][Kj][nwalk][nocc_max][s][nmo_max]
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -41,30 +42,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
{
int Ka = blockIdx.x;
int Kj = blockIdx.y;
if (Ka >= nkpts || Kj >= nkpts)
int pol = blockIdx.z;
if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
return;
int na0 = nocc0[Ka];
int nj0 = nmo0[Kj];
int na = nocc[Ka];
int nj = nmo[Kj];
T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
}
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -77,30 +80,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
{
int Ka = blockIdx.x;
int Kj = blockIdx.y;
if (Ka >= nkpts || Kj >= nkpts)
int pol = blockIdx.z;
if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
return;
int na0 = nocc0[Ka];
int nj0 = nmo0[Kj];
int na = nocc[Ka];
int nj = nmo[Kj];
thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
}
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -114,8 +119,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
B);
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj");
@ -123,6 +128,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -136,8 +142,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
B);
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj");
@ -145,6 +151,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -158,8 +165,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A,
B);
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj");
@ -167,6 +174,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -180,8 +188,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<double>*>(B));
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
@ -190,6 +198,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -203,8 +212,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
reinterpret_cast<thrust::complex<float> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");
@ -213,6 +222,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -226,8 +236,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_KKwaj<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj");

View File

@ -23,6 +23,7 @@ namespace kernels
{
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -34,6 +35,7 @@ void KaKjw_to_KKwaj(int nwalk,
double* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -45,6 +47,7 @@ void KaKjw_to_KKwaj(int nwalk,
float* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -56,6 +59,7 @@ void KaKjw_to_KKwaj(int nwalk,
float* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -67,6 +71,7 @@ void KaKjw_to_KKwaj(int nwalk,
std::complex<double>* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -78,6 +83,7 @@ void KaKjw_to_KKwaj(int nwalk,
std::complex<float>* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,

View File

@ -29,6 +29,7 @@ namespace kernels
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -42,7 +43,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
{
int Q = blockIdx.x;
int K = blockIdx.y;
if (Q >= nkpts || K >= nkpts)
int pol = blockIdx.z;
if (Q >= nkpts || K >= nkpts || pol > npol)
return;
int QK = QKtok2[Q * nkpts + K];
int na0 = nocc0[K];
@ -50,15 +52,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
int na = nocc[K];
int nj = nmo[QK];
T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk;
a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@ -67,6 +70,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -80,7 +84,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
{
int Q = blockIdx.x;
int K = blockIdx.y;
if (Q >= nkpts || K >= nkpts)
int pol = blockIdx.z;
if (Q >= nkpts || K >= nkpts || pol > npol)
return;
int QK = QKtok2[Q * nkpts + K];
int na0 = nocc0[K];
@ -88,15 +93,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
int na = nocc[K];
int nj = nmo[QK];
thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk;
a < na; a++, a0 += npol*nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
{
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
@ -106,6 +112,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -120,8 +127,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
QKtok2, A, B);
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw");
@ -129,6 +136,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -143,8 +151,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
QKtok2, A, B);
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw");
@ -152,6 +160,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -166,8 +175,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
QKtok2, A, B);
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw");
@ -175,6 +184,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -189,8 +199,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
QKtok2, reinterpret_cast<thrust::complex<float> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
@ -199,6 +209,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -213,8 +224,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<double>*>(B));
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");
@ -223,6 +234,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -237,8 +249,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
dim3 grid_dim(nkpts, nkpts, npol);
kernel_KaKjw_to_QKajw<<<grid_dim, block_dim>>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0,
QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw");

View File

@ -23,6 +23,7 @@ namespace kernels
{
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -35,6 +36,7 @@ void KaKjw_to_QKajw(int nwalk,
double* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -47,6 +49,7 @@ void KaKjw_to_QKajw(int nwalk,
float* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -59,6 +62,7 @@ void KaKjw_to_QKajw(int nwalk,
float* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -71,6 +75,7 @@ void KaKjw_to_QKajw(int nwalk,
std::complex<float>* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -83,6 +88,7 @@ void KaKjw_to_QKajw(int nwalk,
std::complex<double>* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,

View File

@ -25,6 +25,7 @@ namespace kernels
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -37,30 +38,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
{
int Ka = blockIdx.x;
int Kj = blockIdx.y;
if (Ka >= nkpts || Kj >= nkpts)
int pol = blockIdx.z;
if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
return;
int na0 = nocc0[Ka];
int nj0 = nmo0[Kj];
int na = nocc[Ka];
int nj = nmo[Kj];
T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<T2>(A_[a1 + j * nwalk + n]);
}
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -73,30 +76,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk,
{
int Ka = blockIdx.x;
int Kj = blockIdx.y;
if (Ka >= nkpts || Kj >= nkpts)
int pol = blockIdx.z;
if (Ka >= nkpts || Kj >= nkpts || pol >= npol)
return;
int na0 = nocc0[Ka];
int nj0 = nmo0[Kj];
int na = nocc[Ka];
int nj = nmo[Kj];
thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk);
thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk)
for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk)
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
}
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -110,8 +115,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, A, B);
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj");
@ -119,6 +124,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -132,8 +138,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, A, B);
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj");
@ -141,6 +147,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -154,8 +161,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, A, B);
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj");
@ -163,6 +170,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -176,8 +184,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<double>*>(B));
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
@ -186,6 +194,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -199,8 +208,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast<thrust::complex<float> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");
@ -209,6 +218,7 @@ void KaKjw_to_KKwaj(int nwalk,
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -222,8 +232,8 @@ void KaKjw_to_KKwaj(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj");

View File

@ -20,6 +20,7 @@ namespace kernels
{
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -31,6 +32,7 @@ void KaKjw_to_KKwaj(int nwalk,
double* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -42,6 +44,7 @@ void KaKjw_to_KKwaj(int nwalk,
float* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -53,6 +56,7 @@ void KaKjw_to_KKwaj(int nwalk,
float* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -64,6 +68,7 @@ void KaKjw_to_KKwaj(int nwalk,
std::complex<double>* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -75,6 +80,7 @@ void KaKjw_to_KKwaj(int nwalk,
std::complex<float>* B);
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,

View File

@ -26,6 +26,7 @@ namespace kernels
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -39,7 +40,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
{
int Q = blockIdx.x;
int K = blockIdx.y;
if (Q >= nkpts || K >= nkpts)
int pol = blockIdx.z;
if (Q >= nkpts || K >= nkpts || pol > npol)
return;
int QK = QKtok2[Q * nkpts + K];
int na0 = nocc0[K];
@ -47,15 +49,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
int na = nocc[K];
int nj = nmo[QK];
T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk;
a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@ -64,6 +67,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
template<typename T, typename T2>
__global__ void kernel_KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -77,7 +81,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
{
int Q = blockIdx.x;
int K = blockIdx.y;
if (Q >= nkpts || K >= nkpts)
int pol = blockIdx.z;
if (Q >= nkpts || K >= nkpts || pol > npol)
return;
int QK = QKtok2[Q * nkpts + K];
int na0 = nocc0[K];
@ -85,15 +90,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
int na = nocc[K];
int nj = nmo[QK];
thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
thrust::complex<T2>* B_(B + ((Q * nkpts + K) * npol * nocc_max) * nmo_max * nwalk);
if (threadIdx.x >= nj)
return;
if (threadIdx.y >= nwalk)
return;
for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk;
a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
{
for (int j = threadIdx.x; j < nj; j += blockDim.x)
for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
@ -103,6 +109,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -117,8 +124,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@ -126,6 +133,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -140,8 +148,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@ -149,6 +157,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -163,8 +172,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@ -172,6 +181,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -186,8 +196,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast<thrust::complex<float> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
@ -196,6 +206,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -210,8 +221,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<double>*>(B));
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
@ -220,6 +231,7 @@ void KaKjw_to_QKajw(int nwalk,
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -234,8 +246,8 @@ void KaKjw_to_QKajw(int nwalk,
int xblock_dim = 16;
int yblock_dim = std::min(nwalk, 32);
dim3 block_dim(xblock_dim, yblock_dim, 1);
dim3 grid_dim(nkpts, nkpts, 1);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
dim3 grid_dim(nkpts, nkpts, npol);
hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast<thrust::complex<double> const*>(A),
reinterpret_cast<thrust::complex<float>*>(B));
qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");

View File

@ -20,6 +20,7 @@ namespace kernels
{
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -32,6 +33,7 @@ void KaKjw_to_QKajw(int nwalk,
double* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -44,6 +46,7 @@ void KaKjw_to_QKajw(int nwalk,
float* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -56,6 +59,7 @@ void KaKjw_to_QKajw(int nwalk,
float* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -68,6 +72,7 @@ void KaKjw_to_QKajw(int nwalk,
std::complex<float>* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -80,6 +85,7 @@ void KaKjw_to_QKajw(int nwalk,
std::complex<double>* B);
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,

View File

@ -29,6 +29,7 @@ namespace ma
template<typename Q, typename T>
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -40,7 +41,7 @@ void KaKjw_to_KKwaj(int nwalk,
T* B)
{
// OpenMP: Combine Ka,Kj loops into single loop and call parallel for
int naj = nocc_max * nmo_max;
int napj = nocc_max * npol * nmo_max;
int na0 = 0;
for (int Ka = 0; Ka < nkpts; Ka++)
{
@ -50,16 +51,19 @@ void KaKjw_to_KKwaj(int nwalk,
{
int nj = nopk[Kj];
//auto G_(to_address(GKK[0][Ka][Kj].origin()));
auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * nmo_max);
auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * npol * nmo_max);
for (int a = 0; a < na; a++)
{
//auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
int aj = a * nmo_max;
for (int j = 0; j < nj; j++, aj++)
//auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) );
int apj = a * npol * nmo_max;
for (int p = 0; p < npol; p++)
{
for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj)
G_[waj + aj] = static_cast<T>(*Gc_);
auto Gc_(A + ((na0 + a) * npol + p ) * nmo_tot * nwalk + nj0 * nwalk);
for (int j = 0; j < nj; j++, apj++)
{
for (int w = 0, wapj = 0; w < nwalk; w++, ++Gc_, wapj += napj)
G_[wapj + apj] = static_cast<T>(*Gc_);
}
}
}
nj0 += nj;
@ -71,6 +75,7 @@ void KaKjw_to_KKwaj(int nwalk,
template<typename T, typename T1>
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -94,16 +99,19 @@ void KaKjw_to_QKajw(int nwalk,
int na0 = nocc0[Ka];
int nj0 = nmo0[Kj];
//auto G_(to_address(GKK[Q][K].origin()));
auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * nmo_max);
for (int a = 0, a0 = 0; a < na; a++, a0 += nmo_max * nwalk)
auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * npol * nmo_max);
for (int a = 0, a0 = 0; a < na; a++)
{
//auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
for (int j = 0, aj = a0; j < nj; j++, aj += nwalk)
for (int p = 0; p < npol; p++, a0 += nmo_max * nwalk)
{
for (int w = 0; w < nwalk; w++, ++Gc_)
//auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) );
auto Gc_(A + ((na0 + a) * npol + p) * nmo_tot * nwalk + nj0 * nwalk);
for (int j = 0, apj = a0; j < nj; j++, apj += nwalk)
{
G_[aj + w] = static_cast<T>(*Gc_);
for (int w = 0; w < nwalk; w++, ++Gc_)
{
G_[apj + w] = static_cast<T>(*Gc_);
}
}
}
}
@ -275,6 +283,7 @@ namespace device
template<typename T, typename Q>
void KaKjw_to_KKwaj(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -285,13 +294,14 @@ void KaKjw_to_KKwaj(int nwalk,
device_pointer<Q> A,
device_pointer<T> B)
{
kernels::KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
kernels::KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
to_address(nocc0), to_address(A), to_address(B));
}
template<typename T, typename Q>
void KaKjw_to_QKajw(int nwalk,
int nkpts,
int npol,
int nmo_max,
int nmo_tot,
int nocc_max,
@ -303,7 +313,7 @@ void KaKjw_to_QKajw(int nwalk,
device_pointer<Q> A,
device_pointer<T> B)
{
kernels::KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
kernels::KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc),
to_address(nocc0), to_address(QKtok2), to_address(A), to_address(B));
}

View File

@ -100,7 +100,7 @@ TEST_CASE("KaKjw_to_KKwaj", "[Numerics][tensor_operations]")
// KaKjw = numpy.arange(nk*na*nk*nj*nw).reshape((nk,na,nk,nj,nw))
// KKwaj = numpy.transpose(KaKjw, (0,2,1,3,4))
using ma::KaKjw_to_KKwaj;
KaKjw_to_KKwaj(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
KaKjw_to_KKwaj(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
nel_pk0.origin(), KaKjw.origin(), KKwaj.origin());
// Reference values from python
REQUIRE(KKwaj[1][2][3][4][4] == Approx(1473.0));
@ -127,7 +127,7 @@ TEST_CASE("KaKjw_to_QKwaj", "[Numerics][tensor_operations]")
Tensor2D<int> qk_to_k2 = {{0, 1, 2, 3}, {1, 0, 3, 2}, {2, 3, 0, 1}, {3, 2, 1, 0}};
copy_n(buffer.data(), buffer.size(), KaKjw.origin());
using ma::KaKjw_to_QKajw;
KaKjw_to_QKajw(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
KaKjw_to_QKajw(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(),
nel_pk0.origin(), qk_to_k2.origin(), KaKjw.origin(), QKajw.origin());
// Just captured from output.
REQUIRE(QKajw[1][2][3][4][4] == Approx(1904.0));

View File

@ -180,7 +180,7 @@ public:
TTensor_ref TMN_( TMN.origin(), {npol*nbatch, M, NAEA});
TTensor_ref T1_( T1.origin(), {npol*nbatch, M, NAEA});
TTensor_ref T2_( T2.origin(), {npol*nbatch, M, NAEA});
SlaterDeterminantOperations::batched::apply_expM(V, TMN_, T1_, T2_, order, TA);
SlaterDeterminantOperations::batched::apply_expM_noncollinear(V, TMN_, T1_, T2_, order, TA);
} else {
SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order, TA);
}

View File

@ -188,6 +188,90 @@ inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2,
}
}
/*
* Calculate S = exp(im*V)*S using a Taylor expansion of exp(V)
* Version for non_collinear calculations, where there are 2 S matrices per V in the batch.
*/
template<class MatA, class MatB, class MatC>
inline void apply_expM_noncollinear(const MatA& V, MatB&& S, MatC& T1, MatC& T2,
int order = 6, char TA = 'N')
{
static_assert(std::decay<MatA>::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3");
static_assert(std::decay<MatB>::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3");
static_assert(std::decay<MatC>::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3");
assert(V.size(0)*2 == S.size(0));
assert(V.size(0)*2 == T1.size(0));
assert(V.size(0)*2 == T2.size(0));
assert(V.size(1) == V.size(2));
assert(V.size(2) == S.size(1));
assert(S.size(1) == T1.size(1));
assert(S.size(2) == T1.size(2));
assert(S.size(1) == T2.size(1));
assert(S.size(2) == T2.size(2));
// for now limit to continuous
assert(S.stride(0) == S.size(1) * S.size(2));
assert(T1.stride(0) == T1.size(1) * T1.size(2));
assert(T2.stride(0) == T2.size(1) * T2.size(2));
assert(S.stride(1) == S.size(2));
assert(T1.stride(1) == T1.size(2));
assert(T2.stride(1) == T2.size(2));
assert(S.stride(2) == 1);
assert(T1.stride(2) == 1);
assert(T2.stride(2) == 1);
using ComplexType = typename std::decay<MatB>::type::element;
ComplexType zero(0.);
ComplexType im(0.0, 1.0);
if (TA == 'H' || TA == 'h')
im = ComplexType(0.0, -1.0);
auto pT1(std::addressof(T1));
auto pT2(std::addressof(T2));
using pointerA = typename std::decay<MatA>::type::element_const_ptr;
using pointerC = typename std::decay<MatC>::type::element_ptr;
int nbatch = S.size(0);
int ldv = V.stride(1);
int M = T2.size(2);
int N = T2.size(1);
int K = T1.size(1);
std::vector<pointerA> Vi;
std::vector<pointerC> T1i;
std::vector<pointerC> T2i;
Vi.reserve(2*V.size(0));
T1i.reserve(T1.size(0));
T2i.reserve(T2.size(0));
for (int i = 0; i < V.size(0); i++) {
Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
}
for (int i = 0; i < T1.size(0); i++)
T1i.emplace_back(ma::pointer_dispatch(T1[i].origin()));
for (int i = 0; i < T2.size(0); i++)
T2i.emplace_back(ma::pointer_dispatch(T2[i].origin()));
auto pT1i(std::addressof(T1i));
auto pT2i(std::addressof(T2i));
// getting around issue in multi, fix later
//T1 = S;
using std::copy_n;
copy_n(S.origin(), S.num_elements(), T1.origin());
for (int n = 1; n <= order; n++)
{
ComplexType fact = im * static_cast<ComplexType>(1.0 / static_cast<double>(n));
using ma::gemmBatched;
// careful with fortran ordering
gemmBatched('N',TA,M,N,K,fact, pT1i->data(), (*pT1).stride(1), Vi.data(), ldv,
zero, pT2i->data(), (*pT2).stride(1), nbatch);
using ma::axpy;
axpy(S.num_elements(), ComplexType(1.0), (*pT2).origin(), 1, S.origin(), 1);
std::swap(pT1, pT2);
std::swap(pT1i, pT2i);
}
}
} // namespace batched
} // namespace SlaterDeterminantOperations

View File

@ -779,7 +779,7 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai,
// Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n])
for (int k = 0; k < nmo; k++)
{
ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bnas);
ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aas_i), bnas);
for (int a = 0; a < na; a++)
for (int p = 0; p < npol; p++)
Lakn[a][p*nmo+k] = bnas({0, nchol}, a*npol+p);