From cc45a76609a2a870f157121821962525efac9fed Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Wed, 16 Sep 2020 09:39:44 -0700 Subject: [PATCH] fixed issues on gpu with recent noncollinear changes, all unit tests pass non-collinear KP input --- .../boost_multi/multi/memory/allocator.hpp | 3 +- .../KP3IndexFactorization_batched.hpp | 147 ++++++++++++------ .../Hamiltonians/KPFactorizedHamiltonian.cpp | 4 +- .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cu | 58 ++++--- .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh | 6 + .../detail/CUDA/Kernels/KaKjw_to_QKajw.cu | 52 ++++--- .../detail/CUDA/Kernels/KaKjw_to_QKajw.cuh | 6 + .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp | 56 ++++--- .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h | 6 + .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp | 52 ++++--- .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.h | 6 + src/AFQMC/Numerics/tensor_operations.hpp | 44 ++++-- .../Numerics/tests/test_tensor_operations.cpp | 4 +- .../SlaterDetOperations_serial.hpp | 2 +- .../apply_expM.hpp | 84 ++++++++++ .../SlaterDeterminantOperations/rotate.hpp | 2 +- 16 files changed, 374 insertions(+), 158 deletions(-) diff --git a/external_codes/boost_multi/multi/memory/allocator.hpp b/external_codes/boost_multi/multi/memory/allocator.hpp index d10f9d86f..2e93733cb 100644 --- a/external_codes/boost_multi/multi/memory/allocator.hpp +++ b/external_codes/boost_multi/multi/memory/allocator.hpp @@ -49,7 +49,8 @@ public: bool operator!=(allocator const& o) const{return mp_ != o.mp_;} using void_pointer = typename std::pointer_traits()->allocate(0, 0))>::template rebind; pointer allocate(size_type n){ - return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), alignof(T)))); + return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), 16))); + //return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), alignof(T)))); } void deallocate(pointer p, size_type n){ mp_->deallocate(p, n*sizeof(value_type)); diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp index 270071a36..5d20c4178 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp @@ -82,6 +82,8 @@ class KP3IndexFactorization_batched using CMatrix_cref = boost::multi::array_ref; using CVector_ref = ComplexVector_ref; using CMatrix_ref = ComplexMatrix_ref; + using C3Tensor_ref = Complex3Tensor_ref; + using C4Tensor_ref = ComplexArray_ref<4, pointer>; using C3Tensor_cref = boost::multi::array_ref; using SpMatrix_cref = boost::multi::array_ref; @@ -338,50 +340,96 @@ public: { int nkpts = nopk.size(); int NMO = std::accumulate(nopk.begin(), nopk.end(), 0); - // in non-collinear case with SO, keep SO matrix here and add it - // for now, stay collinear + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; CVector vMF_(vMF); - CVector P1D(iextensions<1u>{NMO * NMO}); - fill_n(P1D.origin(), P1D.num_elements(), ComplexType(0)); - vHS(vMF_, P1D); + CVector P0D(iextensions<1u>{NMO * NMO}); + fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0)); + vHS(vMF_, P0D); if (TG_.TG().size() > 1) - TG_.TG().all_reduce_in_place_n(to_address(P1D.origin()), P1D.num_elements(), std::plus<>()); + TG_.TG().all_reduce_in_place_n(to_address(P0D.origin()), P0D.num_elements(), std::plus<>()); - boost::multi::array P1({NMO, NMO}); - copy_n(P1D.origin(), NMO * NMO, P1.origin()); + boost::multi::array P0({NMO, NMO}); + copy_n(P0D.origin(), NMO * NMO, P0.origin()); - // add H1 + vn0 and symmetrize - using ma::conj; + boost::multi::array P1({npol*NMO, npol*NMO}); + std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0)); + // add spin-dependent H1 + for (int K = 0, nk0 = 0; K < nkpts; ++K) + { + for (int i = 0, I = nk0; i < nopk[K]; i++, I++) + { + for(int p=0; p 1e-5) + for(int p=0; p 1e-6) - { -#endif - app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; - app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << " " << H1[K][i][j] << " " - << H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl; - //APP_ABORT("Error in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"); + P1[p*NMO+I][p*NMO+J] += vn0[K][i][j]; + P1[p*NMO+J][p*NMO+I] += vn0[K][j][i]; } - P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I])); - P1[J][I] = ma::conj(P1[I][J]); } } nk0 += nopk[K]; } + + using ma::conj; + // symmetrize + for(int I=0; I 1e-5) + { +#else + if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6) + { +#endif + app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; + app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] <template get_allocator()); GKaKjw_to_GKKwaj(G3Da, GKK[0], nelpk[nd].sliced(0, nkpts), dev_nelpk[nd], dev_a0pk[nd]); if (walker_type == COLLINEAR) @@ -519,10 +568,11 @@ public: for (int K = 0; K < nkpts; ++K) { #if defined(MIXED_PRECISION) - CMatrix_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, nmo_max}); + C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max}); for (int a = 0; a < nelpk[nd][K]; ++a) - ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a].sliced(0, nopk[K]), - ComplexType(1.), E({0, nwalk}, 0)); + for (int pol = 0; pol < npol; ++pol) + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+p].sliced(nk, nk + nopk[K])), + haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) { @@ -538,8 +588,8 @@ public: nk = nopk[K]; { na = nelpk[nd][K]; - CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * nmo_max}); - SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * nmo_max}); + CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * npol * nmo_max}); + SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * npol * nmo_max}); ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E({0, nwalk}, 0)); } if (walker_type == COLLINEAR) @@ -582,7 +632,7 @@ public: // I WANT C++17!!!!!! long mem_ank(0); if (needs_copy) - mem_ank = nkpts * nocc_max * nchol_max * nmo_max; + mem_ank = nkpts * nocc_max * nchol_max * npol * nmo_max; StaticVector LBuff(iextensions<1u>{2 * mem_ank}, device_buffer_allocator->template get_allocator()); sp_pointer LQptr(nullptr), LQmptr(nullptr); @@ -660,8 +710,8 @@ public: if (batch_cnt >= batch_size) { - gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), - Aarray.data(), nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), + gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), + Aarray.data(), npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, Aarray.size()); copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin()); @@ -691,8 +741,8 @@ public: if (batch_cnt > 0) { - gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), Aarray.data(), - nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, + gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), Aarray.data(), + npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, Aarray.size()); copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin()); @@ -1322,6 +1372,7 @@ public: assert(v.size(0) == 2 * local_nCV); assert(v.size(1) == nwalk); int nspin = (walker_type == COLLINEAR ? 2 : 1); + int npol = (walker_type == NONCOLLINEAR ? 2 : 1); int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); int nmo_max = *std::max_element(nopk.begin(), nopk.end()); int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); @@ -1340,11 +1391,11 @@ public: SPComplexType minusimhalfa(0.0, -0.5 * a * scl); SPComplexType imhalfa(0.0, 0.5 * a * scl); - assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); + assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); // MAM: use reshape when available, then no need to deal with types using GType = typename std::decay::type::element; boost::multi::array_ref G3Da(make_device_ptr(G.origin()), - {nocca_tot, nmo_tot, nwalk}); + {nocca_tot*npol, nmo_tot, nwalk}); boost::multi::array_ref G3Db(make_device_ptr(G.origin()) + G3Da.num_elements() * (nspin - 1), @@ -1358,7 +1409,7 @@ public: size_t cnt(0); Static3Tensor v1({nkpts + number_of_symmetric_Q, nchol_max, nwalk}, device_buffer_allocator->template get_allocator()); - Static3Tensor GQ({nkpts, nkpts * nocc_max * nmo_max, nwalk}, + Static3Tensor GQ({nkpts, nkpts * nocc_max * npol * nmo_max, nwalk}, device_buffer_allocator->template get_allocator()); fill_n(v1.origin(), v1.num_elements(), SPComplexType(0.0)); fill_n(GQ.origin(), GQ.num_elements(), SPComplexType(0.0)); @@ -1370,7 +1421,7 @@ public: dev_a0pk[nd].sliced(nkpts, 2 * nkpts)); // can use productStridedBatched if LQKakn is changed to a 3Tensor array - int Kak = nkpts * nocc_max * nmo_max; + int Kak = nkpts * nocc_max * npol * nmo_max; std::vector Aarray; std::vector Barray; std::vector Carray; @@ -1530,30 +1581,32 @@ private: template void GKaKjw_to_GKKwaj(MatA const& GKaKj, MatB&& GKKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0) { + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; int nmo_max = *std::max_element(nopk.begin(), nopk.end()); // int nocc_max = *std::max_element(nocc.begin(),nocc.end()); int nmo_tot = GKaKj.size(1); int nwalk = GKaKj.size(2); int nkpts = nopk.size(); - assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max); + assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max); using ma::KaKjw_to_KKwaj; - KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), + KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), dev_a0.origin(), GKaKj.origin(), GKKaj.origin()); } template void GKaKjw_to_GQKajw(MatA const& GKaKj, MatB&& GQKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0) { + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; int nmo_max = *std::max_element(nopk.begin(), nopk.end()); // int nocc_max = *std::max_element(nocc.begin(),nocc.end()); int nmo_tot = GKaKj.size(1); int nwalk = GKaKj.size(2); int nkpts = nopk.size(); - assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max); + assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max); using ma::KaKjw_to_QKajw; - KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), + KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), dev_a0.origin(), dev_QKToK2.origin(), GKaKj.origin(), GQKaj.origin()); } diff --git a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp index 27b9a7ee7..ff281cd79 100644 --- a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp +++ b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp @@ -1143,7 +1143,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( for (int nd = 0; nd < ndet; nd++) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) - LQKbnl.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKbnl.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator{distNode})); if (type == COLLINEAR) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) @@ -1161,7 +1161,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { for (int Q = 0; Q < number_of_symmetric_Q; Q++) { - LQKbln.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKbln.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator{distNode})); } if (type == COLLINEAR) { diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu index acdc06c65..45c56c7c9 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu @@ -24,11 +24,12 @@ namespace kernels { // very sloppy, needs improvement!!!! -// A[nocc_tot][nmo_tot][nwalk] -// B[Ka][Kj][nwalk][nocc_max][nmo_max] +// A[nocc_tot][s][nmo_tot][nwalk] +// B[Ka][Kj][nwalk][nocc_max][s][nmo_max] template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -41,30 +42,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); } template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -77,30 +80,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); } void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -114,8 +119,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -123,6 +128,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -136,8 +142,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -145,6 +151,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -158,8 +165,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -167,6 +174,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -180,8 +188,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); @@ -190,6 +198,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -203,8 +212,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); @@ -213,6 +222,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -226,8 +236,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh index 055353083..cb2ee706b 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh @@ -23,6 +23,7 @@ namespace kernels { void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -34,6 +35,7 @@ void KaKjw_to_KKwaj(int nwalk, double* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -45,6 +47,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -56,6 +59,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -67,6 +71,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -78,6 +83,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu index 92f84e167..f6527b0e7 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu @@ -29,6 +29,7 @@ namespace kernels template __global__ void kernel_KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -42,7 +43,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, { int Q = blockIdx.x; int K = blockIdx.y; - if (Q >= nkpts || K >= nkpts) + int pol = blockIdx.z; + if (Q >= nkpts || K >= nkpts || pol > npol) return; int QK = QKtok2[Q * nkpts + K]; int na0 = nocc0[K]; @@ -50,15 +52,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, int na = nocc[K]; int nj = nmo[QK]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk) + for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; + a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) B_[a0 + j * nwalk + n] = static_cast(A_[a1 + j * nwalk + n]); @@ -67,6 +70,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, template __global__ void kernel_KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -80,7 +84,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, { int Q = blockIdx.x; int K = blockIdx.y; - if (Q >= nkpts || K >= nkpts) + int pol = blockIdx.z; + if (Q >= nkpts || K >= nkpts || pol > npol) return; int QK = QKtok2[Q * nkpts + K]; int na0 = nocc0[K]; @@ -88,15 +93,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, int na = nocc[K]; int nj = nmo[QK]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk) + for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; + a < na; a++, a0 += npol*nmo_max * nwalk, a1 += npol * nmo_tot * nwalk) { for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) @@ -106,6 +112,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -120,8 +127,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -129,6 +136,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -143,8 +151,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -152,6 +160,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -166,8 +175,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -175,6 +184,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -189,8 +199,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); @@ -199,6 +209,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -213,8 +224,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); @@ -223,6 +234,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -237,8 +249,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh index 164e6c598..0315a29b9 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh @@ -23,6 +23,7 @@ namespace kernels { void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -35,6 +36,7 @@ void KaKjw_to_QKajw(int nwalk, double* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -47,6 +49,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -59,6 +62,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -71,6 +75,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -83,6 +88,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp index b4bd304ba..0bbcd0157 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp @@ -25,6 +25,7 @@ namespace kernels template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -37,30 +38,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); } template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -73,30 +76,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); } void KaKjw_to_KKwaj(int nwalk, - int nkpts, + int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -110,8 +115,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -119,6 +124,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -132,8 +138,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -141,6 +147,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -154,8 +161,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -163,6 +170,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -176,8 +184,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); @@ -186,6 +194,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -199,8 +208,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); @@ -209,6 +218,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -222,8 +232,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h index 1fa43f492..ebab86b49 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h @@ -20,6 +20,7 @@ namespace kernels { void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -31,6 +32,7 @@ void KaKjw_to_KKwaj(int nwalk, double* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -42,6 +44,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -53,6 +56,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -64,6 +68,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -75,6 +80,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp index 9b9ca77cc..6fd46e0bc 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp @@ -26,6 +26,7 @@ namespace kernels template __global__ void kernel_KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -39,7 +40,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, { int Q = blockIdx.x; int K = blockIdx.y; - if (Q >= nkpts || K >= nkpts) + int pol = blockIdx.z; + if (Q >= nkpts || K >= nkpts || pol > npol) return; int QK = QKtok2[Q * nkpts + K]; int na0 = nocc0[K]; @@ -47,15 +49,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, int na = nocc[K]; int nj = nmo[QK]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk) + for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; + a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) B_[a0 + j * nwalk + n] = static_cast(A_[a1 + j * nwalk + n]); @@ -64,6 +67,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, template __global__ void kernel_KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -77,7 +81,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, { int Q = blockIdx.x; int K = blockIdx.y; - if (Q >= nkpts || K >= nkpts) + int pol = blockIdx.z; + if (Q >= nkpts || K >= nkpts || pol > npol) return; int QK = QKtok2[Q * nkpts + K]; int na0 = nocc0[K]; @@ -85,15 +90,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, int na = nocc[K]; int nj = nmo[QK]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Q * nkpts + K) * npol * nocc_max) * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk) + for (int a = 0, a0 = pol*nmo_max*nwalk, a1 = pol*nmo_tot*nwalk; + a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk) { for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) @@ -103,6 +109,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -117,8 +124,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -126,6 +133,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -140,8 +148,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -149,6 +157,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -163,8 +172,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -172,6 +181,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -186,8 +196,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); @@ -196,6 +206,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -210,8 +221,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); @@ -220,6 +231,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -234,8 +246,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h index e773dce13..c8b84a021 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h @@ -20,6 +20,7 @@ namespace kernels { void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -32,6 +33,7 @@ void KaKjw_to_QKajw(int nwalk, double* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -44,6 +46,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -56,6 +59,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -68,6 +72,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -80,6 +85,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/tensor_operations.hpp b/src/AFQMC/Numerics/tensor_operations.hpp index f4ea13191..e93757a64 100644 --- a/src/AFQMC/Numerics/tensor_operations.hpp +++ b/src/AFQMC/Numerics/tensor_operations.hpp @@ -29,6 +29,7 @@ namespace ma template void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -40,7 +41,7 @@ void KaKjw_to_KKwaj(int nwalk, T* B) { // OpenMP: Combine Ka,Kj loops into single loop and call parallel for - int naj = nocc_max * nmo_max; + int napj = nocc_max * npol * nmo_max; int na0 = 0; for (int Ka = 0; Ka < nkpts; Ka++) { @@ -50,16 +51,19 @@ void KaKjw_to_KKwaj(int nwalk, { int nj = nopk[Kj]; //auto G_(to_address(GKK[0][Ka][Kj].origin())); - auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * nmo_max); + auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * npol * nmo_max); for (int a = 0; a < na; a++) { - //auto Gc_( to_address(Gca[na0+a][nj0].origin()) ); - auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk); - int aj = a * nmo_max; - for (int j = 0; j < nj; j++, aj++) + //auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) ); + int apj = a * npol * nmo_max; + for (int p = 0; p < npol; p++) { - for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj) - G_[waj + aj] = static_cast(*Gc_); + auto Gc_(A + ((na0 + a) * npol + p ) * nmo_tot * nwalk + nj0 * nwalk); + for (int j = 0; j < nj; j++, apj++) + { + for (int w = 0, wapj = 0; w < nwalk; w++, ++Gc_, wapj += napj) + G_[wapj + apj] = static_cast(*Gc_); + } } } nj0 += nj; @@ -71,6 +75,7 @@ void KaKjw_to_KKwaj(int nwalk, template void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -94,16 +99,19 @@ void KaKjw_to_QKajw(int nwalk, int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; //auto G_(to_address(GKK[Q][K].origin())); - auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * nmo_max); - for (int a = 0, a0 = 0; a < na; a++, a0 += nmo_max * nwalk) + auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * npol * nmo_max); + for (int a = 0, a0 = 0; a < na; a++) { - //auto Gc_( to_address(Gca[na0+a][nj0].origin()) ); - auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk); - for (int j = 0, aj = a0; j < nj; j++, aj += nwalk) + for (int p = 0; p < npol; p++, a0 += nmo_max * nwalk) { - for (int w = 0; w < nwalk; w++, ++Gc_) + //auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) ); + auto Gc_(A + ((na0 + a) * npol + p) * nmo_tot * nwalk + nj0 * nwalk); + for (int j = 0, apj = a0; j < nj; j++, apj += nwalk) { - G_[aj + w] = static_cast(*Gc_); + for (int w = 0; w < nwalk; w++, ++Gc_) + { + G_[apj + w] = static_cast(*Gc_); + } } } } @@ -275,6 +283,7 @@ namespace device template void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -285,13 +294,14 @@ void KaKjw_to_KKwaj(int nwalk, device_pointer A, device_pointer B) { - kernels::KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), + kernels::KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), to_address(nocc0), to_address(A), to_address(B)); } template void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -303,7 +313,7 @@ void KaKjw_to_QKajw(int nwalk, device_pointer A, device_pointer B) { - kernels::KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), + kernels::KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), to_address(nocc0), to_address(QKtok2), to_address(A), to_address(B)); } diff --git a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp index 8b54c4124..b527aee87 100644 --- a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp +++ b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp @@ -100,7 +100,7 @@ TEST_CASE("KaKjw_to_KKwaj", "[Numerics][tensor_operations]") // KaKjw = numpy.arange(nk*na*nk*nj*nw).reshape((nk,na,nk,nj,nw)) // KKwaj = numpy.transpose(KaKjw, (0,2,1,3,4)) using ma::KaKjw_to_KKwaj; - KaKjw_to_KKwaj(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), + KaKjw_to_KKwaj(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(), KaKjw.origin(), KKwaj.origin()); // Reference values from python REQUIRE(KKwaj[1][2][3][4][4] == Approx(1473.0)); @@ -127,7 +127,7 @@ TEST_CASE("KaKjw_to_QKwaj", "[Numerics][tensor_operations]") Tensor2D qk_to_k2 = {{0, 1, 2, 3}, {1, 0, 3, 2}, {2, 3, 0, 1}, {3, 2, 1, 0}}; copy_n(buffer.data(), buffer.size(), KaKjw.origin()); using ma::KaKjw_to_QKajw; - KaKjw_to_QKajw(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), + KaKjw_to_QKajw(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(), qk_to_k2.origin(), KaKjw.origin(), QKajw.origin()); // Just captured from output. REQUIRE(QKajw[1][2][3][4][4] == Approx(1904.0)); diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp index 73ff2e0ba..bc43967aa 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp @@ -180,7 +180,7 @@ public: TTensor_ref TMN_( TMN.origin(), {npol*nbatch, M, NAEA}); TTensor_ref T1_( T1.origin(), {npol*nbatch, M, NAEA}); TTensor_ref T2_( T2.origin(), {npol*nbatch, M, NAEA}); - SlaterDeterminantOperations::batched::apply_expM(V, TMN_, T1_, T2_, order, TA); + SlaterDeterminantOperations::batched::apply_expM_noncollinear(V, TMN_, T1_, T2_, order, TA); } else { SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order, TA); } diff --git a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp index ea1dd664b..8a706c811 100644 --- a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp @@ -188,6 +188,90 @@ inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2, } } +/* + * Calculate S = exp(im*V)*S using a Taylor expansion of exp(V) + * Version for non_collinear calculations, where there are 2 S matrices per V in the batch. + */ +template +inline void apply_expM_noncollinear(const MatA& V, MatB&& S, MatC& T1, MatC& T2, + int order = 6, char TA = 'N') +{ + static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); + static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); + static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); + assert(V.size(0)*2 == S.size(0)); + assert(V.size(0)*2 == T1.size(0)); + assert(V.size(0)*2 == T2.size(0)); + assert(V.size(1) == V.size(2)); + assert(V.size(2) == S.size(1)); + assert(S.size(1) == T1.size(1)); + assert(S.size(2) == T1.size(2)); + assert(S.size(1) == T2.size(1)); + assert(S.size(2) == T2.size(2)); + // for now limit to continuous + assert(S.stride(0) == S.size(1) * S.size(2)); + assert(T1.stride(0) == T1.size(1) * T1.size(2)); + assert(T2.stride(0) == T2.size(1) * T2.size(2)); + assert(S.stride(1) == S.size(2)); + assert(T1.stride(1) == T1.size(2)); + assert(T2.stride(1) == T2.size(2)); + assert(S.stride(2) == 1); + assert(T1.stride(2) == 1); + assert(T2.stride(2) == 1); + + using ComplexType = typename std::decay::type::element; + ComplexType zero(0.); + ComplexType im(0.0, 1.0); + if (TA == 'H' || TA == 'h') + im = ComplexType(0.0, -1.0); + auto pT1(std::addressof(T1)); + auto pT2(std::addressof(T2)); + + using pointerA = typename std::decay::type::element_const_ptr; + using pointerC = typename std::decay::type::element_ptr; + + int nbatch = S.size(0); + int ldv = V.stride(1); + int M = T2.size(2); + int N = T2.size(1); + int K = T1.size(1); + + std::vector Vi; + std::vector T1i; + std::vector T2i; + Vi.reserve(2*V.size(0)); + T1i.reserve(T1.size(0)); + T2i.reserve(T2.size(0)); + for (int i = 0; i < V.size(0); i++) { + Vi.emplace_back(ma::pointer_dispatch(V[i].origin())); + Vi.emplace_back(ma::pointer_dispatch(V[i].origin())); + } + for (int i = 0; i < T1.size(0); i++) + T1i.emplace_back(ma::pointer_dispatch(T1[i].origin())); + for (int i = 0; i < T2.size(0); i++) + T2i.emplace_back(ma::pointer_dispatch(T2[i].origin())); + + auto pT1i(std::addressof(T1i)); + auto pT2i(std::addressof(T2i)); + + // getting around issue in multi, fix later + //T1 = S; + using std::copy_n; + copy_n(S.origin(), S.num_elements(), T1.origin()); + for (int n = 1; n <= order; n++) + { + ComplexType fact = im * static_cast(1.0 / static_cast(n)); + using ma::gemmBatched; + // careful with fortran ordering + gemmBatched('N',TA,M,N,K,fact, pT1i->data(), (*pT1).stride(1), Vi.data(), ldv, + zero, pT2i->data(), (*pT2).stride(1), nbatch); + using ma::axpy; + axpy(S.num_elements(), ComplexType(1.0), (*pT2).origin(), 1, S.origin(), 1); + std::swap(pT1, pT2); + std::swap(pT1i, pT2i); + } +} + } // namespace batched } // namespace SlaterDeterminantOperations diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp index cdf85807d..2c04324eb 100644 --- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp @@ -779,7 +779,7 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nmo; k++) { - ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bnas); + ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aas_i), bnas); for (int a = 0; a < na; a++) for (int p = 0; p < npol; p++) Lakn[a][p*nmo+k] = bnas({0, nchol}, a*npol+p);